In [68]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split # We can train and test our data by importing this
from sklearn.metrics import accuracy_score

*Data Collection and Pre-Processing*

In [69]:
# Load the raw mail dataset from the Colab working directory and display it.
# The rendered output shows two columns: Category (label) and Message (text).
data = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [70]:
# Rename the 'ham' label to the more readable 'Not spam'; 'spam' rows are untouched.
data['Category'] = data['Category'].replace(to_replace='ham', value='Not spam')
data

Unnamed: 0,Category,Message
0,Not spam,"Go until jurong point, crazy.. Available only ..."
1,Not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Not spam,U dun say so early hor... U c already then say...
4,Not spam,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,Not spam,Will ü b going to esplanade fr home?
5569,Not spam,"Pity, * was in mood for that. So...any other s..."
5570,Not spam,The guy did some bitching but I acted like i'd...


In [71]:
# Build a cleaned copy in which every missing cell becomes an empty string:
# cells that hold a value are kept, everything else is replaced with ''.
mail_data = data.where(data.notna(), other='')
mail_data

Unnamed: 0,Category,Message
0,Not spam,"Go until jurong point, crazy.. Available only ..."
1,Not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Not spam,U dun say so early hor... U c already then say...
4,Not spam,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,Not spam,Will ü b going to esplanade fr home?
5569,Not spam,"Pity, * was in mood for that. So...any other s..."
5570,Not spam,The guy did some bitching but I acted like i'd...


In [72]:
# Preview the first rows of the cleaned frame (head() defaults to five rows).
mail_data.head()

Unnamed: 0,Category,Message
0,Not spam,"Go until jurong point, crazy.. Available only ..."
1,Not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Not spam,U dun say so early hor... U c already then say...
4,Not spam,"Nah I don't think he goes to usf, he lives aro..."


In [73]:
# Report the frame's dimensions as (rows, columns).
mail_data.shape

(5572, 2)

Labeling our data: Spam = 0, Not Spam = 1


In [74]:

# Encode the target column in place: 'spam' -> 0, 'Not spam' -> 1.
# The second mask is computed after the first assignment, exactly as before;
# rows already set to 0 never match 'Not spam', so the order is harmless.
mail_data.loc[mail_data['Category'].eq('spam'), 'Category'] = 0
mail_data.loc[mail_data['Category'].eq('Not spam'), 'Category'] = 1

In [75]:
# Split the frame into model inputs and target:
#   x -> the message text, y -> the encoded spam / not-spam label.
x, y = mail_data['Message'], mail_data['Category']


In [76]:
# Inspect the encoded target column to confirm the labels are now 0/1.
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Splitting the Data into training and test data

In [77]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

Feature Extraction

In [78]:
# Convert the raw message text into TF-IDF feature vectors that the
# logistic regression model can consume. English stop words are dropped
# and text is lower-cased before weighting.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vocabulary on the training messages only, then reuse that fitted
# vocabulary to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The label columns still have object dtype; cast them to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [79]:
# Print the transformed test matrix; the output lists (row, column) positions
# with their TF-IDF weights.
print(X_test_features)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

Training the Model - Logistic Regression

In [80]:
# Create the classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [81]:
# Train the model on the TF-IDF training features and their integer labels.
model.fit(X_train_features,Y_train) # fit() estimates the model parameters from the training data

Evaluating the Trained Model

In [82]:
# Predict labels for the very rows the model was fitted on, then measure the
# fraction predicted correctly. This is the training-set accuracy.
prediction_on_training = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training,
)

In [83]:
# Report the training-set accuracy computed in the previous cell.
print(f'Accuracy on Training data is: {accuracy_on_training_data}')

Accuracy on Training data is: 0.9670181736594121


In [84]:
# Predict labels for the held-out test rows and measure the fraction
# predicted correctly. This is the test-set accuracy.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [85]:
# Report the test-set accuracy computed in the previous cell.
print(f'Accuracy on Testing data is: {accuracy_on_test_data}')

Accuracy on Testing data is: 0.9659192825112107


Building the Prediction System


In [88]:
# Interactive demo: read one message from the user, vectorize it with the
# already-fitted TF-IDF vectorizer, and classify it with the trained model.
print("Hello. This model can predict if your email is a spam or not. ")
print()

# The vectorizer expects an iterable of documents, so wrap the single
# message in a list.
input_mail = input("Please Enter the Email: ")
input_mail_list = [input_mail]

# Text -> TF-IDF feature vector (same vocabulary as the training data).
input_data_features = feature_extraction.transform(input_mail_list)

# predict() returns an array with one label per input message.
prediction = model.predict(input_data_features)
print(f'The Prediction is: {prediction} ')

# Label 1 means "not spam"; any other label is reported as spam.
print('This is NOT a Spam Email' if prediction[0] == 1 else 'This is a spam mail')

Hello. This model can predict if your email is a spam or not. 

Please Enter the Email: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
The Prediction is: [1] 
This is NOT a Spam Email
