In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF: a numerical statistic that reflects how important a word is to a document in a collection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from joblib import dump


In [3]:
# Read the labelled messages (columns: Category, Message) into a DataFrame
spam = pd.read_csv(filepath_or_buffer='mail_data.csv')
print(spam)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# Encode the target column in place: spam -> 0, ham -> 1
for label, code in (('spam', 0), ('ham', 1)):
    spam.loc[spam['Category'] == label, 'Category'] = code
print(spam)

     Category                                            Message
0           1  Go until jurong point, crazy.. Available only ...
1           1                      Ok lar... Joking wif u oni...
2           0  Free entry in 2 a wkly comp to win FA Cup fina...
3           1  U dun say so early hor... U c already then say...
4           1  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567        0  This is the 2nd time we have tried 2 contact u...
5568        1               Will ü b going to esplanade fr home?
5569        1  Pity, * was in mood for that. So...any other s...
5570        1  The guy did some bitching but I acted like i'd...
5571        1                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [5]:
# Separate the message text (model input) from the spam/ham label (target)
X, Y = spam['Message'], spam['Category']

In [6]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible from run to run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

In [7]:
# Turn the raw text into TF-IDF feature vectors that can be used as model input.
#   min_df=1             -> keep every term that appears in at least one document
#   stop_words='english' -> drop common English words (is, was, did, ...)
#   lowercase=True       -> convert all text to lowercase before tokenizing
features = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split
X_train_features = features.fit_transform(X_train)
X_test_features = features.transform(X_test)

In [8]:
# Cast the labels of both splits to an integer dtype, then show them
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))
print(Y_train, Y_test, sep='\n')

1658    1
1509    1
3266    0
5199    1
3217    0
       ..
3046    1
1725    1
4079    1
2254    1
2915    0
Name: Category, Length: 4457, dtype: int32
2095    0
5343    1
564     0
3849    1
3317    1
       ..
3473    1
884     1
5006    1
3397    0
1309    1
Name: Category, Length: 1115, dtype: int32


In [9]:
# Logistic regression classifier created with scikit-learn's default hyperparameters
model = LogisticRegression()

In [10]:
# Train the classifier on the TF-IDF training features and the integer labels
model.fit(X_train_features,Y_train)

In [11]:
# Accuracy on the data the model was trained on
prediction = model.predict(X_train_features)
accuracy = accuracy_score(y_true=Y_train, y_pred=prediction)
print(accuracy)

0.9676912721561588


In [12]:
# Accuracy on the held-out test split
prediction = model.predict(X_test_features)
accuracy = accuracy_score(y_true=Y_test, y_pred=prediction)
print(accuracy)

0.9605381165919282


In [13]:
# Sample message to classify. Named `input_mail` rather than `input` so the
# builtin input() is not shadowed for the rest of the kernel session.
input_mail = ["Hey Sarthak More Greetings from the FutureSkills PRIME Community! 🎉 It seems that you have not yet signed-up on the Learning Platform! In a World powered by Digital Transformation, Emerging Technologies profoundly impact our lives.Most of these technological changes are remarkable, but are we truly equipped for the future? 🤔Digital Technologies like Artificial Intelligence, Big Data Analytics, Cloud Computing, Cybersecurity etc., possess an immense potential to Trailblaze your Professional Trajectory. 🔥"]

# Reuse the already-fitted vectorizer (transform only, no refit)
input_features = features.transform(input_mail)

prediction = model.predict(input_features)
# Labels were encoded as 1 = ham and 0 = spam, so the two outcomes are
# mutually exclusive and a single if/else covers them
if prediction[0] == 1:
  print("The mail is authentic")
else:
  print("ALERT!!!It is a spam mail")

print(prediction)


The mail is authentic
[1]


In [15]:
# Persist the fitted vectorizer as well as the classifier: new messages must be
# transformed with the same fitted TF-IDF vectorizer before the model can score them.
dump(features, 'spam_email_vectorizer.joblib')
# Kept as the last expression so the cell still displays the model file name
dump(model, 'spam_email_prediction.joblib')

['spam_email_prediction.joblib']