In [1]:
# Importing required packages 
import pandas as pd
import warnings
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,accuracy_score, classification_report, f1_score ,confusion_matrix

In [44]:
# Load the raw messages CSV (the path is relative to the current working directory)
data = pd.read_csv(filepath_or_buffer="Email Spam detection/messages.csv")

In [45]:
# Display the loaded frame to check its columns (subject, message, label)
data

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0
...,...,...,...
2888,love your profile - ysuolvpv,hello thanks for stopping by ! ! we have taken...,1
2889,you have been asked to join kiddin,"the list owner of : "" kiddin "" has invited you...",1
2890,anglicization of composers ' names,"judging from the return post , i must have sou...",0
2891,"re : 6 . 797 , comparative method : n - ary co...",gotcha ! there are two separate fallacies in t...,0


In [46]:
# Class balance: number of rows per value of the target column "label"
data["label"].value_counts()

0    2412
1     481
Name: label, dtype: int64

In [48]:
# Per-column flag: True when the column holds at least one NaN.
# There are NaN values in the column 'subject'.
data.isna().any(axis=0)

subject     True
message    False
label      False
dtype: bool

In [49]:
# Rows whose subject is missing
data.loc[data["subject"].isna()]

Unnamed: 0,subject,message,label
1,,"lang classification grimes , joseph e . and ba...",0
13,,syntax the antisymmetry of syntax richard s . ...,0
69,,computational ling bengt sigurd ( ed ) compute...,0
107,,"phonology & phonetics burquest , donald a . an...",0
258,,phonology & phonetics leiden in last : hil pho...,0
...,...,...,...
2296,,the latest issue ( 1994 n01 ) of etudes de let...,0
2309,,b a r g a i n a i r f a r e s your 1 - stop tr...,1
2555,,"data to : = 20 date : fri , 06 feb 1998 22 : 3...",1
2562,,"epac . pt , e . carnoali @ genie . com , e . c...",1


In [50]:
# Replace every NaN with a single space so the string concatenation that follows
# never meets a missing value
data = data.fillna(value=" ")

In [51]:
# Join subject and message (separated by one space) into a single "text" column,
# which is the input for the NLP models later on
data["text"] = (
    data["subject"]
    .astype(str)
    .add(" ")
    .add(data["message"])
)

In [52]:
# Display the frame again to confirm the new "text" column
data

Unnamed: 0,subject,message,label,text
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,job posting - apple-iss research center conten...
1,,"lang classification grimes , joseph e . and ba...",0,"lang classification grimes , joseph e . and ..."
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0,query : letter frequencies for text identifica...
3,risk,a colleague and i are researching the differin...,0,risk a colleague and i are researching the dif...
4,request book information,earlier this morning i was on the phone with a...,0,request book information earlier this morning ...
...,...,...,...,...
2888,love your profile - ysuolvpv,hello thanks for stopping by ! ! we have taken...,1,love your profile - ysuolvpv hello thanks for ...
2889,you have been asked to join kiddin,"the list owner of : "" kiddin "" has invited you...",1,you have been asked to join kiddin the list ow...
2890,anglicization of composers ' names,"judging from the return post , i must have sou...",0,anglicization of composers ' names judging fro...
2891,"re : 6 . 797 , comparative method : n - ary co...",gotcha ! there are two separate fallacies in t...,0,"re : 6 . 797 , comparative method : n - ary co..."


In [53]:
# Working frame for the task: drop the two source columns so that only the
# combined "text" column and the "label" target remain
nlp_df = data.drop(columns=["subject", "message"])

In [54]:
# Display the reduced frame (label + text)
nlp_df

Unnamed: 0,label,text
0,0,job posting - apple-iss research center conten...
1,0,"lang classification grimes , joseph e . and ..."
2,0,query : letter frequencies for text identifica...
3,0,risk a colleague and i are researching the dif...
4,0,request book information earlier this morning ...
...,...,...
2888,1,love your profile - ysuolvpv hello thanks for ...
2889,1,you have been asked to join kiddin the list ow...
2890,0,anglicization of composers ' names judging fro...
2891,0,"re : 6 . 797 , comparative method : n - ary co..."


In [55]:
# Normalise the text: convert it to lowercase, then delete every digit.
nlp_df["text"] = nlp_df["text"].str.lower()
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, in which case '[0-9]' matches nothing and the digits
# silently stay in the text.
nlp_df["text"] = nlp_df["text"].str.replace(r'[0-9]', '', regex=True)
# Punctuation stripping was tried and is left disabled:
# nlp_df["text"] = nlp_df["text"].str.replace(r'[^\w\s]', '', regex=True)

In [56]:
# Display the frame after lowercasing and digit removal
nlp_df

Unnamed: 0,label,text
0,0,job posting - apple-iss research center conten...
1,0,"lang classification grimes , joseph e . and ..."
2,0,query : letter frequencies for text identifica...
3,0,risk a colleague and i are researching the dif...
4,0,request book information earlier this morning ...
...,...,...
2888,1,love your profile - ysuolvpv hello thanks for ...
2889,1,you have been asked to join kiddin the list ow...
2890,0,anglicization of composers ' names judging fro...
2891,0,"re : . , comparative method : n - ary compar..."


In [58]:
# Features are the cleaned text; the target is the label column
X, y = nlp_df["text"], nlp_df["label"]

In [60]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
)

## Converting into TF-IDF 

In [65]:
# Word-level TF-IDF over unigrams, with English stop words removed and the
# vocabulary capped at 15000 terms
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    min_df=1,
    use_idf=True,
    max_features=15000,
)

# Fit on the training split only, then apply the already-fitted vectorizer
# to the test split
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

TF-IDF stands for term frequency–inverse document frequency. It is a numerical statistic intended to reflect how important a word is to a document within a collection (corpus): the weight grows with how often the word appears in the document and shrinks with how many documents in the corpus contain it. TF-IDF weights are widely used in information retrieval and text mining, and variations of the scheme are often used by search engines to score and rank a document's relevance to a query.

In [66]:
# (rows, columns) of the training TF-IDF matrix
x_train.shape

(2314, 15000)

In [67]:
# (rows, columns) of the test TF-IDF matrix
x_test.shape

(579, 15000)

## Machine Learning Models 

### Logistic Regression

In [68]:
# Logistic regression using the liblinear solver, with class weights set to
# 'balanced' and a fixed seed
logisticRegr = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    tol=0.001,
    random_state=5,
)
logisticRegr.fit(x_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000, random_state=5,
                   solver='liblinear', tol=0.001)

In [69]:
# Predicted labels for the held-out test matrix
predictions = logisticRegr.predict(X=x_test)

In [70]:
# Confusion matrix: rows are the true labels, columns are the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

[[472   0]
 [  5 102]]


In [71]:
# Fraction of test rows predicted correctly by the logistic regression
accuracy_score(y_true=y_test, y_pred=predictions)

0.9913644214162349

In [72]:
# Per-class precision / recall / F1 for the logistic regression
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       472
           1       1.00      0.95      0.98       107

    accuracy                           0.99       579
   macro avg       0.99      0.98      0.99       579
weighted avg       0.99      0.99      0.99       579



### Random Forest Model 

In [73]:
# Random forest of 100 entropy-criterion trees with balanced class weights.
# max_features=None lets every split consider all features, so the remaining
# randomness comes from the bootstrap sampling of rows. random_state pins that
# sampling so the metrics reported for this model are reproducible on re-run
# (the train/test split and the logistic regression are already seeded).
rand = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=None,
    class_weight='balanced',
    random_state=5,
)
rand.fit(x_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_features=None)

In [74]:
# Random forest predictions for the held-out test matrix
prediction2 = rand.predict(X=x_test)

In [75]:
# Print confusion matrix, accuracy and the per-class report for the random forest
for heading, value in (
    ('CONFUSION MATRIX', confusion_matrix(y_test, prediction2)),
    ('ACCURACY', accuracy_score(y_test, prediction2)),
    ('REPORT', classification_report(y_test, prediction2)),
):
    print('\n', heading, '\n', value)


 CONFUSION MATRIX 
 [[468   4]
 [  8  99]]

 ACCURACY 
 0.9792746113989638

 REPORT 
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       472
           1       0.96      0.93      0.94       107

    accuracy                           0.98       579
   macro avg       0.97      0.96      0.97       579
weighted avg       0.98      0.98      0.98       579



### SVM 

In [76]:
from sklearn import svm

# Linear SVM with balanced class weights. random_state is fixed (instead of None)
# so that the solver's random number generation is seeded and the fitted model is
# repeatable across runs, consistent with the seeded logistic regression.
SVMM = svm.LinearSVC(class_weight='balanced', verbose=0, random_state=5, max_iter=1000)

In [77]:
# Train the SVM on the training matrix and predict the test matrix in one chain
# (fit returns the estimator itself)
predictions3 = SVMM.fit(x_train, y_train).predict(x_test)

In [78]:
# Print confusion matrix, accuracy and the per-class report for the SVM
for heading, value in (
    ('CONFUSION MATRIX', confusion_matrix(y_test, predictions3)),
    ('ACCURACY', accuracy_score(y_test, predictions3)),
    ('REPORT', classification_report(y_test, predictions3)),
):
    print('\n', heading, '\n', value)


 CONFUSION MATRIX 
 [[472   0]
 [  7 100]]

 ACCURACY 
 0.9879101899827288

 REPORT 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       472
           1       1.00      0.93      0.97       107

    accuracy                           0.99       579
   macro avg       0.99      0.97      0.98       579
weighted avg       0.99      0.99      0.99       579



### Conclusion

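Logistic Regression is the best of the three models on this test split for detecting spam emails after converting the text into TF-IDF features: it reaches about 99.1% accuracy with 0 false positives and 5 false negatives for label 1, ahead of the linear SVM (about 98.8%) and the Random Forest (about 97.9%).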

In [79]:
# Storing the best model
import joblib

# The classifier was trained on TF-IDF features, so the fitted vectorizer is
# saved as well: without it, new emails cannot be converted into the feature
# columns the model expects.
joblib.dump(vectorizer, "email_spam_vectorizer.pkl")

# Saving the model as a pickle in a file. Kept as the last expression so the
# cell still displays the path of the written model file.
joblib.dump(logisticRegr, "email_spam_prediction.pkl")

['email_spam_prediction.pkl']

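We can load this saved model later for email spam detection. New text must first go through the same preprocessing (lowercasing, digit removal) and be transformed with the same fitted TF-IDF vectorizer before calling `predict`.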