# This program will detect if an email is spam (1) or not (0)

In [1]:
#to avoid warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#Import the libraries :
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [3]:
# reads csv file
data=pd.read_csv("data/emails.csv")

In [4]:
# print the first 5 rows
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
data.shape

(5728, 2)

In [6]:
#To get the column names in the data set :
data.columns

Index(['text', 'spam'], dtype='object')

In [7]:
#checking and removing duplicates
data.drop_duplicates(inplace=True)
print(data.shape)

(5695, 2)


In [8]:
data.isna().sum()

text    0
spam    0
dtype: int64

In [9]:
# download the stopwords package
import nltk 
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/vishnu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean

In [11]:
# to show the tokenization
data['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [12]:
#convert the text into a matrix of token counts :
from sklearn.feature_extraction.text import CountVectorizer
message = CountVectorizer(analyzer=process).fit_transform(data['text'])

In [13]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, data['spam'], test_size=0.20, random_state=0)

In [14]:
# To see the shape of the data
print(message.shape)

(5695, 37229)


# LOGISTIC REGRESSION

In [15]:
from sklearn.linear_model import LogisticRegression
LR_classifier=LogisticRegression()
LR_classifier.fit(xtrain,ytrain)
ypred=LR_classifier.predict(xtest)
ypred

array([1, 0, 0, ..., 0, 0, 0])

In [16]:
LR_classifier.score(xtrain,ytrain)

1.0

In [17]:
LR_classifier.score(xtest,ytest)

0.9859525899912204

In [18]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [19]:
accuracy = accuracy_score(ytest,ypred)
accuracy

0.9859525899912204

In [20]:
lr_accuracy = accuracy*100
lr_accuracy

98.59525899912204

In [21]:
confusion_matrix(ytest,ypred)

array([[864,   6],
       [ 10, 259]])

In [22]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       870
           1       0.98      0.96      0.97       269

    accuracy                           0.99      1139
   macro avg       0.98      0.98      0.98      1139
weighted avg       0.99      0.99      0.99      1139



# Logistic Regression With Hyper parameter tuning

In [23]:
#hyper parameters
from sklearn.model_selection import GridSearchCV #this is used for hyper parameter tuning

solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty_values = ['l1', 'l2', 'elasticnet', 'none']
parameters = {'solver':solver_values , 'penalty': penalty_values }

In [24]:
linear_classifier = GridSearchCV(LR_classifier, parameters)
linear_classifier.fit(xtrain, ytrain)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']})

In [31]:
linear_classifier.best_estimator_

LogisticRegression(solver='newton-cg')

In [32]:
linear_classifier.best_params_

{'penalty': 'l2', 'solver': 'newton-cg'}

In [33]:
linear_classifier_final = LogisticRegression(penalty = 'l2',solver='lbfgs')
linear_classifier_final.fit(xtrain, ytrain)

LogisticRegression()

In [34]:
y_pred=linear_classifier_final.predict(xtest)

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,y_pred)

array([[864,   6],
       [ 10, 259]])

In [36]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytest,y_pred)
accuracy

0.9859525899912204

In [37]:
lrh_accuracy = accuracy*100
lrh_accuracy

98.59525899912204

# Ridge Classifier

In [39]:
from sklearn.linear_model import RidgeClassifier
ridge = RidgeClassifier()
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}
ridge_classifier = GridSearchCV(ridge, parameters)
ridge_classifier.fit(xtrain, ytrain)

GridSearchCV(estimator=RidgeClassifier(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]})

In [40]:
ridge_classifier.best_params_

{'alpha': 20}

In [41]:
ridge_classifier = RidgeClassifier(alpha=20)

ridge_classifier.fit(xtrain,ytrain)

RidgeClassifier(alpha=20)

In [42]:
y_pred_ridge=ridge_classifier.predict(xtest)

In [43]:
confusion_matrix(ytest,y_pred_ridge)

array([[857,  13],
       [ 31, 238]])

In [44]:
accuracy = accuracy_score(ytest,y_pred_ridge)
accuracy

0.961369622475856

In [45]:
rc_accuracy = accuracy*100 
rc_accuracy

96.1369622475856

# Random Forest Classifier

In [46]:
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
cls=RandomForestClassifier()

In [47]:
n_estimators=[25,50,75,100] 
criterion=['gini','entropy'] 
max_depth=[3,5,10] 
parameters={'n_estimators': n_estimators,'criterion':criterion,'max_depth':max_depth} 
RFC_cls = GridSearchCV(cls, parameters)
RFC_cls.fit(xtrain,ytrain)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 10],
                         'n_estimators': [25, 50, 75, 100]})

In [48]:
RFC_cls.best_params_

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 75}

In [49]:
from sklearn.ensemble import RandomForestClassifier
cls=RandomForestClassifier(n_estimators=50,criterion='entropy',max_depth=10)

In [50]:
cls.fit(xtrain,ytrain)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=50)

In [51]:
y_pred=cls.predict(xtest)

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,y_pred)

array([[870,   0],
       [183,  86]])

In [53]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytest,y_pred)
accuracy

0.839332748024583

In [54]:
rfc_accuracy = accuracy*100
rfc_accuracy

83.9332748024583

In [56]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       870
           1       0.98      0.96      0.97       269

    accuracy                           0.99      1139
   macro avg       0.98      0.98      0.98      1139
weighted avg       0.99      0.99      0.99      1139



# XG Boost

In [57]:
import xgboost as xgb
classifier = xgb.XGBClassifier()
classifier.fit(xtrain, ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [58]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(xtest)
cm = confusion_matrix(ytest, y_pred)
print(cm)
accuracy = accuracy_score(ytest, y_pred)
accuracy

[[866   4]
 [ 13 256]]


0.9850746268656716

In [59]:
xg_accuracy = accuracy*100
xg_accuracy

98.50746268656717

In [60]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = xtrain, y = ytrain, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

Accuracy: 98.55 %


# XG Boost Hyperparameter Tuning

In [64]:
data_dmatrix = xgb.DMatrix(data=message,label=data['spam'])

In [65]:
params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=5000, early_stopping_rounds=5, metrics="error", as_pandas=True, seed=123)

In [66]:
cv_results

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.099122,0.006105,0.100263,0.009062
1,0.079894,0.006494,0.08288,0.015298
2,0.096925,0.019932,0.108698,0.02514
3,0.07208,0.014212,0.074452,0.021117
4,0.070674,0.01679,0.082005,0.014856
5,0.066899,0.014987,0.077439,0.011421
6,0.067162,0.015981,0.078844,0.011569
7,0.061807,0.018895,0.065497,0.018266
8,0.060753,0.015251,0.071293,0.011076
9,0.064441,0.01896,0.073576,0.014087


In [67]:
from sklearn.model_selection import GridSearchCV

In [68]:
parameters={"objective":["binary:logistic"],'colsample_bytree': [0.3],'learning_rate': [0.1],
                'max_depth': [5], 'alpha': [10],'n_estimators':[6]}

In [69]:
xgb_final = GridSearchCV(classifier, parameters)
xgb_final.fit(xtrain,ytrain)

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact', validate_parameters=1,
                            

In [70]:
xgb_final.predict(xtest)

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = xgb_final.predict(xtest)
cm = confusion_matrix(ytest, y_pred)
print(cm)
accuracy = accuracy_score(ytest, y_pred)
accuracy

[[858  12]
 [ 73 196]]


0.9253731343283582

In [72]:
xght_accuracy = accuracy*100
xght_accuracy

92.53731343283582

# NAIVE BAYES

In [73]:
# create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)

In [74]:
#To see the classifiers prediction and actual values on the data set :
print(classifier.predict(xtrain))
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [75]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3445   12]
 [   1 1098]]
Accuracy: 
 0.9971466198419666


In [76]:
#print the predictions
print(classifier.predict(xtest))

[1 0 0 ... 0 0 0]


In [77]:
#print the actual values
print(ytest.values)

[1 0 0 ... 0 0 0]


In [78]:
# Evaluating the model on the training data set
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[862   8]
 [  1 268]]
Accuracy: 
 0.9920983318700615


# SMOTE

In [79]:
from sklearn.model_selection import RandomizedSearchCV, KFold
kf = KFold(n_splits = 10, shuffle = True)

In [80]:
params = {'C':[0.0001, 0.001, 0.01, 0.1, 1, 1, 2, 3, 4, 5, 10, 20, 50],
          'tol':[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]}

In [83]:
search = GridSearchCV(LR_classifier, params, cv = kf.split(message,data['spam'] ), n_jobs = -1, scoring = 'f1')

In [85]:
search.fit(message,data['spam'])

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7ff42ce4f890>,
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 1, 2, 3, 4, 5, 10,
                               20, 50],
                         'tol': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
             scoring='f1')

In [86]:
#Oversampling the train_set to overcome imbalanced data
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(xtrain, ytrain)

In [87]:
model_LR = search.best_estimator_

In [88]:
model_LR.fit(X_res, y_res)

LogisticRegression(C=3, tol=1)

In [89]:
y_pred5 = model_LR.predict(xtest)

In [90]:
pred = model_LR.predict(xtest)

In [91]:
from sklearn.metrics import f1_score
f1_score(ytest, y_pred5)

0.9504587155963302

In [92]:
accuracy = accuracy_score(ytest,pred)
accuracy

0.9762949956101844

In [93]:
smote_accuracy = accuracy*100
smote_accuracy

97.62949956101843

In [94]:
#saving the model
import joblib
joblib.dump(model_LR, 'model/Spam.pkl')

['model/Spam.pkl']