### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

### Importing Data

In [5]:
data = pd.read_csv('Datasets/spam.csv')

In [6]:
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data = data.replace(['ham', 'spam'], [0, 1])

In [8]:
data.head()

Unnamed: 0,Label,EmailText
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   int64 
 1   EmailText  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [10]:
data['Label'].value_counts()

0    4825
1     747
Name: Label, dtype: int64

In [13]:
corpus = []
ps = PorterStemmer()

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sachit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Processing Messages

In [15]:
for i in range(0, 5572):

    # Applying Regular Expression
    
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = data['EmailText'][i]
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['EmailText'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['EmailText'][i])
    msg = re.sub('([A-Z]{3}|[A-Z]?[\$€¥])?\s?(\d{1,3}((,\d{1,3})+)?(.\d{1,3})?(.\d{1,3})?(,\d{1,3})?)', 'moneysymb', data['EmailText'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['EmailText'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', data['EmailText'][i])
    
    ''' Remove all punctuations '''
    msg = re.sub('[^\w\d\s]', ' ', data['EmailText'][i])
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    
    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)
    
    # Each word to lower case
    msg = msg.lower()    
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)
    
    # Splitting words to Tokenize
    msg = msg.split()    
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)
    
    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)
    
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")
    
    # Preparing WordVector Corpus
    corpus.append(msg)

				 MESSAGE  0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting - Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 


				 MESSAGE  1

 After Regular Expression - Message  1  :  Ok lar    Joking wif u oni   

 Lower case Message  1  :  ok lar    joking wif u oni   

 After Splitting - Message  1  :  ['ok', 'lar', 'joking', 'wif', '

In [16]:
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
x.shape

(5572, 7220)

In [19]:
y = data['Label']

In [20]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Label, dtype: int64

## Splitting Data

In [21]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size= 0.20, random_state = 10)

## Training Model

### SVM

In [22]:
from sklearn import svm

In [23]:
svm_classifier = svm.SVC()

In [24]:
svm_classifier.fit(xtrain, ytrain)

SVC()

In [25]:
#Prediction

svm_pred = svm_classifier.predict(xtest)

### Results

In [27]:
# Evaluating
cm = confusion_matrix(ytest, svm_pred)

In [28]:
cm

array([[965,   0],
       [ 27, 123]])

In [29]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, svm_pred))

Accuracy : 0.97578 




## Naive Bayes

In [30]:
from sklearn.naive_bayes import GaussianNB

In [31]:
nb_classifier = GaussianNB()

In [32]:
nb_classifier.fit(xtrain, ytrain)

GaussianNB()

In [33]:
nb_pred = nb_classifier.predict(xtest)

In [35]:
nb_cm = confusion_matrix(ytest, nb_pred)
nb_cm

array([[838, 127],
       [ 13, 137]])

In [36]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, nb_pred))

Accuracy : 0.87444 




## Random Forest

In [37]:
rf_classifier = RandomForestClassifier()

In [39]:
rf_classifier.fit(xtrain, ytrain)

RandomForestClassifier()

In [41]:
rf_pred = rf_classifier.predict(xtest)

In [42]:
# Confusion matrix
rf_cm = confusion_matrix(ytest, rf_pred)
rf_cm

array([[965,   0],
       [ 34, 116]])

In [43]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, rf_pred))

Accuracy : 0.96951 




### Hence SVM gives the maximum score.

## <font color = "green"> Hyperparameter Tuning </font>

### <font color = "blue"> RandomizedSearchCV </font>

In [44]:
from sklearn.model_selection import RandomizedSearchCV

In [49]:
#Randomized Search CV

# Kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# Coefficient for kernel
gamma = ['auto', 'scale']
# Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
C = [1.0, 10.0, 100.0, 500.0, 1000.0]

In [57]:
# Create the random grid

random_grid = {'kernel': kernels,
               'gamma': gamma,
               'C': C}

In [58]:
# Random search of parameters, using 5 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = svm.SVC(), param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=10, n_jobs = 1)

In [59]:
rf_random.fit(xtrain,ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ......................C=1.0, gamma=auto, kernel=rbf; total time=  20.6s
[CV] END ......................C=1.0, gamma=auto, kernel=rbf; total time=  20.9s
[CV] END ......................C=1.0, gamma=auto, kernel=rbf; total time=  20.7s
[CV] END ......................C=1.0, gamma=auto, kernel=rbf; total time=  24.3s
[CV] END ......................C=1.0, gamma=auto, kernel=rbf; total time=  29.1s
[CV] END ................C=500.0, gamma=auto, kernel=sigmoid; total time=  11.1s
[CV] END ................C=500.0, gamma=auto, kernel=sigmoid; total time=  11.3s
[CV] END ................C=500.0, gamma=auto, kernel=sigmoid; total time=  10.9s
[CV] END ................C=500.0, gamma=auto, kernel=sigmoid; total time=  11.7s
[CV] END ................C=500.0, gamma=auto, kernel=sigmoid; total time=  11.4s
[CV] END ...............C=1000.0, gamma=auto, kernel=sigmoid; total time=  10.8s
[CV] END ...............C=1000.0, gamma=auto, ke

RandomizedSearchCV(cv=5, estimator=SVC(), n_jobs=1,
                   param_distributions={'C': [1.0, 10.0, 100.0, 500.0, 1000.0],
                                        'gamma': ['auto', 'scale'],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   random_state=10, scoring='neg_mean_squared_error',
                   verbose=2)

In [60]:
rf_random.best_params_

{'kernel': 'linear', 'gamma': 'auto', 'C': 500.0}

In [61]:
random_pred = rf_random.predict(xtest)

In [62]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, random_pred))

Accuracy : 0.98296 




### Accuracy score increased!!!

### <font color = "blue"> GridSearchCV </font>

In [63]:
from sklearn.model_selection import GridSearchCV

In [68]:
param_grid = {'kernel': ['linear'],
               'gamma': ['auto'],
               'C': [300.0, 350.0, 400.0, 450.0, 500.0, 550.0, 600.0, 650.0, 700.0, 750.0]}

In [69]:
grid_search = GridSearchCV(estimator = svm.SVC(), param_grid = param_grid, cv = 5, n_jobs= -1, verbose = 2)
grid_search.fit(xtrain, ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [300.0, 350.0, 400.0, 450.0, 500.0, 550.0, 600.0,
                               650.0, 700.0, 750.0],
                         'gamma': ['auto'], 'kernel': ['linear']},
             verbose=2)

In [70]:
best_grid = grid_search.best_estimator_
best_grid

SVC(C=300.0, gamma='auto', kernel='linear')

In [71]:
final_pred = best_grid.predict(xtest)

In [72]:
print ("Accuracy after GridSearchCV is: %0.5f \n\n" % accuracy_score(ytest, final_pred))

Accuracy after GridSearchCV is: 0.98296 




## Final Accuracy score is: 0.983