# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import spacy

# Uploading the dataset

In [2]:
path = "D:\\Work\\My work\\Message classification\\archive\\spam.csv"

In [4]:
dataset = pd.read_csv(path, encoding='latin-1')

# Preliminary viewing of the dataset

In [5]:
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
dataset['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

# Checking for null values and empty reviews

In [8]:
dataset.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [10]:
dataset.drop("Unnamed: 2", inplace = True, axis = 1)

In [11]:
dataset.drop("Unnamed: 3", inplace = True, axis = 1)

In [12]:
dataset.drop("Unnamed: 4", inplace = True, axis = 1)

In [13]:
dataset.isnull().sum()

v1    0
v2    0
dtype: int64

In [14]:
blank = []
for i, s, rv in dataset.itertuples():
    if rv.isspace()==True:
        blank.append(i)

In [15]:
blank

[]

# Splitting the dataset

### Into matrix of features and target variable

In [20]:
x = dataset.iloc[:, 1]
y = dataset.iloc[:, 0]

### Applying Spacy preprocessing

In [17]:
nlp = spacy.load("en_core_web_lg")

In [18]:
def preprocessing(text):
    doc = nlp(text)
    lemma = []
    for token in doc:
        if token.is_stop==False:
            lemma.append(token.lemma_)
    processed_text = " ".join(lemma)
    return processed_text

In [21]:
x = x.apply(preprocessing)

### Dividing into training and testing dataset

In [22]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

for i, (train_index, test_index) in enumerate(sss.split(x, y)):
    train = train_index
    test = test_index

In [24]:
i1 = []
for i in train:
    i1.append(i)
    
i2 = []
for i in test:
    i2.append(i)

x_train = x.iloc[i1]
x_test = x.iloc[i2]

In [25]:
y_train = y.iloc[i1]
y_test = y.iloc[i2]

# Applying transformations


In [26]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
y_train_encoded = lb.fit_transform(y_train)
y_test_encoded = lb.transform(y_test)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train_encoded = vectorizer.fit_transform(x_train)
x_test_encoded = vectorizer.transform(x_test)

# Building the models

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

### Decision Tree Classifier

In [29]:
from sklearn.tree import DecisionTreeClassifier
classifier1 = DecisionTreeClassifier()
classifier1.fit(x_train_encoded, y_train_encoded)

y_pred = classifier1.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[949  17]
 [ 27 122]]
The f1 score is 0.8472222222222222
The accuracy score is 0.9605381165919282
The classification report is: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       966
           1       0.88      0.82      0.85       149

    accuracy                           0.96      1115
   macro avg       0.93      0.90      0.91      1115
weighted avg       0.96      0.96      0.96      1115



### Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
classifier2 = LogisticRegression()
classifier2.fit(x_train_encoded, y_train_encoded)

y_pred = classifier2.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[965   1]
 [ 30 119]]
The f1 score is 0.8847583643122676
The accuracy score is 0.9721973094170404
The classification report is: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       0.99      0.80      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



### Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier3 = RandomForestClassifier(n_jobs=-1)
classifier3.fit(x_train_encoded, y_train_encoded)

y_pred = classifier3.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[965   1]
 [ 23 126]]
The f1 score is 0.9130434782608696
The accuracy score is 0.97847533632287
The classification report is: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



### Naive Bayes

In [32]:
from sklearn.naive_bayes import MultinomialNB
classifier4 = MultinomialNB()
classifier4.fit(x_train_encoded, y_train_encoded)

y_pred = classifier4.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[966   0]
 [ 34 115]]
The f1 score is 0.8712121212121211
The accuracy score is 0.9695067264573991
The classification report is: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



### K-NN

In [33]:
from sklearn.neighbors import KNeighborsClassifier
classifier5 = KNeighborsClassifier()
classifier5.fit(x_train_encoded, y_train_encoded)

y_pred = classifier5.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[966   0]
 [101  48]]
The f1 score is 0.4873096446700507
The accuracy score is 0.9094170403587444
The classification report is: 
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       966
           1       1.00      0.32      0.49       149

    accuracy                           0.91      1115
   macro avg       0.95      0.66      0.72      1115
weighted avg       0.92      0.91      0.89      1115



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### SVM

### Linear support vector

In [34]:
from sklearn.svm import LinearSVC
classifier6 = LinearSVC()
classifier6.fit(x_train_encoded, y_train_encoded)

y_pred = classifier6.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[962   4]
 [ 11 138]]
The f1 score is 0.9484536082474226
The accuracy score is 0.9865470852017937
The classification report is: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### Sigmoid SVC

In [35]:
from sklearn.svm import SVC
classifier7 = SVC(kernel = 'sigmoid')
classifier7.fit(x_train_encoded, y_train_encoded)

y_pred = classifier7.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[961   5]
 [ 11 138]]
The f1 score is 0.9452054794520547
The accuracy score is 0.9856502242152466
The classification report is: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### Voting Method

##### Soft Voting 

In [36]:
from sklearn.ensemble import VotingClassifier
classifier8 = VotingClassifier(
    estimators=[('lr', LogisticRegression()), ('svc', SVC(kernel = 'sigmoid', probability = True)), ('rf', RandomForestClassifier())],
    voting='soft')
classifier8.fit(x_train_encoded, y_train_encoded)

y_pred = classifier8.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[962   4]
 [ 13 136]]
The f1 score is 0.9411764705882352
The accuracy score is 0.9847533632286996
The classification report is: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



##### Hard Voting

In [37]:
from sklearn.ensemble import VotingClassifier
classifier9 = VotingClassifier(
 estimators=[('lr', LogisticRegression()), ('svc', LinearSVC()), ('rf', RandomForestClassifier())],
    voting='hard')
classifier9.fit(x_train_encoded, y_train_encoded)

y_pred = classifier9.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[965   1]
 [ 19 130]]
The f1 score is 0.9285714285714286
The accuracy score is 0.9820627802690582
The classification report is: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### Stacking

In [38]:
from sklearn.ensemble import StackingClassifier
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('svm', LinearSVC()))
level0.append(('bayes', MultinomialNB()))
level0.append(('random', RandomForestClassifier()))

level1 = LogisticRegression()

classifier10 = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
classifier10.fit(x_train_encoded, y_train_encoded)

y_pred = classifier10.predict(x_test_encoded)
print("The confusion matrix is given by: ")
print(confusion_matrix(y_test_encoded, y_pred))
print(f'The f1 score is {f1_score(y_test_encoded, y_pred)}')
print(f'The accuracy score is {accuracy_score(y_test_encoded, y_pred)}')
print("The classification report is: ")
print(classification_report(y_test_encoded, y_pred))

The confusion matrix is given by: 
[[961   5]
 [  9 140]]
The f1 score is 0.9523809523809523
The accuracy score is 0.9874439461883409
The classification report is: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.97      0.94      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

