In [1]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [7]:
spam_dataframe = pd.read_csv("spam_ham_dataset.csv")
spam_dataframe.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [8]:
spam_dataframe.drop("Unnamed: 0", axis=1, inplace=True)
spam_dataframe.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [9]:
spam_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5171 non-null   object
 1   text       5171 non-null   object
 2   label_num  5171 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 121.3+ KB


In [10]:
# We can covert "label" columns with pd.get_dummies to get 0 and 1
# but we already got a necessary column
# spam_dataframe["label"] = pd.get_dummies(spam_dataframe["label"], dtype='int', drop_first=True)

In [12]:
label_encoder = LabelEncoder()
spam_dataframe["text"] = label_encoder.fit_transform(spam_dataframe["text"])

In [None]:
spam_dataframe.head()

Unnamed: 0,label,text,label_num
0,ham,1209,0
1,ham,1985,0
2,ham,2774,0
3,spam,3158,1
4,ham,3613,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(spam_dataframe[["text"]],
                                                    spam_dataframe["label_num"],
                                                    test_size=0.15)

In [16]:
spam_model = SVC()
spam_model.fit(X_train, y_train)

In [17]:
model_predictions = spam_model.predict(X_test)
print(confusion_matrix(y_test, model_predictions))

[[563   0]
 [211   2]]


In [18]:
print(classification_report(y_test, model_predictions))

              precision    recall  f1-score   support

           0       0.73      1.00      0.84       563
           1       1.00      0.01      0.02       213

    accuracy                           0.73       776
   macro avg       0.86      0.50      0.43       776
weighted avg       0.80      0.73      0.62       776



In [20]:
# We can see that our model poorly predicted our spam text
# 211(FN) wrong and 2(TN) correct for spam

In [21]:
# We will use GridSearchCV to find the best parameters for our model
parameters_grid = {"C": [0.01, 0.1, 1, 10, 100],
                   "gamma": [10, 1, 0.1, 0.01, 0.001],
                   "kernel": ["rbf"]
                   }

In [22]:
grid = GridSearchCV(SVC(), parameters_grid, verbose=10, refit=True)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5; 1/25] START C=0.01, gamma=10, kernel=rbf...............................
[CV 1/5; 1/25] END C=0.01, gamma=10, kernel=rbf;, score=0.708 total time=   0.8s
[CV 2/5; 1/25] START C=0.01, gamma=10, kernel=rbf...............................
[CV 2/5; 1/25] END C=0.01, gamma=10, kernel=rbf;, score=0.708 total time=   0.8s
[CV 3/5; 1/25] START C=0.01, gamma=10, kernel=rbf...............................
[CV 3/5; 1/25] END C=0.01, gamma=10, kernel=rbf;, score=0.708 total time=   0.8s
[CV 4/5; 1/25] START C=0.01, gamma=10, kernel=rbf...............................
[CV 4/5; 1/25] END C=0.01, gamma=10, kernel=rbf;, score=0.708 total time=   0.8s
[CV 5/5; 1/25] START C=0.01, gamma=10, kernel=rbf...............................
[CV 5/5; 1/25] END C=0.01, gamma=10, kernel=rbf;, score=0.706 total time=   0.9s
[CV 1/5; 2/25] START C=0.01, gamma=1, kernel=rbf................................
[CV 1/5; 2/25] END .C=0.01, gamma=1, kernel=rbf

In [25]:
# The best parameters for our model
grid.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [26]:
grid_model_predictions = grid.predict(X_test)

In [27]:
print(confusion_matrix(y_test, grid_model_predictions))

[[521  42]
 [ 92 121]]


In [28]:
print(classification_report(y_test, grid_model_predictions))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       563
           1       0.74      0.57      0.64       213

    accuracy                           0.83       776
   macro avg       0.80      0.75      0.76       776
weighted avg       0.82      0.83      0.82       776



In [29]:
# We can see that our model did overall better, 
# but we can improve it with more data rows
# Data is borrowed from Kaggle (Spam Mails Dataset)