# Support vector machine classification using the dataset (spam.csv)

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,confusion_matrix,precision_score,recall_score,accuracy_score

from nltk.stem import WordNetLemmatizer

In [2]:
import sys
sys.path.append('../')
from processing.read_spam_csv import read_csv_file

df = read_csv_file()

In [3]:
df.shape

(5573, 2)

In [4]:
df.head()

Unnamed: 0,Category,Content
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df["Category"]=df.Category.map({'ham':0,'spam':1})
df.head()

Unnamed: 0,Category,Content
1,0,"Go until jurong point, crazy.. Available only ..."
2,0,Ok lar... Joking wif u oni...
3,1,Free entry in 2 a wkly comp to win FA Cup fina...
4,0,U dun say so early hor... U c already then say...
5,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
dataset=df.values
np.random.shuffle(dataset)

In [7]:
X = dataset[:,1]
Y = dataset[:,0]
Y= Y.astype('int32')

In [8]:
from processing.fix_contents import separate_email

lemmatizer = WordNetLemmatizer()
for i in range(X.shape[0]):
    content = separate_email(X[i])[0]
    X[i] = (" ").join([lemmatizer.lemmatize(word.lower(), pos='v') for word in content.split(" ")])

In [9]:
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [10]:
X_train = X_transformed[0:4000,:]
Y_train = Y[0:4000]
X_test = X_transformed[4000:,:]
Y_test = Y[4000:]

In [11]:
X_train.shape

(4000, 8138)

In [12]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy Score :" + str(accuracy_score(Y_true,Y_predicted)))
    print("Precision Score :" + str(precision_score(Y_true,Y_predicted)))
    print("Recall Score :" + str(recall_score(Y_true,Y_predicted)))
    print("ROC AUC Score :" + str(roc_auc_score(Y_true,Y_predicted)))
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true,Y_predicted))

In [13]:
model = SVC(C=1000)
model.fit(X_train,Y_train)

In [14]:
Y_predicted_svm = model.predict(X_test)

In [15]:
print_metrics(Y_test, Y_predicted_svm)

Accuracy Score :0.9841068022886205
Precision Score :1.0
Recall Score :0.8863636363636364
ROC AUC Score :0.9431818181818181
Confusion Matrix : 

[[1353    0]
 [  25  195]]


In [16]:
import joblib

joblib.dump(model, '../../trained_models/SVM_Spam.joblib') # saving model for later use

['../../trained_models/SVM_Spam.pkl']