# Support vector machine classification using the dataset (spam.csv)

In [29]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,confusion_matrix,precision_score,recall_score,accuracy_score
import chardet
from gensim import parsing

In [30]:
df = pd.read_csv("../datasets/kaggle_email_spam/spam.csv",encoding = ('ISO-8859-1'))

In [31]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [32]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [33]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)

In [34]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
df.columns = ["Category","Content"]

In [36]:
df["Category"]=df.Category.map({'ham':0,'spam':1})

In [37]:
df.head()

Unnamed: 0,Category,Content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
dataset=df.values

In [39]:
np.random.shuffle(dataset)

In [40]:
X = dataset[:,1]
Y = dataset[:,0]
Y= Y.astype('int32')

In [41]:
for i in range(X.shape[0]):
    X[i] = parsing.stem_text(X[i].lower())

In [42]:
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [43]:
X_train = X_transformed[0:4000,:]
Y_train = Y[0:4000]
X_test = X_transformed[4000:,:]
Y_test = Y[4000:]

In [44]:
X_train.shape

(4000, 8264)

In [45]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy Score :" + str(accuracy_score(Y_true,Y_predicted)))
    print("Precision Score :" + str(precision_score(Y_true,Y_predicted)))
    print("Recall Score :" + str(recall_score(Y_true,Y_predicted)))
    print("ROC AUC Score :" + str(roc_auc_score(Y_true,Y_predicted)))
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true,Y_predicted))

In [46]:
svm_clf = SVC(C=2000)
svm_clf.fit(X_train,Y_train)

SVC(C=2000)

In [47]:
Y_predicted_svm = svm_clf.predict(X_test)

In [48]:
print_metrics(Y_test, Y_predicted_svm)

Accuracy Score :0.9783715012722646
Precision Score :0.9888888888888889
Recall Score :0.8476190476190476
ROC AUC Score :0.9230753094189218
Confusion Matrix : 

[[1360    2]
 [  32  178]]
