Support Vector Machine Classification

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,confusion_matrix,precision_score,recall_score,accuracy_score
import chardet
from gensim import parsing

In [2]:
pwd

'D:\\HASSAN\\pandas'

In [3]:
df = pd.read_csv("spam.csv",encoding = ('ISO-8859-1'))

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
df.columns = ["Category","Content"]

In [10]:
df["Category"]=df.Category.map({'ham':0,'spam':1})

In [11]:
df.head()

Unnamed: 0,Category,Content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
dataset=df.values

In [13]:
np.random.shuffle(dataset)

In [14]:
X = dataset[:,1]
Y = dataset[:,0]
Y= Y.astype('int32')

In [15]:
for i in range(X.shape[0]):
    X[i] = parsing.stem_text(X[i].lower())

In [16]:
vectorizer = CountVectorizer()
X_transformed = vectorizer.fit_transform(X)

In [17]:
X_train = X_transformed[0:4000,:]
Y_train = Y[0:4000]
X_test = X_transformed[4000:,:]
Y_test = Y[4000:]

In [18]:
X_train.shape

(4000, 8264)

In [19]:
def print_metrics(Y_true, Y_predicted):
    print("Accuracy Score :" + str(accuracy_score(Y_true,Y_predicted)))
    print("Precision Score :" + str(precision_score(Y_true,Y_predicted)))
    print("Recall Score :" + str(recall_score(Y_true,Y_predicted)))
    print("ROC AUC Score :" + str(roc_auc_score(Y_true,Y_predicted)))
    print("Confusion Matrix : \n")
    print(confusion_matrix(Y_true,Y_predicted))

In [20]:
svm_clf = SVC(C=2000)
svm_clf.fit(X_train,Y_train)

In [21]:
Y_predicted_svm = svm_clf.predict(X_test)

In [22]:
print_metrics(Y_test, Y_predicted_svm)

Accuracy Score :0.9834605597964376
Precision Score :1.0
Recall Score :0.856353591160221
ROC AUC Score :0.9281767955801106
Confusion Matrix : 

[[1391    0]
 [  26  155]]


Naive Bayes Multinomial NB Classification

In [23]:
bayes_clf=MultinomialNB()
bayes_clf.fit(X_train,Y_train)

In [24]:
Y_predicted = bayes_clf.predict(X_test)

In [25]:
print_metrics(Y_test,Y_predicted)

Accuracy Score :0.982824427480916
Precision Score :0.9184782608695652
Recall Score :0.9337016574585635
ROC AUC Score :0.9614590242720567
Confusion Matrix : 

[[1376   15]
 [  12  169]]


Logistic Regression

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [29]:
from sklearn.linear_model import LogisticRegression

In [32]:
model = LogisticRegression()

In [33]:
model.fit(X_train,Y_train)

In [40]:
Y_predicted_LG = model.predict(X_test)

In [42]:
Accuracy_training = metrics.accuracy_score(Y_predicted_LG, Y_test)
Accuracy_training

0.9866412213740458

In [43]:
print_metrics(Y_test,Y_predicted_LG)

Accuracy Score :0.9866412213740458
Precision Score :0.9938271604938271
Recall Score :0.8895027624309392
ROC AUC Score :0.9443919275849878
Confusion Matrix : 

[[1390    1]
 [  20  161]]


In [44]:
Y_train_predicted = model.predict(X_train)

In [45]:
Train_accuracy = metrics.accuracy_score(Y_train_predicted,Y_train)

In [46]:
Train_accuracy

0.99775

In [47]:
print_metrics(Y_train_predicted,Y_train)

Accuracy Score :0.99775
Precision Score :0.9840989399293286
Recall Score :1.0
ROC AUC Score :0.9986930002904444
Confusion Matrix : 

[[3434    9]
 [   0  557]]
