In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
%matplotlib inline

In [2]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = df[['v1','v2']]

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sujay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [9]:
len(unique_words)

8508

In [10]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [11]:
vocab

{'KEEP': 0,
 'surely': 1,
 'bhayandar': 2,
 'il': 3,
 'lunch': 4,
 'mac': 5,
 'walking': 6,
 'sfirst': 7,
 'U': 8,
 'AL': 9,
 'dream': 10,
 'MSGWe': 11,
 'march': 12,
 'rinu': 13,
 'perspective': 14,
 'elections': 15,
 'flies': 16,
 'electricity': 17,
 'fixed': 18,
 'staffsciencenusedusgphyhcmkteachingpc1323': 19,
 'perf': 20,
 'idc': 21,
 'HAV': 22,
 'ah': 23,
 'islands': 24,
 'laurie': 25,
 'definitely': 26,
 'preferably': 27,
 'immunisation': 28,
 'assessment': 29,
 'calls': 30,
 'sculpture': 31,
 'fallen': 32,
 '4GET': 33,
 'needle': 34,
 'gooddhanush': 35,
 'marrgeremembr': 36,
 'Refused': 37,
 'å£79': 38,
 'delay': 39,
 'home': 40,
 'till': 41,
 'rubber': 42,
 'dosomething': 43,
 'Beauty': 44,
 'MONOC': 45,
 'Mob': 46,
 'å£200': 47,
 'talk': 48,
 'roommates': 49,
 'engin': 50,
 'yeovil': 51,
 'pub': 52,
 'Woods': 53,
 'Laughing': 54,
 'vl': 55,
 'doors': 56,
 '6230': 57,
 'laden': 58,
 'HI': 59,
 'Glad': 60,
 'SITUATION': 61,
 'Games': 62,
 'information': 63,
 'Cutter': 64,
 'Bee

In [12]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds

In [13]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2)

In [14]:
msg_train[:5]

[[5730, 449, 7189, 7177, 953, 7110, 1475, 5291, 1475],
 [2659, 8, 4339, 7655, 6671, 5176, 6135, 6524, 4266],
 [1068, 5915, 5472, 8245, 6940, 2919, 2919, 2919, 2919],
 [2417, 4504, 5922, 1300, 6310, 847, 5778, 273, 7718],
 [5458, 1675, 1450, 6163, 729, 2919, 2919, 2919, 2919]]

In [15]:
# improves accuracy
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

In [16]:
spam_detect_model = SVC()
spam_detect_model.fit(msg_train,label_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [17]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype=object)

In [18]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.9192825112107623
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96       969
        spam       1.00      0.38      0.55       146

    accuracy                           0.92      1115
   macro avg       0.96      0.69      0.76      1115
weighted avg       0.93      0.92      0.90      1115

[[969   0]
 [ 90  56]]
