In [94]:
# Turn off Jedi-backed tab completion in the IPython completer.
%config Completer.use_jedi = False

In [95]:
import pandas as pd
import numpy as np
from plotly import express as plt

In [253]:
# Load the SMS dataset; the file is decoded as Latin-1 rather than the default UTF-8.
df = pd.read_csv('./datalab/spam.csv', encoding='latin1')

In [254]:
# Preview the first two rows to inspect the raw column layout.
df.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


In [255]:
# Drop the three 'Unnamed: N' columns so that only the label (v1) and the
# message text (v2) remain.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' is passed, so this cell can be re-run safely: a second
# in-place drop would raise KeyError because the columns are already gone.
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')

In [256]:
# Class balance: fraction of messages carrying each label.
df['v1'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: v1, dtype: float64

In [257]:
# Summarise columns, dtypes and non-null counts; memory_usage='deep' makes
# pandas inspect the object (string) columns for their real memory footprint.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 774.3 KB


In [101]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [258]:
# English stop words from NLTK, held in a set so the membership tests done in
# clean_text are constant-time. Needs the NLTK 'stopwords' corpus to be
# available locally (nltk.download('stopwords')).
# No module-level `filtered_sentence` / `word_tokens` placeholders are defined
# here: clean_text binds its own locals with those names, so globals of the
# same name would never be read.
stop_words = set(stopwords.words('english'))

In [259]:
def clean_text(x):
    """Normalise one message into a space-separated string of content words.

    The text is tokenised with NLTK's ``word_tokenize``; tokens that are not
    purely alphabetic (numbers, punctuation, mixed tokens) are discarded, the
    rest are lower-cased, and any token found in the module-level
    ``stop_words`` set is removed.

    Parameters
    ----------
    x : str
        Raw message text.

    Returns
    -------
    str
        The surviving lower-cased tokens joined by single spaces; an empty
        string when nothing survives.
    """
    kept = []
    for token in word_tokenize(x):
        lowered = token.lower()
        # The stop-word list is lower-case, so compare the lower-cased form;
        # comparing the original token let capitalised stop words ("The",
        # "AND") through unless the caller had lower-cased the text first.
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

In [260]:
# Lower-case every message with the vectorized string accessor. Unlike a
# per-row lambda calling x.lower(), .str.lower() passes missing values
# through as NaN instead of raising AttributeError.
df['v2'] = df['v2'].str.lower()

In [261]:
# Replace each message with its cleaned form (tokenised, alphabetic-only,
# stop words removed) by mapping clean_text over the column element-wise.
df['v2'] = df['v2'].map(clean_text)

In [262]:
# Spot-check the first rows after cleaning.
df.head()

Unnamed: 0,v1,v2
0,ham,go jurong point available bugis n great world ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though


In [117]:
from collections import Counter

In [132]:
# Empty Counter.
# NOTE(review): `count` is not read anywhere else in the visible notebook; the
# token frequencies are held in the separate `counter` variable — confirm
# whether this cell is still needed.
count = Counter()

In [136]:
# Message text of the rows labelled 'spam', as a NumPy array.
data = df.loc[df['v1'] == 'spam', 'v2'].values

In [148]:
# Flatten the word tokens of the first 500 spam messages into one list,
# preserving message order and token order within each message.
tokens = [
    word
    for message in data[:500]
    for word in word_tokenize(message)
]

In [149]:
# Tally how many times each token occurs in `tokens`.
counter = Counter(tokens)

In [154]:
# Ten most frequent tokens with their counts, highest first.
counter.most_common(10)

[('call', 217),
 ('free', 149),
 ('txt', 121),
 ('ur', 106),
 ('u', 102),
 ('text', 80),
 ('claim', 74),
 ('mobile', 73),
 ('stop', 71),
 ('reply', 66)]

In [164]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with scikit-learn's default settings; its
# vocabulary is learned later, when it is fitted on the training messages.
vectorizer = CountVectorizer()

In [263]:
# Feature input: the message-text column as a NumPy array.
X = df['v2'].values

In [264]:
# Target: the label column ('ham' / 'spam') as a NumPy array.
Y = df['v1'].values

In [167]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [269]:
from sklearn.naive_bayes import MultinomialNB

In [270]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB() 

In [419]:
# Hold out 25% of the messages for evaluation; random_state fixes the shuffle
# so the split is reproducible.
# NOTE(review): the split is not stratified although the classes are roughly
# 87% ham / 13% spam — consider stratify=Y if class proportions should match
# across train and test.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=47)

In [420]:
# Learn the vocabulary from the training messages only and encode them as a
# document-term count matrix.
tf_train = vectorizer.fit_transform(X_train.ravel())

In [421]:
# Encode the held-out messages with the vocabulary already fitted on the
# training set (no re-fitting here).
tf_test = vectorizer.transform(X_test.ravel())

In [422]:
# Show (documents, vocabulary size) for the train and test matrices, one per line.
print(tf_train.shape, tf_test.shape, sep='\n')

(4179, 6057)
(1393, 6057)


In [423]:
# Fit the Naive Bayes model on the training counts, predict the held-out
# messages, and report plain accuracy.
pred = clf.fit(tf_train, y_train.ravel()).predict(tf_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy:   %0.2f" % (score,))

accuracy:   0.98


In [424]:
# Confusion matrix of true vs. predicted labels on the held-out set.
# Per scikit-learn's documented convention, rows are true labels and columns
# are predicted labels, in sorted label order (here 'ham', then 'spam').
cm = confusion_matrix(y_test, pred)

In [425]:
# Display the confusion matrix computed for the current predictions.
cm

array([[1202,   13],
       [  17,  161]], dtype=int64)

In [426]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [427]:
# Second model: a Passive-Aggressive linear classifier, trained and evaluated
# on the same count matrices. Note that pred, score and cm are rebound here,
# replacing the values produced for the Naive Bayes model.
linear_clf = PassiveAggressiveClassifier(random_state=41, max_iter=30)
pred = linear_clf.fit(tf_train, y_train.ravel()).predict(tf_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy:   %0.2f" % (score,))
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

accuracy:   0.98


array([[1211,    4],
       [  25,  153]], dtype=int64)