In [206]:
import pandas as pd

In [207]:
# Read the raw CSV through an explicit text handle so that bytes which are not
# valid UTF-8 are dropped (errors='ignore') rather than aborting the load, and
# discard the three trailing columns in the same expression.
with open('spam.csv', 'r', encoding='utf-8', errors='ignore') as infile:
    spam_data = pd.read_csv(infile).iloc[:, :-3]
spam_data.shape

(5572, 2)

In [208]:
# Preview the first rows of the loaded frame (columns are still named v1/v2 here).
spam_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [209]:
# Give the two columns descriptive names: v1 holds the label, v2 the text.
spam_data = spam_data.rename(columns={'v1': 'class', 'v2': 'message'})

In [210]:
# Put the message text first and the label second.
columns_titles = ["message", "class"]
spam_data = spam_data.reindex(columns_titles, axis="columns")
spam_data.head()

Unnamed: 0,message,class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [211]:
# Per-column flag: True if the column contains at least one missing value.
spam_data.isna().any()

message    False
class      False
dtype: bool

In [212]:
# Count how many messages carry each label to see how balanced the classes are.
spam_data['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

In [213]:
from sklearn import preprocessing

# Replace the text labels in the 'class' column with integer codes.
le = preprocessing.LabelEncoder()
spam_data['class'] = le.fit_transform(spam_data['class'])

# Build and show a {label: code} lookup so the encoded values stay readable.
le_name_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(le_name_mapping)

{'ham': 0, 'spam': 1}


In [214]:
#Cleaning
# Normalise every message into a list of lowercase word tokens:
#   1. lowercase the text,
#   2. delete every run of characters that is neither a word character nor
#      whitespace (i.e. punctuation),
#   3. trim surrounding whitespace,
#   4. split on whitespace.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would treat the pattern as a
# literal string and silently leave all punctuation in place.
# NOTE: this cell overwrites 'message' with token lists, so it must only be
# run once per load of the data.
spam_data['message'] = (
    spam_data['message']
    .str.lower()
    .str.replace(r'[^\w\s]+', '', regex=True)
    .str.strip()
    .str.split()
)
spam_data['message'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: message, dtype: object

In [215]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['message'],
    spam_data['class'],
    random_state=1,
)

# Report how many messages ended up on each side of the split.
print('No. in training set: {}'.format(len(X_train)))
print('No. in test set: {}'.format(len(X_test)))

No. in training set: 4179
No. in test set: 1393


In [216]:
# Peek at the training messages; each entry is a list of tokens.
X_train.head()

710     [height, of, confidence, all, the, aeronautics...
3740                                      [22, 146tf150p]
2711    [wen, ur, lovable, bcums, angry, wid, u, dnt, ...
3155               [long, time, you, remember, me, today]
3748    [dear, voucher, holder, 2, claim, your, 1st, c...
Name: message, dtype: object

In [217]:
# Peek at the matching integer-encoded training labels.
y_train.head()

710     0
3740    1
2711    0
3155    0
3748    1
Name: class, dtype: int64

In [218]:
# CountVectorizer was used without ever being imported, so this cell raised
# NameError on a fresh kernel; import it here like the other sklearn helpers.
from sklearn.feature_extraction.text import CountVectorizer

# The messages are already lists of tokens, so the vectorizer is given an
# identity tokenizer and told not to lowercase (lowercasing expects strings).
# token_pattern=None states that the default regex tokenizer is unused and
# avoids scikit-learn's "token_pattern will not be used" warning.
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, token_pattern=None)

# Learn the vocabulary from the training messages only, then reuse it for the
# test messages so no information from the test set leaks into the features.
training_data = cv.fit_transform(X_train)
testing_data = cv.transform(X_test)

In [219]:
# Expand the test document-term matrix to a dense array purely for a visual
# check. NOTE(review): this materialises every cell; prefer a small slice if
# the matrix is large.
testing_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [220]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the token-count features, then
# display the fitted estimator.
model = MultinomialNB().fit(training_data, y_train)
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [221]:
# Predict an encoded class label for every message in the held-out test set.
predictions = model.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [222]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Print each evaluation score for the test-set predictions, one per line.
for label, scorer in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1: ', f1_score),
):
    print(label, format(scorer(y_test, predictions)))

Accuracy:  0.9827709978463748
Precision:  0.9698795180722891
Recall:  0.8944444444444445
F1:  0.930635838150289


In [235]:
#Checking on random data
str1 = 'I will give you xyz'
str2 = 'Hi how r u i'

d = {'message': [str1,str2]}
new_data = pd.DataFrame(d)

# Apply the same normalisation the training messages received (lowercase,
# strip punctuation, trim, split). Previously only split() was applied, so
# capitalised or punctuated tokens such as 'I' or 'Hi' could never match the
# lowercase vocabulary the vectorizer learned and were silently ignored.
new_data['message'] = (
    new_data['message']
    .str.lower()
    .str.replace(r'[^\w\s]+', '', regex=True)
    .str.strip()
    .str.split()
)

# Vectorize with the already-fitted vocabulary and classify.
s = cv.transform(new_data['message'])
model.predict(s)

array([0, 0])