In [11]:
import numpy as np
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [13]:
# Loading the data
# Only the first two columns are used: the label and the message text.
df = pd.read_csv('spam.csv', encoding='latin-1').iloc[:, :2]
df.columns = ['Spam', 'Text']
# Encode the label numerically: ham -> 0, spam -> 1.
df['Spam'] = df['Spam'].map({'ham': 0, 'spam': 1})
df

Unnamed: 0,Spam,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Processing

In [14]:
# Missing
# Number of null entries in each column
df.isna().sum()

Spam    0
Text    0
dtype: int64

In [15]:
# Remove exact duplicate rows (same label and same text).
# Rebinding the name avoids mutating in place the frame that earlier cells displayed.
df = df.drop_duplicates()

In [16]:
# (rows, columns) of df after the duplicate rows were dropped
df.shape

(5169, 2)

In [17]:
# download stopwords
# Fetches the NLTK 'stopwords' corpus that processText reads through stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
# Process text
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def processText(text, stop_words=None):
    """Split a message into tokens with punctuation and stop words removed.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-cased words to discard. When omitted, NLTK's English stop-word
        list is used (the 'stopwords' corpus must have been downloaded).

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in their original order and
        casing. Punctuation is stripped *before* stop-word matching, so a word
        such as "don't" is compared as "dont".
    """
    if stop_words is None:
        # Cached so the corpus is loaded once, not once per token or message.
        stop_words = _english_stopwords()

    # remove punctuations
    nopunc = ''.join(c for c in text if c not in string.punctuation)

    # remove stopwords (case-insensitive match, original casing kept)
    return [w for w in nopunc.split() if w.lower() not in stop_words]

# CountVectorizer Explanation

In [19]:
# CountVectorizer example

msg1 = 'Hi Hi Hi world world'
msg2 = 'We we we fine fine'

# Each document handed to the vectorizer must be a plain string, because
# processText walks over it character by character.
bow = CountVectorizer(analyzer=processText).fit_transform([msg1, msg2]) # bag of words = bow
print(bow,'\n\n\n BOW shape : ', bow.shape)

  (0, 0)	3
  (0, 2)	2
  (1, 1)	2 


 BOW shape :  (2, 3)


In [20]:
# Distinct tokens left in the two example messages after cleaning
set(processText(msg1)) | set(processText(msg2))

{'Hi', 'fine', 'world'}

In [21]:
# The number of bag-of-words columns equals the number of distinct cleaned tokens
bow.shape[1] == len(set(processText(msg1)) | set(processText(msg2)))

True

# Model
Back to the actual task: build the bag-of-words matrix for the full dataset and train the classifier.

In [22]:
# Bag of words matrix
# Keep the fitted vectorizer: its vocabulary is needed to map new, unseen
# messages onto the same feature columns the model is trained on.
# NOTE(review): the vocabulary is learned from every row of df.Text, including
# the rows that are later held out as the test split.
vectorizer = CountVectorizer(analyzer=processText)
bow = vectorizer.fit_transform(df.Text)

In [23]:
# (number of messages, number of distinct tokens) in the bag-of-words matrix
bow.shape

(5169, 11304)

In [24]:
# Train test split
# 20% of the rows are held out for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    bow,
    df['Spam'],
    test_size=0.20,
    random_state=42,
)
print(f'X_train, X_test, y_train, y_test shapes are : {X_train.shape, X_test.shape, y_train.shape, y_test.shape}')

X_train, X_test, y_train, y_test shapes are : ((4135, 11304), (1034, 11304), (4135,), (1034,))


In [25]:
# Model
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB()

In [26]:
# Prediction
# Predicted labels (0 = ham, 1 = spam) for the held-out test messages
preds = model.predict(X_test)

In [27]:
# Evaluate on the test split: per-class precision/recall/F1, the confusion
# matrix, and the overall accuracy, all printed as one block of text.
print(f'Classification report : \n {classification_report(y_test, preds)} \
\nConfusion matrix \n{confusion_matrix(y_test, preds)}\n\
Accuracy score : {accuracy_score(y_test, preds)}')


Classification report : 
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       889
           1       0.83      0.94      0.88       145

    accuracy                           0.97      1034
   macro avg       0.91      0.96      0.93      1034
weighted avg       0.97      0.97      0.97      1034
 
Confusion matrix 
[[861  28]
 [  8 137]]
Accuracy score : 0.965183752417795


In [28]:
# Score on the training split, for comparison with the test accuracy above
model.score(X_train, y_train)

0.9958887545344619

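The model reaches about 96.5% accuracy on the test set and 99.6% on the training set. Quite good, although spam precision (0.83) is lower than spam recall (0.94): 28 ham messages were flagged as spam, while only 8 spam messages were missed. Happy coding.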