<h2 align='center'>Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

1148    Ok... Help me ask if she's working tmr a not?
20          Is that seriously how you spell his name?
291             Hey you told your name to gautham ah?
5511            It‘s reassuring, in this crazy world.
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

1148    0
20      0
291     0
5511    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7824 sparse matrix of type '<class 'numpy.int64'>'
	with 59290 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
X_train_cv.shape

(4457, 7675)

In [37]:
v.get_feature_names_out()[1107]

'ask'

In [23]:
v.vocabulary_

{'ok': 4980,
 'help': 3465,
 'me': 4466,
 'ask': 1107,
 'if': 3673,
 'she': 6135,
 'working': 7669,
 'tmr': 6999,
 'not': 4891,
 'is': 3819,
 'that': 6878,
 'seriously': 6086,
 'how': 3590,
 'you': 7785,
 'spell': 6444,
 'his': 3508,
 'name': 4749,
 'hey': 3487,
 'told': 7019,
 'your': 7791,
 'to': 7005,
 'gautham': 3144,
 'ah': 882,
 'it': 3830,
 'reassuring': 5669,
 'in': 3715,
 'this': 6924,
 'crazy': 2090,
 'world': 7672,
 'then': 6894,
 'cant': 1659,
 'get': 3175,
 'da': 2168,
 'laptop': 4086,
 'my': 4728,
 'matric': 4447,
 'card': 1669,
 'wif': 7583,
 'lei': 4142,
 'freemsg': 3041,
 'today': 7012,
 'the': 6882,
 'day': 2210,
 'are': 1060,
 'ready': 5652,
 'horny': 3569,
 'live': 4222,
 'town': 7069,
 'love': 4295,
 'sex': 6101,
 'fun': 3098,
 'games': 3126,
 'netcollex': 4807,
 'ltd': 4316,
 '08700621170150p': 74,
 'per': 5215,
 'msg': 4684,
 'reply': 5771,
 'stop': 6562,
 'end': 2625,
 'mm': 4592,
 'yes': 7769,
 'dear': 2220,
 'look': 4264,
 'am': 943,
 'hugging': 3612,
 'both':

In [26]:
# Conversion to numpy array
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
np.where(X_train_np[0]!=0)

(array([1107, 3465, 3673, 4466, 4891, 4980, 6135, 6999, 7669], dtype=int64),)

In [39]:
    X_train[:4]

1148    Ok... Help me ask if she's working tmr a not?
20          Is that seriously how you spell his name?
291             Hey you told your name to gautham ah?
5511            It‘s reassuring, in this crazy world.
Name: Message, dtype: object

In [41]:
X_train_np[0][1107]

1

<h3>Train the naive bayes model</h3>

In [42]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [43]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [44]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.97      0.94      0.95       147

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [45]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [46]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [47]:
clf.fit(X_train, y_train)

In [48]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.97      0.94      0.95       147

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

