<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [8]:
df.shape

(5572, 3)

In [9]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [12]:
X_train.shape

(4457,)

In [7]:
X_test.shape

(1115,)

In [13]:
type(X_train)

pandas.core.series.Series

In [14]:
X_train[:4]

407     All was well until slightly disastrous class t...
3166    Cheers for the card ... Is it that time of yea...
2947                        make that 3! 4 fucks sake?! x
4922    Its so common hearin How r u? Wat r u doing? H...
Name: Message, dtype: object

In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
y_train[:4]

407     0
3166    0
2947    0
4922    0
Name: spam, dtype: int64

In [17]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7732 sparse matrix of type '<class 'numpy.int64'>'
	with 59314 stored elements in Compressed Sparse Row format>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
X_train_cv

<4457x7732 sparse matrix of type '<class 'numpy.int64'>'
	with 59314 stored elements in Compressed Sparse Row format>

In [21]:
X_train_cv.shape

(4457, 7732)

In [22]:
v.get_feature_names_out()[11]

'02072069400'

In [23]:
v.vocabulary_

{'all': 934,
 'was': 7384,
 'well': 7445,
 'until': 7174,
 'slightly': 6220,
 'disastrous': 2354,
 'class': 1854,
 'this': 6846,
 'pm': 5278,
 'with': 7535,
 'my': 4683,
 'fav': 2804,
 'darlings': 2181,
 'hope': 3501,
 'day': 2195,
 'off': 4909,
 'ok': 4930,
 'coffee': 1905,
 'wld': 7551,
 'be': 1294,
 'good': 3199,
 'as': 1100,
 'can': 1654,
 'stay': 6453,
 'late': 4041,
 'tomorrow': 6953,
 'same': 5903,
 'time': 6894,
 'place': 5242,
 'always': 956,
 'cheers': 1785,
 'for': 2951,
 'the': 6808,
 'card': 1672,
 'is': 3759,
 'it': 3769,
 'that': 6804,
 'of': 4907,
 'year': 7670,
 'already': 947,
 'make': 4344,
 'fucks': 3043,
 'sake': 5892,
 'its': 3777,
 'so': 6269,
 'common': 1941,
 'hearin': 3392,
 'how': 3531,
 'wat': 7390,
 'doing': 2407,
 'ur': 7192,
 'let': 4100,
 'me': 4426,
 'ask': 1109,
 'something': 6292,
 'different': 2328,
 'did': 2315,
 'smile': 6240,
 'today': 6932,
 'if': 3614,
 'not': 4841,
 'do': 2387,
 'now': 4854,
 'gud': 3277,
 'evng': 2691,
 'dont': 2419,
 'worry':

In [24]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
np.where(X_train_np[0]!=0)

(array([ 934,  956, 1100, 1294, 1654, 1854, 1905, 2181, 2195, 2354, 2804,
        3199, 3501, 4041, 4683, 4909, 4930, 5242, 5278, 5903, 6220, 6453,
        6846, 6894, 6953, 7174, 7384, 7445, 7535, 7551], dtype=int64),)

In [28]:
X_train[:4]

407     All was well until slightly disastrous class t...
3166    Cheers for the card ... Is it that time of yea...
2947                        make that 3! 4 fucks sake?! x
4922    Its so common hearin How r u? Wat r u doing? H...
Name: Message, dtype: object

In [52]:
X_train_np[0][1771]

1

<h3>Train the naive bayes model</h3>

In [29]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [30]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [31]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       980
           1       0.97      0.91      0.94       135

    accuracy                           0.99      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [60]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [61]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [66]:
clf.fit(X_train, y_train)

In [64]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       0.98      0.93      0.95       139

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

