<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [25]:
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [28]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [29]:
df.shape

(5572, 3)

In [30]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [32]:
X_train.shape

(4457,)

In [33]:
X_test.shape

(1115,)

In [34]:
type(X_train)

pandas.core.series.Series

In [35]:
X_train[:4]

2023    U can WIN £100 of Music Gift Vouchers every we...
1311    I.ll always be there, even if its just in spir...
3882    Can you plz tell me the ans. BSLVYL sent via f...
1094    Well the weather in cali's great. But its comp...
Name: Message, dtype: object

In [36]:
type(y_train)

pandas.core.series.Series

In [37]:
y_train[:4]

2023    1
1311    0
3882    0
1094    0
Name: spam, dtype: int64

In [38]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7746 sparse matrix of type '<class 'numpy.int64'>'
	with 58916 stored elements in Compressed Sparse Row format>

In [40]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
X_train_cv.shape

(4457, 7746)

In [42]:
v.get_feature_names_out()[1771]

'chikku'

In [43]:
v.vocabulary_

{'can': 1624,
 'win': 7523,
 '100': 254,
 'of': 4894,
 'music': 4646,
 'gift': 3127,
 'vouchers': 7326,
 'every': 2657,
 'week': 7441,
 'starting': 6457,
 'now': 4842,
 'txt': 7102,
 'the': 6828,
 'word': 7592,
 'draw': 2432,
 'to': 6949,
 '87066': 697,
 'tscs': 7072,
 'www': 7646,
 'ldew': 4038,
 'com': 1891,
 'skillgame': 6213,
 '1winaweek': 345,
 'age16': 868,
 '150ppermesssubscription': 308,
 'll': 4150,
 'always': 937,
 'be': 1267,
 'there': 6846,
 'even': 2652,
 'if': 3588,
 'its': 3753,
 'just': 3860,
 'in': 3628,
 'spirit': 6396,
 'get': 3115,
 'bb': 1253,
 'soon': 6327,
 'trying': 7069,
 'sure': 6639,
 'need': 4718,
 'it': 3746,
 'you': 7709,
 'plz': 5271,
 'tell': 6771,
 'me': 4396,
 'ans': 983,
 'bslvyl': 1532,
 'sent': 6025,
 'via': 7274,
 'fullonsms': 3035,
 'well': 7457,
 'weather': 7428,
 'cali': 1598,
 'great': 3232,
 'but': 1570,
 'complexities': 1928,
 'are': 1047,
 'car': 1644,
 'move': 4592,
 'freely': 2982,
 'taxes': 6738,
 'outrageous': 5022,
 'all': 919,
 'place'

In [44]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [45]:
np.where(X_train_np[0]!=0)

(array([ 254,  308,  345,  697,  868, 1624, 1891, 2432, 2657, 3127, 4038,
        4646, 4842, 4894, 6213, 6457, 6828, 6949, 7072, 7102, 7326, 7441,
        7523, 7592, 7646]),)

In [55]:
X_train[:4]

2023    U can WIN £100 of Music Gift Vouchers every we...
1311    I.ll always be there, even if its just in spir...
3882    Can you plz tell me the ans. BSLVYL sent via f...
1094    Well the weather in cali's great. But its comp...
Name: Message, dtype: object

In [47]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [48]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [49]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [50]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       979
           1       0.99      0.90      0.95       136

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [51]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [52]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [53]:
clf.fit(X_train, y_train)

In [54]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       979
           1       0.99      0.90      0.95       136

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

