# NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled messages; the preview below shows a Category label
# column and a Message text column.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance check: how many messages fall under each Category label.
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Encode the label as a binary target: 1 for 'spam', 0 for anything else.
# The vectorized comparison replaces a per-row Python lambda; the explicit
# int64 cast keeps the same dtype the original .apply() produced.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split -- and every metric computed
# from it further down -- is reproducible on Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of messages in the training portion (80% of the data).
X_train.shape

(4457,)

In [None]:
# Number of messages held out for testing (20% of the data).
X_test.shape

(1115,)

In [None]:
# Inspect the container type returned by train_test_split for the features.
type(X_train)

In [None]:
# Peek at the first four training messages (original row labels are kept).
X_train.head(4)

Unnamed: 0,Message
3462,K.. I yan jiu liao... Sat we can go 4 bugis vi...
457,"LOOK AT AMY URE A BEAUTIFUL, INTELLIGENT WOMAN..."
353,TODAY is Sorry day.! If ever i was angry with ...
930,money!!! you r a lucky winner ! 2 claim your p...


In [None]:
# Inspect the container type returned by train_test_split for the labels.
type(y_train)

In [None]:
# Labels (1 = spam, 0 = ham) for the same first four training rows.
y_train.head(4)

Unnamed: 0,spam
3462,0
457,0
353,0
930,1


In [None]:
# .values exposes the underlying array of message strings; this is what is
# handed to the vectorizer below.
type(X_train.values)

numpy.ndarray

### Create Bag Of Words representation using CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings.
v = CountVectorizer()


# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix (one row per message, one column per token).
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59136 stored elements and shape (4457, 7736)>

In [None]:
# Dense count vector of the first training message.
# Slice the single row while it is still sparse, then densify only that row,
# instead of materialising the full document-term matrix just to show one row.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# (number of training messages, vocabulary size)
X_train_cv.shape

(4457, 7736)

In [None]:
# Look up which token corresponds to column index 1771 of the count matrix.
v.get_feature_names_out()[1771]

'chgs'

In [None]:
# Token -> column-index mapping learned during fit.
# NOTE(review): this displays the entire vocabulary (one entry per column of
# X_train_cv), which makes for a very long output; consider showing a slice.
v.vocabulary_

{'yan': 7663,
 'jiu': 3829,
 'liao': 4098,
 'sat': 5902,
 'we': 7400,
 'can': 1623,
 'go': 3155,
 'bugis': 1544,
 'vill': 7279,
 'one': 4943,
 'frm': 2999,
 '10': 245,
 'to': 6926,
 'den': 2235,
 'hop': 3481,
 'parco': 5081,
 'nb': 4717,
 'sun': 6603,
 'cine': 1812,
 '1030': 252,
 'orc': 4985,
 'mrt': 4620,
 'hip': 3432,
 'at': 1110,
 'look': 4198,
 'amy': 948,
 'ure': 7192,
 'beautiful': 1274,
 'intelligent': 3702,
 'woman': 7561,
 'and': 953,
 'like': 4116,
 'lot': 4217,
 'know': 3961,
 'don': 2397,
 'me': 4413,
 'that': 6807,
 'so': 6277,
 'worry': 7591,
 'today': 6932,
 'is': 3748,
 'sorry': 6322,
 'day': 2175,
 'if': 3597,
 'ever': 2658,
 'was': 7374,
 'angry': 956,
 'with': 7533,
 'you': 7695,
 'misbehaved': 4510,
 'or': 4979,
 'hurt': 3555,
 'plz': 5257,
 'just': 3876,
 'slap': 6209,
 'urself': 7200,
 'bcoz': 1262,
 'its': 3767,
 'ur': 7190,
 'fault': 2780,
 'basically': 1235,
 'good': 3179,
 'money': 4574,
 'lucky': 4261,
 'winner': 7516,
 'claim': 1818,
 'your': 7701,
 'prize'

In [None]:
# Dense copy of the count matrix, kept for the element-level inspection in the
# following cells (the sparse form is what the model is trained on).
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Column indices of the tokens that actually occur in the first training message.
np.nonzero(X_train_np[0])

(array([ 245,  252, 1110, 1544, 1623, 1812, 2235, 2999, 3155, 3432, 3481,
        3829, 4098, 4620, 4717, 4943, 4985, 5081, 5902, 6603, 6926, 7279,
        7400, 7663]),)

In [None]:
# Show the first four training messages again to compare against the token
# indices found above.
X_train.head(4)

Unnamed: 0,Message
3462,K.. I yan jiu liao... Sat we can go 4 bugis vi...
457,"LOOK AT AMY URE A BEAUTIFUL, INTELLIGENT WOMAN..."
353,TODAY is Sorry day.! If ever i was angry with ...
930,money!!! you r a lucky winner ! 2 claim your p...


In [None]:
# Full, untruncated text of the second training message.
X_train.values[1]

'LOOK AT AMY URE A BEAUTIFUL, INTELLIGENT WOMAN AND I LIKE U A LOT. I KNOW U DON\x92T LIKE ME LIKE THAT SO DON\x92T WORRY.'

In [None]:
# Count stored at vocabulary column 245 for the first training message.
X_train_np[0, 245]

np.int64(1)

### Train the Naive Bayes model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained directly on the
# sparse token-count matrix and the 0/1 spam labels.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
# Encode the test messages with the already-fitted vectorizer (transform only,
# no re-fit) so the test matrix has the same columns as the training matrix.
X_test_cv = v.transform(X_test)

### Evaluate Performance

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out messages.
y_pred = model.predict(X_test_cv)

# Per-class precision / recall / F1 (class 1 = spam, class 0 = ham).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Hand-written examples to sanity-check the trained model.
emails = [
    'Hey, do you come to the office tomorrow?',
    'You won 200 million dollars! Dont miss this reward!',
    'Hey, you are fired!',
]

# New text must go through the same fitted vectorizer before prediction.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1, 0])

### Train the model using an sklearn Pipeline to reduce the number of lines of code

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification into a single estimator so raw text
# can be passed straight to fit/predict.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
# Fit on the raw training text; the pipeline runs the vectorizer step and then
# the Naive Bayes step.
clf.fit(X_train, y_train)

In [None]:
# Predict directly from raw test text; the pipeline applies the vectorizer itself.
y_pred = clf.predict(X_test)

# Same report as for the manually vectorized model, for comparison.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

