## NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

2166                        I'm not coming home 4 dinner.
5359    This is ur face test ( 1 2 3 4 5 6 7 8 9  &lt;...
4495    Man this bus is so so so slow. I think you're ...
2508                                               Yup...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

2166    0
5359    0
4495    0
2508    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train)
X_train_cv

<4457x7742 sparse matrix of type '<class 'numpy.int64'>'
	with 59266 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
X_train_cv.shape

(4457, 7742)

In [24]:
v.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'zogtorius', 'zyada', 'ú1'],
      dtype=object)

In [19]:
v.vocabulary_

{'not': 4825,
 'coming': 1927,
 'home': 3480,
 'dinner': 2342,
 'this': 6843,
 'is': 3756,
 'ur': 7199,
 'face': 2759,
 'test': 6771,
 'lt': 4252,
 'gt': 3267,
 'select': 5982,
 'any': 992,
 'number': 4854,
 'will': 7523,
 'tell': 6742,
 'astrology': 1111,
 'am': 934,
 'waiting': 7363,
 'quick': 5520,
 'reply': 5710,
 'man': 4335,
 'bus': 1577,
 'so': 6268,
 'slow': 6220,
 'think': 6834,
 'you': 7706,
 're': 5579,
 'gonna': 3195,
 'get': 3131,
 'there': 6821,
 'before': 1302,
 'me': 4403,
 'yup': 7730,
 'do': 2390,
 'want': 7384,
 'bold': 1425,
 'or': 4968,
 'bb': 1256,
 'torch': 6986,
 'no': 4790,
 'to': 6929,
 'be': 1274,
 'nosy': 4824,
 'guess': 3277,
 'idk': 3608,
 'over': 5019,
 'reacting': 5584,
 'if': 3611,
 'freaked': 2996,
 'hi': 3434,
 'roger': 5803,
 'from': 3033,
 'cl': 1836,
 'how': 3527,
 'are': 1048,
 'when': 7485,
 'rimac': 5779,
 'access': 776,
 'pick': 5187,
 'up': 7186,
 'bout': 1457,
 '30ish': 435,
 'what': 7479,
 'time': 6893,
 'and': 958,
 'that': 6801,
 'going': 

In [25]:
X_train_np = X_train_cv.toarray()
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

<h3>Train the naive bayes model</h3>

In [26]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [27]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [28]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       974
           1       0.98      0.89      0.94       141

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [29]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [30]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       974
           1       0.98      0.89      0.94       141

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

