### NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [2]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("/content/spam.csv", encoding='latin-1')

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
# df.drop(columns= 'Unnamed: 2',inplace=True)

In [10]:
# df.rename(columns={'v1':'Category'},inplace=True)

In [11]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
# def get_spam_number(x):
#   if x == 'spam':
#     return 1
#   return 0

In [12]:
df['spam']=df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [13]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Train test split

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam,test_size=0.2)

In [16]:
X_train.shape

(4457,)

In [17]:
X_test.shape

(1115,)

In [18]:
type(X_train)

pandas.core.series.Series

In [21]:
X_train[:4]

4943    Wrong phone! This phone! I answer this one but...
2568    Hey. For me there is no leave on friday. Wait ...
1783    No dear i do have free messages without any re...
3420    Welcome! Please reply with your AGE and GENDER...
Name: Message, dtype: object

In [22]:
type(y_train)

pandas.core.series.Series

In [23]:
y_train[:4]

4943    0
2568    0
1783    0
3420    1
Name: spam, dtype: int64

In [26]:
type(X_train.values)

numpy.ndarray

### Create bag of words representation using CountVectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7774 sparse matrix of type '<class 'numpy.int64'>'
	with 59427 stored elements in Compressed Sparse Row format>

In [31]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
X_train_cv.shape

(4457, 7774)

In [35]:
v.get_feature_names_out().shape

(7774,)

In [41]:
v.get_feature_names_out()[3062]

'ga'

In [39]:
v.vocabulary_

{'wrong': 7640,
 'phone': 5206,
 'this': 6886,
 'answer': 987,
 'one': 4955,
 'but': 1571,
 'assume': 1115,
 'the': 6846,
 'other': 5016,
 'is': 3747,
 'people': 5163,
 'don': 2402,
 'well': 7472,
 'hey': 3419,
 'for': 2947,
 'me': 4412,
 'there': 6863,
 'no': 4807,
 'leave': 4075,
 'on': 4949,
 'friday': 3006,
 'wait': 7375,
 'will': 7536,
 'ask': 1099,
 'my': 4677,
 'superior': 6635,
 'and': 963,
 'tell': 6781,
 'you': 7720,
 'dear': 2195,
 'do': 2372,
 'have': 3358,
 'free': 2990,
 'messages': 4464,
 'without': 7565,
 'any': 998,
 'recharge': 5652,
 'hi': 3424,
 'welcome': 7470,
 'please': 5268,
 'reply': 5736,
 'with': 7562,
 'your': 7724,
 'age': 862,
 'gender': 3108,
 'to': 6967,
 'begin': 1305,
 '24m': 373,
 'thk': 6887,
 'dun': 2488,
 'haf': 3298,
 'hint': 3438,
 'in': 3640,
 'forum': 2969,
 'already': 926,
 'lor': 4218,
 'cos': 2028,
 'told': 6983,
 'ron': 5837,
 'darren': 2171,
 'going': 3177,
 'shuhui': 6159,
 'all': 913,
 'day': 2183,
 'working': 7607,
 'except': 2697,
 'sa

In [47]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [57]:
np.where(X_train_np[0]!=0)

(array([ 987, 1115, 1571, 2402, 3747, 4955, 5016, 5163, 5206, 6846, 6886,
        7472, 7640]),)

In [52]:
X_train[:4][1783]

'No dear i do have free messages without any recharge. Hi hi hi'

In [63]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

### Train the naive bayes model

In [71]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [74]:
X_test_cv = v.transform(X_test)

### Evaluate Performance

In [75]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       959
           1       0.96      0.94      0.95       156

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [76]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

### Train the model using sklearn pipeline and reduce number of lines of code

In [77]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [79]:
clf.fit(X_train,y_train)

In [80]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       959
           1       0.96      0.94      0.95       156

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

