<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [8]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [11]:
X_train.shape

(4457,)

In [12]:
X_test.shape

(1115,)

In [47]:
X_train.values

array(['BangBabes Ur order is on the way. U SHOULD receive a Service Msg 2 download UR content. If U do not, GoTo wap. bangb. tv on UR mobile internet/service menu',
       "Not really dude, have no friends i'm afraid :(",
       'Have a safe trip to Nigeria. Wish you happiness and very soon company to share moments with',
       ..., 'No. I.ll meet you in the library',
       'Just sleeping..and surfing',
       'A swt thought: "Nver get tired of doing little things 4 lovable persons.." Coz..somtimes those little things occupy d biggest part in their Hearts.. Gud ni8'],
      shape=(4457,), dtype=object)

In [13]:
type(X_train)

pandas.core.series.Series

In [14]:
X_train[:4]

165     BangBabes Ur order is on the way. U SHOULD rec...
318        Not really dude, have no friends i'm afraid :(
5552    Have a safe trip to Nigeria. Wish you happines...
3518    So you think i should actually talk to him? No...
Name: Message, dtype: object

In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
y_train[:4]

165     1
318     0
5552    0
3518    0
Name: spam, dtype: int64

In [17]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59438 stored elements and shape (4457, 7744)>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7744,))

In [21]:
X_train_cv.shape

(4457, 7744)

In [48]:
v.get_feature_names_out()[1995:2050]

array(['content', 'contented', 'contention', 'contents', 'continent',
       'continue', 'continued', 'contract', 'contribute', 'control',
       'convenience', 'conversations', 'converter', 'convey', 'conveying',
       'convince', 'convinced', 'convincing', 'cook', 'cooked', 'cookies',
       'cooking', 'cool', 'cooped', 'cooperative', 'copied', 'copies',
       'coping', 'cops', 'copy', 'corect', 'cornwall', 'corrct',
       'correct', 'correction', 'correctly', 'cos', 'cosign', 'cost',
       'costa', 'costing', 'costs', 'costume', 'costumes', 'cougar',
       'cough', 'coughing', 'could', 'coulda', 'couldn', 'count',
       'countin', 'countinlots', 'country', 'coupla'], dtype=object)

In [49]:
dir(v)

['_CountVectorizer__metadata_request__fit',
 '_CountVectorizer__metadata_request__transform',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_r

In [23]:
v.vocabulary_

{'bangbabes': 1234,
 'ur': 7204,
 'order': 5005,
 'is': 3784,
 'on': 4959,
 'the': 6810,
 'way': 7416,
 'should': 6121,
 'receive': 5629,
 'service': 6034,
 'msg': 4655,
 'download': 2447,
 'content': 1995,
 'if': 3636,
 'do': 2390,
 'not': 4863,
 'goto': 3239,
 'wap': 7388,
 'bangb': 1233,
 'tv': 7083,
 'mobile': 4579,
 'internet': 3747,
 'menu': 4487,
 'really': 5615,
 'dude': 2496,
 'have': 3393,
 'no': 4831,
 'friends': 3035,
 'afraid': 862,
 'safe': 5880,
 'trip': 7040,
 'to': 6931,
 'nigeria': 4812,
 'wish': 7540,
 'you': 7704,
 'happiness': 3370,
 'and': 978,
 'very': 7277,
 'soon': 6315,
 'company': 1935,
 'share': 6067,
 'moments': 4602,
 'with': 7547,
 'so': 6277,
 'think': 6841,
 'actually': 819,
 'talk': 6697,
 'him': 3467,
 'call': 1614,
 'his': 3474,
 'boss': 1457,
 'in': 3677,
 'morning': 4625,
 'went': 7463,
 'this': 6850,
 'place': 5246,
 'last': 4055,
 'year': 7678,
 'he': 3402,
 'told': 6947,
 'me': 4441,
 'where': 7487,
 'could': 2042,
 'go': 3195,
 'get': 3150,
 'm

In [32]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(7744,))

In [50]:
np.where(X_train_np[0]!=0)  # 1st email => coulumns where there are non-zero values

(array([1233, 1234, 1995, 2390, 2447, 3239, 3636, 3747, 3784, 4487, 4579,
        4655, 4863, 4959, 5005, 5629, 6034, 6121, 6810, 7083, 7204, 7388,
        7416]),)

In [54]:
X_train[:4]

165     BangBabes Ur order is on the way. U SHOULD rec...
318        Not really dude, have no friends i'm afraid :(
5552    Have a safe trip to Nigeria. Wish you happines...
3518    So you think i should actually talk to him? No...
Name: Message, dtype: object

In [56]:
X_train_np[0][1233] # In 1st email at 1233 we have non zero value in the vector

np.int64(1)

<h3>Train the naive bayes model</h3>

In [40]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [41]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [42]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       978
           1       0.98      0.92      0.95       137

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [43]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [44]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [45]:
clf.fit(X_train, y_train)

In [46]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       978
           1       0.98      0.92      0.95       137

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

