# Text Representation - Bag of Words (BoW)


In [1]:
import pandas as pd
import numpy as np

In [8]:
# Load only the label (v1) and message text (v2) columns of the spam dataset.
df = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
    usecols=["v1", "v2"],
)
df.head()


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Class balance: how many ham vs. spam messages are there?
df["v1"].value_counts()


ham     4825
spam     747
Name: v1, dtype: int64

In [11]:
# Binary target: 1 for spam, 0 for everything else. A vectorised comparison
# replaces the row-wise apply; the explicit int64 cast keeps the same dtype.
df['spam'] = (df['v1'] == 'spam').astype('int64')


In [12]:
# Sanity check: the two loaded columns plus the new `spam` column give 3 columns.
df.shape

(5572, 3)

In [13]:
# Confirm the 0/1 values in `spam` line up with the ham/spam text labels in `v1`.
df.head()

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Train test split


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. The seed is fixed so the split,
# and every number derived from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df.v2, df.spam, test_size=0.2, random_state=42
)

In [15]:
# With test_size=0.2, roughly 80% of the rows end up in the training split.
X_train.shape

(4457,)

In [16]:
# The remaining ~20% of the rows are held out for evaluation.
X_test.shape

(1115,)

In [17]:
# The label vector must have exactly one entry per training message.
y_train.shape

(4457,)

In [20]:
# Peek at the first four training messages (positional slice; the index
# labels are the shuffled row numbers from the original frame).
X_train.iloc[:4]


4418    How have your little darlings been so far this...
886     I like to talk pa but am not able to. I dont k...
1247    Horrible gal. Me in sch doing some stuff. How ...
1330                          Aight no rush, I'll ask jay
Name: v2, dtype: object

In [36]:
# Full text of the first training message. Select it by position: the row
# labels in X_train depend on the shuffle, so a hard-coded label such as 4418
# is only present by chance and raises KeyError otherwise.
X_train.iloc[0]

"How have your little darlings been so far this week? Need a coffee run tomo?Can't believe it's that time of week already \x89Û_"

In [22]:
# Labels of the same first four training messages (positional slice).
y_train.iloc[:4]


4418    0
886     0
1247    0
1330    0
Name: spam, dtype: int64

In [23]:
# `X_train.values` is what gets passed to the vectorizer below; check its type.
type(X_train.values)


numpy.ndarray

# Create bag of words representation using CountVectorizer


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings.
v = CountVectorizer()

# fit_transform learns the vocabulary from the training texts only and returns
# their per-message token counts as a sparse matrix.
X_train_cv = v.fit_transform(X_train.values)
X_train_cv


<4457x7685 sparse matrix of type '<class 'numpy.int64'>'
	with 59073 stored elements in Compressed Sparse Row format>

In [25]:
# Dense view for inspection only: toarray() materialises every zero of the
# sparse matrix, so avoid it on large corpora.
X_train_cv.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Count vector of the first training message. Slice the sparse matrix first so
# only that one row is densified instead of the whole document-term matrix.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
# (number of training messages, size of the learned vocabulary)
X_train_cv.shape


(4457, 7685)

In [30]:
# Keep a dense copy so single rows/columns can be inspected with plain NumPy
# indexing in the next cells.
X_train_np = X_train_cv.toarray()
# Count vector of the first training message.
X_train_np[0]



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
# Column indices of the vocabulary terms that occur in the first training message.
np.nonzero(X_train_np[0])


(array([ 920, 1287, 1305, 1636, 1887, 2177, 2778, 3340, 3508, 3748, 4137,
        4690, 4856, 5802, 6223, 6739, 6784, 6833, 6888, 7355, 7632, 7676],
       dtype=int64),)

In [33]:
# Show the first training message again to compare its words with the non-zero
# columns found above. Positional access is used because the row label of that
# message changes with the shuffle; a hard-coded 4418 raises KeyError on re-run.
X_train.iloc[0]


"How have your little darlings been so far this week? Need a coffee run tomo?Can't believe it's that time of week already \x89Û_"

In [34]:
# Count stored for vocabulary column 1305 in the first training message
# (1305 is one of the non-zero positions reported by np.where above).
X_train_np[0, 1305]

1

# Train the Naive Bayes model


In [38]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes is fitted directly on the sparse word-count features.
model = MultinomialNB()
model.fit(X_train_cv, y_train)


MultinomialNB()

In [39]:
# transform (not fit_transform): reuse the vocabulary learned from the training
# set so the test columns line up with the features the model was fitted on.
X_test_cv = v.transform(X_test)


# Evaluate Performance


In [40]:
from sklearn.metrics import classification_report

# Predict on the held-out messages.
y_pred = model.predict(X_test_cv)

# classification_report returns a plain-text table, so print() is used to render it.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       971
           1       0.95      0.92      0.93       144

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [41]:
# Two hand-written messages to spot-check the model: the first reads like a
# personal note, the second like a promotion.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# New text must go through the already-fitted vectorizer before prediction.
emails_count = v.transform(emails)
model.predict(emails_count)


array([0, 1], dtype=int64)

In [42]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into a single estimator so raw text
# can be passed straight to fit/predict.
clf = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)


In [43]:
# The pipeline is fitted on the raw message strings; vectorization happens inside it.
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [44]:
# Predict straight from raw test text; no separate v.transform step is needed here.
y_pred = clf.predict(X_test)

# Same report as for the manually vectorized model, for comparison.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       971
           1       0.95      0.92      0.93       144

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

