Various approaches of converting text into Vector

1. Label Encoding
2. One hot encoding
3. Bag of words
4. TF-IDF4
5. Word Embeddings

# Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam_dataset.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [11]:
df.shape

(5572, 3)

In [6]:
df['spam']  = df.Category.apply(lambda x: 1 if x == 'spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
#Train test split 
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.3)

In [14]:
x_train.shape

(3900,)

In [17]:
x_test.shape

(1672,)

In [19]:
x_train[:4]

661     Under the sea, there lays a rock. In the rock,...
1306    Designation is software developer and may be s...
1484    Purity of friendship between two is not about ...
5306    Ill be at yours in about 3 mins but look out f...
Name: Message, dtype: object

In [20]:
y_train[:4]

661     0
1306    0
1484    0
5306    0
Name: spam, dtype: int64

In [26]:
# Create bag of words representation using CountVectorizer

# Convert a collection of text documents to a matrix of token counts.

from sklearn.feature_extraction.text import CountVectorizer

In [27]:
x_train.values

array(["Under the sea, there lays a rock. In the rock, there is an envelope. In the envelope, there is a paper. On the paper, there are 3 words... '",
       'Designation is software developer and may be she get chennai:)',
       'Purity of friendship between two is not about smiling after reading the forwarded message..Its about smiling just by seeing the name. Gud evng musthu',
       ..., 'Yavnt tried yet and never played original either',
       "Okay. I've seen it. So i should pick it on friday?",
       'Ah poop. Looks like ill prob have to send in my laptop to get fixed cuz it has a gpu problem'],
      dtype=object)

In [28]:
v = CountVectorizer()

x_train_cv = v.fit_transform(x_train.values)
x_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
x_train_cv.shape

(3900, 7194)

In [34]:
v.get_feature_names_out()[1000:1050]

array(['arestaurant', 'aretaking', 'areyouunique', 'argh', 'argue',
       'arguing', 'argument', 'arise', 'arithmetic', 'arm', 'armand',
       'arms', 'arng', 'arngd', 'arnt', 'around', 'aroundn', 'arr',
       'arrange', 'arranging', 'arrested', 'arrive', 'arrived', 'art',
       'artists', 'arts', 'arty', 'arul', 'arun', 'as', 'asap', 'ashes',
       'ashley', 'ashwini', 'asian', 'asjesus', 'ask', 'askd', 'asked',
       'askin', 'asking', 'asks', 'asleep', 'asp', 'ass', 'assessment',
       'asshole', 'assistance', 'associate', 'asssssholeeee'],
      dtype=object)

In [36]:
v.vocabulary_

{'under': 6642,
 'the': 6335,
 'sea': 5552,
 'there': 6349,
 'lays': 3807,
 'rock': 5406,
 'in': 3433,
 'is': 3530,
 'an': 915,
 'envelope': 2467,
 'paper': 4741,
 'on': 4618,
 'are': 997,
 'words': 7062,
 'designation': 2124,
 'software': 5829,
 'developer': 2142,
 'and': 919,
 'may': 4135,
 'be': 1202,
 'she': 5651,
 'get': 2935,
 'chennai': 1664,
 'purity': 5129,
 'of': 4583,
 'friendship': 2831,
 'between': 1260,
 'two': 6594,
 'not': 4527,
 'about': 720,
 'smiling': 5801,
 'after': 811,
 'reading': 5220,
 'forwarded': 2790,
 'message': 4185,
 'its': 3549,
 'just': 3649,
 'by': 1501,
 'seeing': 5571,
 'name': 4401,
 'gud': 3073,
 'evng': 2528,
 'musthu': 4377,
 'ill': 3403,
 'at': 1057,
 'yours': 7169,
 'mins': 4226,
 'but': 1486,
 'look': 3947,
 'out': 4682,
 'for': 2767,
 'me': 4141,
 'here': 3209,
 'got': 3015,
 'lots': 3967,
 'hair': 3103,
 'dresser': 2308,
 'fr': 2798,
 'china': 1680,
 'freemsg': 2815,
 'claim': 1704,
 'ur': 6697,
 '250': 343,
 'sms': 5808,
 'messages': 4187,


In [39]:
# convert to numpy array

x_train_np = x_train_cv.toarray()
x_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
# Build machine learning model using naive bayes model

from sklearn.naive_bayes import MultinomialNB


In [41]:
model = MultinomialNB()

In [42]:
model.fit(x_train_cv, y_train)

MultinomialNB()

In [43]:
x_test_cv = v.transform(x_test)

In [47]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1448
           1       0.96      0.91      0.93       224

    accuracy                           0.98      1672
   macro avg       0.97      0.95      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [48]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

**using sklearn pipeline**

In [50]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])
clf

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [51]:
clf.fit(x_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [55]:
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1448
           1       0.96      0.91      0.93       224

    accuracy                           0.98      1672
   macro avg       0.97      0.95      0.96      1672
weighted avg       0.98      0.98      0.98      1672



# Bag of words: Exercises

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [65]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
dataset = pd.read_csv('movies_sentiment_data.csv')

#2. print the shape of the data
print(dataset.shape)

#3. print top 5 datapoints
dataset.head()

(19000, 2)


Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [66]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative

dataset['Category'] = dataset['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
dataset.head()

Unnamed: 0,review,sentiment,Category
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [67]:
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
dataset['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [77]:
#Do the 'train-test' splitting with test size of 20%
from sklearn.model_selection import train_test_split

x_train1, x_test1, y_train1, y_test1 = train_test_split(dataset.review, dataset.Category, test_size=0.2)

In [78]:
x_train1[0]

"I first saw Jake Gyllenhaal in Jarhead (2005) a little while back and, since then, I've been watching every one of his movies that arrives on my radar screen. Like Clive Owen, he has an intensity (and he even resembles Owen somewhat) that just oozes from the screen. I feel sure that, if he lands some meaty roles, he'll crack an Oscar one day...<br /><br />That's not to denigrate this film at all.<br /><br />It's a fine story, with very believable people (well, it's based upon the author's early shenanigans with rocketry), a great cast \x96 Chris Cooper is always good, and Laura Dern is always on my watch list \x96 with the appropriate mix of humor, pathos, excitement...and the great sound track with so many rock n roll oldies to get the feet tapping.<br /><br />But, this film had a very special significance for me: in 1957, I was the same age as Homer Hickham; like him, I looked up at the night stars to watch Sputnik as it scudded across the blackness; like Homer also, I experimented 

Exercise 1 : using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

1. use CountVectorizer for pre-processing the text.
2. use Random Forest as the classifier with estimators as 50 and criterion as entropy.
3. print the classification report.

In [79]:
#1. create a pipeline object
clf_random_forest = Pipeline([
    ('vectorizer', CountVectorizer()),                                
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))
])

#2. fit with X_train and y_train
clf_random_forest.fit(x_train1, y_train1)

#3. get the predictions for X_test and store it in y_pred
y_pred1 = clf_random_forest.predict(x_test1)

#4. print the classfication report
print(classification_report(y_test1, y_pred1))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1925
           1       0.83      0.84      0.84      1875

    accuracy                           0.84      3800
   macro avg       0.84      0.84      0.84      3800
weighted avg       0.84      0.84      0.84      3800



Exercise 2 : using sklearn pipeline module create a classification pipeline to classify the movie reviews positive or negative..

1. use CountVectorizer for pre-processing the text.
2. use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.
3. print the classification report

In [82]:
clf_KNN = Pipeline([
    ('vectorizer', CountVectorizer()),                                
     ('KNN', (KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')))
])

#2. fit with X_train and y_train
clf_KNN.fit(x_train1, y_train1)


#3. get the predictions for X_test and store it in y_pred
y_pred2 = clf_KNN.predict(x_test1)


#4. print the classfication report
print(classification_report(y_test1, y_pred2))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65      1925
           1       0.64      0.64      0.64      1875

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800



Exercise 3 : using sklearn pipeline module create a classification pipeline to classify the movie reviews positive or negative..

1. use CountVectorizer for pre-processing the text.
2. use Multinomial Naive Bayes as the classifier.
3. print the classification report

In [83]:
#1. create a pipeline object
clf_multi = Pipeline([      
     ('vectorizer', CountVectorizer()),   
      ('Multi NB', MultinomialNB())   #using the Multinomial Naive Bayes classifier 
])


#2. fit with X_train and y_train
clf_multi.fit(x_train1, y_train1)


#3. get the predictions for X_test and store it in y_pred
y_pred3 = clf_multi.predict(x_test1)


#4. print the classfication report
print(classification_report(y_test1, y_pred3))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1925
           1       0.87      0.83      0.85      1875

    accuracy                           0.86      3800
   macro avg       0.86      0.86      0.86      3800
weighted avg       0.86      0.86      0.86      3800

