<h3>Bag of words </h3>
<p> <li> It counts the number of times a word of the vocabulary appears in the article or text. 
    <li> Vocabulary is the unique count of all the words in your training dataset
</p>

<img src="bOw.png" width="700" height="200">

<h3> Limitations </h3>
<p> <li> Sparse representation : the count vector can be too long with many null values
    <li> Doesn't capture meaning of words properly
</p>

In [80]:
import pandas as pd
import numpy as np

In [94]:
df = pd.read_csv("movies_sentiment_data.csv",  nrows=4000)
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [95]:
# count number of positive and negative sentiments

df.sentiment.value_counts()

sentiment
positive    2028
negative    1972
Name: count, dtype: int64

In [96]:
df.shape

(4000, 2)

In [97]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,1
1,I enjoyed the movie and the story immensely! I...,1
2,I had a hard time sitting through this. Every ...,0
3,It's hard to imagine that anyone could find th...,0
4,This is one military drama I like a lot! Tom B...,1


In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment, test_size=0.2)

In [99]:
print("X train:", X_train.shape)
print("X test:", X_test.shape)

X train: (3200,)
X test: (800,)


In [100]:
# converts to numpy array

type(X_train.values)

numpy.ndarray

In [113]:
# to generate count vector using bag of words

from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)

In [102]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [103]:
X_train_cv.shape

(3200, 31517)

In [104]:
# to get all the words my vocabulary has

v.get_feature_names_out()[1000:1030]

array(['akkaya', 'aksar', 'akshay', 'al', 'ala', 'alack', 'alaimo',
       'alain', 'alamos', 'alan', 'alarm', 'alarming', 'alarmingly',
       'alarms', 'alas', 'alaska', 'alastair', 'alba', 'albany', 'albeit',
       'alberson', 'albert', 'alberto', 'albinos', 'albright', 'album',
       'albums', 'alcatraz', 'alchemy', 'alcock'], dtype=object)

In [105]:
# to get all the methods we can have for v

dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [108]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [109]:
np.where(X_train_np[0] != 0)

(array([  636,   647,   765,  1086,  1317,  1357,  1691,  1832,  1970,
         2035,  2293,  2674,  2690,  2938,  3015,  3781,  4149,  4390,
         4663,  5242,  5609,  5613,  5657,  7764,  7909,  8216,  9416,
         9693,  9772, 10005, 10367, 10410, 10490, 10587, 11233, 12514,
        12839, 13333, 13882, 13890, 13894, 14062, 14224, 14480, 14482,
        14667, 14828, 14862, 16026, 16193, 16369, 16533, 17813, 18044,
        18690, 19549, 19657, 19673, 19791, 21070, 21157, 21235, 21333,
        21411, 22599, 22664, 23399, 24302, 24373, 24543, 24667, 24673,
        24683, 25009, 25490, 27486, 28070, 28166, 28408, 28429, 28549,
        29923, 30480, 30511, 30547, 30727, 30735, 30968, 31176, 31302,
        31366], dtype=int64),)

In [111]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [114]:
X_test_cv = v.transform(X_test)

In [119]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       386
           1       0.85      0.82      0.83       414

    accuracy                           0.83       800
   macro avg       0.83      0.83      0.83       800
weighted avg       0.83      0.83      0.83       800



In [137]:
reviews = [
    "I don't like that movie. It is all bad and vulgar.",
    "Anime is really good."
]

reviews_count = v.transform(reviews)
y_pred = model.predict(reviews_count)

In [138]:
y_pred = ['positive' if x == [1] else 'negative' for x in y_pred]
y_pred

['negative', 'positive']

In [140]:
# using pipeline to do the things we were doing manually above

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [141]:
clf.fit(X_train, y_train)

In [144]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       386
           1       0.85      0.82      0.83       414

    accuracy                           0.83       800
   macro avg       0.83      0.83      0.83       800
weighted avg       0.83      0.83      0.83       800

