## Bag of Words

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize

In [2]:
# Three short movie reviews that serve as the toy corpus for this section.
# NOTE(review): review_1 starts with a capitalised 'The' while the other reviews
# use lowercase 'the'. The hand-built bag of words below is case-sensitive, so
# both spellings end up as separate vocabulary entries.
review_1 = 'The movie was good and we really like it'
review_2 = 'the movie was good but the ending was boring'
review_3 = 'we did not like the movie as it was too lengthy'

In [3]:
# Tokenise each review with NLTK; every *_tokens name is bound to the list of
# word tokens of the matching review.
review_1_tokens, review_2_tokens, review_3_tokens = (
    word_tokenize(text) for text in (review_1, review_2, review_3)
)

In [4]:
# Vocabulary: every distinct token that occurs in at least one of the reviews.
review_tokens = set(review_1_tokens) | set(review_2_tokens) | set(review_3_tokens)

In [5]:
# Vocabulary size: the number of distinct (case-sensitive) tokens across all reviews.
len(review_tokens)

18

In [6]:
review_tokens

{'The',
 'and',
 'as',
 'boring',
 'but',
 'did',
 'ending',
 'good',
 'it',
 'lengthy',
 'like',
 'movie',
 'not',
 'really',
 'the',
 'too',
 'was',
 'we'}

In [7]:
# Give every review its own zero-initialised counter keyed by the shared vocabulary.
review1_dict = {term: 0 for term in review_tokens}
review2_dict = {term: 0 for term in review_tokens}
review3_dict = {term: 0 for term in review_tokens}

In [8]:
review1_dict

{'The': 0,
 'really': 0,
 'but': 0,
 'good': 0,
 'it': 0,
 'as': 0,
 'did': 0,
 'the': 0,
 'movie': 0,
 'we': 0,
 'was': 0,
 'and': 0,
 'not': 0,
 'like': 0,
 'ending': 0,
 'lengthy': 0,
 'too': 0,
 'boring': 0}

In [9]:
# Fill each review's term-count dictionary from that review's tokens.
# The counters are mutated in place, so they are zeroed first: without the reset,
# re-running this cell would add the same tokens again and inflate every count.
for term_counts, doc_tokens in (
    (review1_dict, review_1_tokens),
    (review2_dict, review_2_tokens),
    (review3_dict, review_3_tokens),
):
    # Assigning to existing keys does not resize the dict, so this is safe
    # while iterating over it.
    for term in term_counts:
        term_counts[term] = 0
    for token in doc_tokens:
        term_counts[token] += 1

In [10]:
review1_dict

{'The': 1,
 'really': 1,
 'but': 0,
 'good': 1,
 'it': 1,
 'as': 0,
 'did': 0,
 'the': 0,
 'movie': 1,
 'we': 1,
 'was': 1,
 'and': 1,
 'not': 0,
 'like': 1,
 'ending': 0,
 'lengthy': 0,
 'too': 0,
 'boring': 0}

In [11]:
# Stack the three per-review count dictionaries into a table: one row per
# review, one column per vocabulary term.
review_dict_df = pd.DataFrame([review1_dict,review2_dict,review3_dict])

#### DTM (Document Term Matrix)

In [12]:
review_dict_df

Unnamed: 0,The,really,but,good,it,as,did,the,movie,we,was,and,not,like,ending,lengthy,too,boring
0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,0,0,0,0
1,0,0,1,1,0,0,0,2,1,0,2,0,0,0,1,0,0,1
2,0,0,0,0,1,1,1,1,1,1,1,0,1,1,0,1,1,0


## Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# The vectorizers below take the raw review strings (not the NLTK tokens) as input.
review_list = [review_1, review_2, review_3]

In [15]:
review_list

['The movie was good and we really like it',
 'the movie was good but the ending was boring',
 'we did not like the movie as it was too lengthy']

In [16]:
# Count vectorizer with all default settings.
vec = CountVectorizer()

In [17]:
# Learn the vocabulary from the reviews and count each term per review in one step.
# NOTE(review): this name is capitalised, unlike the lowercase `counts` used in a
# later cell; they are two different variables.
Counts = vec.fit_transform(review_list)

In [18]:
# Convert the sparse count matrix to a dense NumPy array so it can be displayed.
Counts.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 2, 0, 2, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [19]:
# fit_transform returns a SciPy sparse matrix rather than a dense array.
type(Counts)

scipy.sparse.csr.csr_matrix

In [20]:
# Vocabulary learned by the vectorizer; it is used below as the column labels
# of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement `get_feature_names_out()` returns a NumPy array, so convert it
# to a plain list of strings to keep the displayed output a list.
feat_names = vec.get_feature_names_out().tolist()
feat_names

['and',
 'as',
 'boring',
 'but',
 'did',
 'ending',
 'good',
 'it',
 'lengthy',
 'like',
 'movie',
 'not',
 'really',
 'the',
 'too',
 'was',
 'we']

In [21]:
# Label the dense count matrix with the learned vocabulary so each column reads
# as a term and each row as a review.
a = pd.DataFrame(Counts.toarray(), columns=feat_names)
a

Unnamed: 0,and,as,boring,but,did,ending,good,it,lengthy,like,movie,not,really,the,too,was,we
0,1,0,0,0,0,0,1,1,0,1,1,0,1,1,0,1,1
1,0,0,1,1,0,1,1,0,0,0,1,0,0,2,0,2,0
2,0,1,0,0,1,0,0,1,1,1,1,1,0,1,1,1,1


In [22]:
# Repeat the count vectorisation, this time dropping scikit-learn's built-in
# English stop words.
vec = CountVectorizer(stop_words='english')
counts = vec.fit_transform(review_list)
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# is its replacement. `.tolist()` turns the returned NumPy array into a plain list.
feat_names = vec.get_feature_names_out().tolist()
# Dense, labelled view of the counts: one row per review, one column per term.
a = pd.DataFrame(counts.toarray(), columns=feat_names)
a

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0,0,0,1,0,1,1,1
1,1,0,1,1,0,0,1,0
2,0,1,0,0,1,1,1,0


## TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# TF-IDF vectorizer that lowercases the text and removes English stop words;
# min_df=1 keeps every term that appears in at least one document.
vect = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')

In [25]:
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [26]:
# Learn the vocabulary and IDF weights from the reviews and compute their TF-IDF
# matrix (norm='l2' per the repr above); the last line displays it as a dense array.
matrix = vect.fit_transform(review_list)
matrix.toarray()

array([[0.        , 0.        , 0.        , 0.4804584 , 0.        ,
        0.4804584 , 0.37311881, 0.63174505],
       [0.5844829 , 0.        , 0.5844829 , 0.44451431, 0.        ,
        0.        , 0.34520502, 0.        ],
       [0.        , 0.5844829 , 0.        , 0.        , 0.5844829 ,
        0.44451431, 0.34520502, 0.        ]])

In [27]:
# Terms kept by the TF-IDF vectorizer; used below as the column labels of the
# TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement `get_feature_names_out()` returns a NumPy array, so convert it
# to a plain list of strings to keep the displayed output a list.
tf_names = vect.get_feature_names_out().tolist()
tf_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [28]:
# The TF-IDF result is also a SciPy sparse matrix.
type(matrix)

scipy.sparse.csr.csr_matrix

In [29]:
# Dense, labelled view of the TF-IDF weights: one row per review, one column per term.
df = pd.DataFrame(matrix.toarray(), columns=tf_names)
df

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0.0,0.0,0.0,0.480458,0.0,0.480458,0.373119,0.631745
1,0.584483,0.0,0.584483,0.444514,0.0,0.0,0.345205,0.0
2,0.0,0.584483,0.0,0.0,0.584483,0.444514,0.345205,0.0


## Cosine Similarity

In [30]:
# Sample documents for the similarity demo: doc1 is about natural language
# processing, doc2 is about cricket.
doc1 = 'Natural Language processing is the study of making a machine understand and generate languages like humans'
doc2 = 'Cricket is a sport played with bat and ball. It is not played in many countries'

In [31]:
# Two more sample documents: doc3 is about languages (like doc1), doc4 is about
# football (a sport, like doc2).
# The misspelling 'cornerdtone' in doc3 is corrected to 'cornerstone'.
doc3 = 'Languages are the cornerstone of human evolution. Making a machine study languages is not easy'
doc4 = 'Football is a sport played in almost all countries. It is played by kicking a ball'

In [32]:
# Order matters: the similarity cells below address the documents by row
# position (doc1 -> row 0, doc2 -> row 1, doc3 -> row 2, doc4 -> row 3).
documents = [doc1,doc2,doc3,doc4]

In [33]:
# NOTE(review): this rebinds `vec`, which earlier cells used for a CountVectorizer;
# from here on `vec` is a TF-IDF vectorizer with English stop-word removal.
vec = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')

In [34]:
# NOTE(review): this rebinds `matrix`, which previously held the TF-IDF matrix of
# the movie reviews; it now holds one TF-IDF row per entry of `documents`.
matrix = vec.fit_transform(documents)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity as cs

In [36]:
# doc1 vs doc3: both are about languages, so a non-zero similarity is expected.
# The one-row slices (0:1, 2:3) select a single document each.
cs(matrix[0:1], matrix[2:3])

array([[0.34918271]])

In [39]:
# doc2 vs doc4: both describe a sport (cricket and football).
cs(matrix[1:2], matrix[3:4])

array([[0.68509634]])

In [38]:
# doc2 (cricket) vs doc3 (languages): unrelated topics.
cs(matrix[1:2], matrix[2:3])

array([[0.]])

In [40]:
# doc1 (natural language processing) vs doc4 (football): unrelated topics.
cs(matrix[0:1], matrix[3:4])

array([[0.]])