# NLP intro

In [1]:
import numpy as np

In [15]:
review1 = "I LOVE this book about love"
review2 = "No this book was okay"

all_words = [text.lower().split() for text in [review1,review2]]
print(all_words)

#flatten the list 
all_words = [word for text in all_words for word in text]
print(f"Flattened all words : {all_words}")

unique_words = set(all_words)
print(f"Unique words : {unique_words}")

[['i', 'love', 'this', 'book', 'about', 'love'], ['no', 'this', 'book', 'was', 'okay']]
Flattened all words : ['i', 'love', 'this', 'book', 'about', 'love', 'no', 'this', 'book', 'was', 'okay']
Unique words : {'no', 'i', 'book', 'okay', 'love', 'was', 'about', 'this'}


In [27]:
#make a dictionnary of all words 
vocabulary = {word:index for index,word in enumerate(unique_words)}
print(vocabulary)

def term_frequency_vectorizer(phrase,vocabulary):
    term_frequency = np.zeros(len(vocabulary))

    for word in phrase.lower().split():
        index = vocabulary[word]
        term_frequency[index]+=1

    return term_frequency

{'no': 0, 'i': 1, 'book': 2, 'okay': 3, 'love': 4, 'was': 5, 'about': 6, 'this': 7}


In [29]:
review1_term_freq = term_frequency_vectorizer(review1,vocabulary)
review2_term_freq = term_frequency_vectorizer(review2,vocabulary)

print(review1)
print(review1_term_freq)
print(review2)
print(review2_term_freq)

I LOVE this book about love
[0. 1. 1. 0. 2. 0. 1. 1.]
No this book was okay
[1. 0. 1. 1. 0. 1. 0. 1.]


In [32]:
import pandas as pd 

bag_of_words = pd.DataFrame([review1_term_freq,review2_term_freq],columns=vocabulary.keys(),dtype="int32")

bag_of_words


Unnamed: 0,no,i,book,okay,love,was,about,this
0,0,1,1,0,2,0,1,1
1,1,0,1,1,0,1,0,1


# Feature extraction with sklearn

In [37]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer

count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1,review2])
bag_of_words_sparse.todense(), count_vectorizer.get_feature_names_out()

(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1]]),
 array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object))

In [39]:
bag_of_words = pd.DataFrame(bag_of_words_sparse.todense(), columns= count_vectorizer.get_feature_names_out())
bag_of_words

Unnamed: 0,about,book,love,no,okay,this,was
0,1,1,2,0,0,1,0
1,0,1,0,1,1,1,1


# TF-IDF

In [40]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bag_of_words_sparse)

tfidf

<2x7 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [41]:
tfidf.todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])

In [44]:
# create tfidf vector in one go
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform([review1,review2]).todense()


matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])

In [45]:
tfidf_vectorizer.get_feature_names_out()

array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object)

In [46]:
review1

'I LOVE this book about love'