# NLP intro

In [35]:
import numpy as np


review1 = "I love this book about love"
review2 = "No this book was okay"
review3 = "No no no no no"

all_words = [text.lower().split() for text in [review1, review2]]
print(all_words)

[['i', 'love', 'this', 'book', 'about', 'love'], ['no', 'this', 'book', 'was', 'okay']]


In [36]:
# flatten from 2D to 1D
all_words = [word for text in all_words for word in text]
print(all_words)

['i', 'love', 'this', 'book', 'about', 'love', 'no', 'this', 'book', 'was', 'okay']


In [37]:
unique_words = set(all_words)
unique_words

{'about', 'book', 'i', 'love', 'no', 'okay', 'this', 'was'}

In [38]:
vocabulary = {word: index for index, word in enumerate(unique_words)}
print(vocabulary)

{'i': 0, 'okay': 1, 'this': 2, 'book': 3, 'was': 4, 'about': 5, 'no': 6, 'love': 7}


In [39]:

def term_freq_vectorizer(document, vocabulary = vocabulary):
    term_freq = np.zeros(len(vocabulary))

    for word in document.lower().split():
        index = vocabulary[word]
        term_freq[index] += 1

    return term_freq


print("vocabulary")
print(review1)
print(review2)
review1_term_freq = term_freq_vectorizer(review1)
review2_term_freq = term_freq_vectorizer(review2)

review1_term_freq, review2_term_freq

vocabulary
I love this book about love
No this book was okay


(array([1., 0., 1., 1., 0., 1., 0., 2.]),
 array([0., 1., 1., 1., 1., 0., 1., 0.]))

In [40]:
import pandas as pd

bag_of_words = pd.DataFrame([review1_term_freq, review2_term_freq], columns=vocabulary.keys())
bag_of_words

Unnamed: 0,i,okay,this,book,was,about,no,love
0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0
1,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


## bag of words - sklearn

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1, review2, review3])
bag_of_words_sparse.todense(), count_vectorizer.get_feature_names_out()

(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1],
         [0, 0, 0, 5, 0, 0, 0]], dtype=int64),
 array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object))

In [42]:
bag_of_words = pd.DataFrame(bag_of_words_sparse.todense(), columns=count_vectorizer.get_feature_names_out())
bag_of_words

Unnamed: 0,about,book,love,no,okay,this,was
0,1,1,2,0,0,1,0
1,0,1,0,1,1,1,1
2,0,0,0,5,0,0,0


## TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform([review1, review2, review3]).todense()

matrix([[0.40301621, 0.30650422, 0.80603242, 0.        , 0.        ,
         0.30650422, 0.        ],
        [0.        , 0.3935112 , 0.        , 0.3935112 , 0.51741994,
         0.3935112 , 0.51741994],
        [0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        ]])