In [1]:
#import some libraries
import nltk
from pprint import pprint
import csv
from typing import List
import re
from collections import Counter
import numpy as np


In [2]:
# Data Download Link
# https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data#IMDB%20Dataset.csv
# sample data:
ctr = 0
with open("IMDB Dataset.csv", newline="") as csvfile:
    movie_data = csv.reader(csvfile, delimiter=",")
    for row in movie_data:
        if ctr > 5:
            break
        print(row)
        ctr += 1


['review', 'sentiment']
["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of 

In [3]:
def simple_tokenizer(text: str) -> List[str]:
    # Looking at the text we see that <br></br> which is HTML tag for line break can be a good splitter
    # A sentence (atleast well structured) often has a full spot at the end. We use these two for word breaks
    pattern1 = re.compile("<br /><br />|\.")
    lines = re.split(pattern1, text)
    # you can break a sentence into words using whitespace based split
    tokens = []
    for line in lines:
        tokens += line.split(" ")

    # lowercase and remove any non-alphanumeric characters from tokens for normalize
    norrmalized_tokens = [re.sub(r"\W+", "", token.lower()) for token in tokens]
    return norrmalized_tokens


In [4]:
#remove stop-words
from nltk.corpus import stopwords # library 
nltk.download('stopwords')
all_stopwords = set(stopwords.words('english')) # set the language 

[nltk_data] Downloading package stopwords to C:\Users\leonel
[nltk_data]     villagomez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#  Tokenize each review using simple tokenizer
#  like before and remove stopwords
reviews_list = []
with open("IMDB Dataset.csv", newline="") as csvfile:
    movie_data = csv.reader(csvfile, delimiter=",")
    for row in movie_data:
        if row[0] == "review":
            continue
        review_text = row[0]
        tokens = simple_tokenizer(review_text)
        # Remove stopwords
        clean_tokens = [
            tokens
            for tokens in tokens
            if tokens not in all_stopwords and len(tokens) > 1
        ]
        reviews_list.append(clean_tokens)


UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 6237: character maps to <undefined>

In [None]:
# the end result is a map with key as review_id and value as review
pprint('number of reviews:' + str(len(reviews_list)))
pprint('sample row:')
print(reviews_list[0])

In [9]:
# Identify unique tokens count their term frequency in each review and document frequency
tf_dict = {}  # data structure to hold term frequencies in each reviews
df_dict = (
    Counter()
)  # data structure to hold document frequency. You can read more about 'Counters' here: https://docs.python.org/3.1/library/collections.html
# token_counter=0
for review_id, review_tokens in enumerate(reviews_list):
    tf_dict[review_id] = Counter()
    for token in review_tokens:
        if token not in df_dict:
            df_dict[token] = 1
        if token in tf_dict[review_id]:
            tf_dict[review_id][token] += 1
        else:
            tf_dict[review_id][token] = 1
            df_dict[token] += 1


In [10]:
pprint("number of unique tokens: " + str(len(df_dict)))

'number of unique tokens: 22888'


In [11]:

# since the numberof unqiue words is too high. We plot histogram to see extremly common
# or extremely rare tokens
import matplotlib.pyplot as plt


n, bins, patches = plt.hist(x=list(df_dict.values()), bins=1000, color="#0504aa")
plt.grid(axis="y", alpha=0.75)
plt.xlabel("term frequency")
plt.ylabel("Count")
plt.title("Document Frequency of Tokens")
maxfreq = n.max()
# Set a clean upper y-axis limit.
plt.ylim(ymax=100)


(0.0, 100)

In [12]:
# remove tokens which are extremely common or extremely rare
lower_count_thr = 100 # rare words/tokens
upper_count_thr = 5000 # frequent/common tokens
trim_df_dict = Counter(
    {
        token: df_dict[token]
        for token in df_dict
        if lower_count_thr <= df_dict[token] <= upper_count_thr
    }
)
pprint("number of unique tokens after trimming: " + str(len(trim_df_dict)))
pprint("most frequent tokens")
pprint(trim_df_dict.most_common(10))
pprint("least frequent tokens")
pprint(trim_df_dict.most_common()[:-11:-1])


'number of unique tokens after trimming: 199'
'most frequent tokens'
[('movie', 854),
 ('film', 759),
 ('one', 748),
 ('like', 632),
 ('good', 486),
 ('see', 470),
 ('even', 460),
 ('would', 456),
 ('time', 444),
 ('really', 429)]
'least frequent tokens'
[('shot', 100),
 ('actor', 100),
 ('remember', 100),
 ('completely', 101),
 ('put', 101),
 ('left', 101),
 ('help', 101),
 ('rest', 101),
 ('dvd', 102),
 ('simply', 102)]


In [13]:
# create token to id maps
token_to_id_dict = {
    token: index for index, token in enumerate(trim_df_dict)
}  # http://book.pythontips.com/en/latest/enumerate.html
id_to_token_dict = {index: token for index, token in enumerate(trim_df_dict)}
tf_idf_values = []
N = len(reviews_list)
# we are ready to compute tfidf
from math import log


tf_idf_lists = []

ctr = 0
for review_id, review_tokens in enumerate(reviews_list):

    # since its not advised to hold full tf-idf dictionary due to limited  RAM, we only compute
    # tf-idf of top 1000 reviews. IN practice the vectors are dumped to storage (a file)
    # line by line
    if ctr > 1000:
        break
    ctr += 1
    tf_idf_list = [0] * len(token_to_id_dict)
    for token in review_tokens:
        if token not in token_to_id_dict:
            continue
        token_id = token_to_id_dict[token]
        # TFIDF formula
        if token not in tf_dict[review_id]:
            continue
        tf = tf_dict[review_id][token]
        idf = log(N / df_dict[token], 10)  # log to base 10
        tf_idf = tf * idf
        tf_idf_list[token_id] = tf_idf
        tf_idf_values.append(tf_idf)
    tf_idf_lists.append(tf_idf_list)


In [14]:
# print sample tfidf vector values
review_id =2
tf_idf_vector = [tf_idf_lists[review_id][index] for index in [token_to_id_dict[token] for token in tf_dict[review_id] if token in token_to_id_dict]]
pprint(tf_idf_vector[:10])

[1.9128607911750006,
 0.608656996381193,
 0.49656814630934365,
 0.7956462533758027,
 2.080294790936013,
 0.7046184225937008,
 0.6756037860118062,
 0.4811932847423893,
 0.5584903869154628,
 1.9299483382615878]


In [15]:
s = {2,5,4}
print(s)

{2, 4, 5}


In [None]:
term = sunshine
count = 20 
N = 1000
DF = 20 

