In [42]:
import numpy as np
import regex as re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import string
import nltk as nlp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter #like map but worse cuz it senses only the tally --> not for computation :(

#### LOADING IMDB DATASET

In [43]:
# Forward slashes resolve on Windows and POSIX alike. The previous raw string
# r'..\\IMDB Dataset.csv' held two literal backslashes and only worked on Windows.
dataframe = pd.read_csv('../IMDB Dataset.csv')

In [44]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [45]:
dataframe.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [46]:
np.shape(dataframe)

(50000, 2)

In [47]:
# Encode the label column as integers: 'positive' -> 1, 'negative' -> 0.
# One explicit mapping replaces the two chained ``replace`` calls, whose implicit
# object -> int downcast is deprecated in recent pandas releases.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run on labels that
# are already encoded; any other label becomes NaN and makes ``astype`` raise
# instead of silently staying in the column.
SENTIMENT_CODES = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
dataframe['sentiment'] = dataframe['sentiment'].map(SENTIMENT_CODES).astype('int64')
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### CHECK FOR NULLS AND DUPLICATES

In [48]:
dataframe.isnull().sum()

review       0
sentiment    0
dtype: int64

In [49]:
dataframe.duplicated().sum()

418

In [50]:
# Drop rows whose review text repeats an earlier row, keeping the first occurrence.
# This mutates `dataframe` in place. No index reset is requested, so the surviving
# rows keep their original labels (the index has gaps afterwards).
dataframe.drop_duplicates(subset='review', keep='first', inplace=True)

In [51]:
np.shape(dataframe)

(49582, 2)

In [52]:
dataframe.duplicated().sum()

0

In [53]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### split into test and train samples

#### REMOVING NON-WORD CHARACTERS FROM THE DATASET

In [54]:
# Replacement text used when the caller does not pass ``repl``.
# Patterns that are not listed here are simply deleted from the text.
_DEFAULT_REPLACEMENTS = {
    '[.]+': '. ',  # collapse a run of periods into one sentence break
    "[']": ' ',    # split contractions on the apostrophe
}


def preprocess(text, pattern, repl=None):
    """Return ``text`` with every match of the regex ``pattern`` substituted.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        Regular expression whose matches are replaced.
    repl : str, optional
        Replacement text. When omitted, the historical defaults apply:
        ``'[.]+'`` is replaced by ``'. '``, ``"[']"`` by a single space, and
        every other pattern by the empty string (i.e. its matches are removed).

    Returns
    -------
    str
        The text after substitution.
    """
    if repl is None:
        # Preserve the original two-argument behaviour for existing callers.
        repl = _DEFAULT_REPLACEMENTS.get(pattern, '')
    return re.sub(pattern, repl, text)

In [55]:
# Strip HTML markup such as <br />.
# Series.map calls preprocess once per review and keeps ordinary Python strings;
# np.vectorize called it twice on the first row (to infer the output type) and
# built a fixed-width unicode array sized to the longest review.
# NOTE(review): tags are replaced by the empty string, so two words separated only
# by a tag end up fused together -- confirm this is acceptable.
dataframe['review'] = dataframe['review'].map(lambda review: preprocess(review, '<[^>]*>'))
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [56]:
# Collapse every run of periods (".", "...", ...) into ". " so sentences stay separated.
# Series.map avoids np.vectorize's extra call on the first row and its
# fixed-width unicode intermediate array.
dataframe['review'] = dataframe['review'].map(lambda review: preprocess(review, '[.]+'))
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming te...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [58]:
# Remove every run of digits from the reviews.
# Series.map avoids np.vectorize's extra call on the first row and its
# fixed-width unicode intermediate array.
dataframe['review'] = dataframe['review'].map(lambda review: preprocess(review, '[0-9]+'))
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming te...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [59]:
# Replace each apostrophe with a space so contractions split into fragments
# (e.g. "you'll" -> "you ll").
# Series.map avoids np.vectorize's extra call on the first row and its
# fixed-width unicode intermediate array.
dataframe['review'] = dataframe['review'].map(lambda review: preprocess(review, "[']"))
print(dataframe.loc[0, 'review'])
dataframe.head()

One of the other reviewers has mentioned that after watching just  Oz episode you ll be hooked.  They are right, as this is exactly what happened with me. The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO.  Trust me, this is not a show for the faint hearted or timid.  This show pulls no punches with regards to drugs, sex or violence.  Its is hardcore, in the classic use of the word. It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary.  It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda.  Em City is home to many. Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more. so scuffles, death stares, dodgy dealings and shady agreements are never far away. I would say the main appeal of the show is due to the fact that it goes where other sho

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming te...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,"Petter Mattei s ""Love in the Time of Money"" is...",1


In [60]:
# Remove every character that is not a word character, whitespace or an apostrophe.
# - The raw string avoids the invalid "\w" / "\s" escape-sequence warnings that the
#   plain string literal triggers on recent Python versions.
# - "+" instead of "*" stops the regex from matching the empty string at every
#   position; the resulting text is the same.
# ASCII apostrophes were already turned into spaces by the previous cleaning cell,
# so keeping "'" in the character class no longer preserves any contractions.
# Series.map avoids np.vectorize's extra call on the first row and its
# fixed-width unicode intermediate array.
dataframe['review'] = dataframe['review'].map(lambda review: preprocess(review, r"[^\w\s']+"))
print(dataframe.loc[0, 'review'])
dataframe.head()

One of the other reviewers has mentioned that after watching just  Oz episode you ll be hooked  They are right as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO  Trust me this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs sex or violence  Its is hardcore in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda  Em City is home to many Aryans Muslims gangstas Latinos Christians Italians Irish and more so scuffles death stares dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare  Forget p

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,Petter Mattei s Love in the Time of Money is a...,1


In [61]:
# Lower-case every review so later token comparisons do not depend on letter case
# (the stop-word membership test further down compares tokens verbatim).
dataframe['review'] = dataframe['review'].str.lower()
dataframe.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is a...,1


In [62]:
print(dataframe.loc[0, 'review'])

one of the other reviewers has mentioned that after watching just  oz episode you ll be hooked  they are right as this is exactly what happened with me the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go  trust me this is not a show for the faint hearted or timid  this show pulls no punches with regards to drugs sex or violence  its is hardcore in the classic use of the word it is called oz as that is the nickname given to the oswald maximum security state penitentary  it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda  em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare  forget p

#### DOES IT MAKE SENSE TO REMOVE SOME WORDS TO REDUCE COMPUTATION?

In [63]:
# Fit a bag-of-words model on the cleaned reviews to measure the vocabulary size.
count = CountVectorizer()
bag = count.fit_transform(dataframe['review'])
# Number of distinct tokens CountVectorizer learned from the corpus.
len(count.vocabulary_)

148992

#### Got ~1.5 lakh (148,992) distinct tokens --> it makes sense to filter out some words, such as articles and prepositions
#### Better to remove stop words first (Why? --> TODO: document the reasoning)

In [64]:
# TODO (tf-idf): decide the order of operations.
# Question: should TF-IDF be computed first and the NLTK stop words removed afterwards,
# or should the stop words be removed first and TF-IDF computed on what remains?

In [65]:
# word_tokenize (used by remove_stopwords_and_stem) needs NLTK's Punkt sentence
# models, which this notebook never downloaded; on a fresh environment the stemming
# cell would fail with LookupError.
# Newer NLTK releases look for 'punkt_tab', older ones for 'punkt', so both are
# requested. NOTE(review): a name unknown to the installed release is expected to
# just return False -- confirm on the target NLTK version.
for tokenizer_pkg in ('punkt', 'punkt_tab'):
    nlp.download(tokenizer_pkg, quiet=True)
# Kept last so the cell still displays the result of the stop-word download.
nlp.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
stop = set(stopwords.words('english')) #a set makes membership search O(1) on average
#stopwords.words('english') itself yields a list
print(stop)

{'out', "don't", 'other', 'been', 'hadn', "haven't", 're', 'only', 'then', "shouldn't", 'too', 'off', 'wouldn', "needn't", 'that', 't', 'in', 'very', 'or', 'to', 'such', 'aren', 'when', 'just', 'most', 'what', 'these', 'is', "didn't", 'this', 'if', "you've", 'did', 'same', 'm', 'i', 'our', "hadn't", 'so', 'didn', "isn't", 'both', 'by', 'yourself', 'on', "you'd", 'into', 'we', 'again', 'have', 'from', 'more', 'his', 'mightn', "couldn't", 'theirs', 'themselves', "it's", "you'll", 'because', 'weren', 'down', 'than', 'am', "wouldn't", 'yourselves', 'had', "shan't", 'were', 'until', 'about', 'with', "doesn't", 'own', 'doesn', 'couldn', 'those', 'while', 'won', 'hers', 'each', 'don', "hasn't", 'any', 'shouldn', 'a', 'needn', 'some', 'between', 'all', 'but', 'himself', 'can', "won't", 'myself', 'during', 'where', 'haven', 'be', 'her', 'mustn', 'they', 'an', 'having', 'which', 'will', "she's", 'once', 've', 'there', 'nor', "aren't", 'do', "you're", 'll', 'over', 'for', "weren't", 'hasn', 'few'

In [67]:
# Porter stemmer instance; remove_stopwords_and_stem calls ps.stem on each kept token.
ps  = PorterStemmer()

In [68]:
def remove_stopwords_and_stem(text):
    """Tokenise ``text``, drop English stop words and stem the remaining tokens.

    Relies on two notebook-level objects: ``stop`` (the set of stop words) and
    ``ps`` (the ``PorterStemmer`` instance).

    The stop-word test is case-insensitive, so the function does not depend on
    the text having been lower-cased by an earlier cell. For input that is
    already lower-case the result is the same as before.

    Parameters
    ----------
    text : str
        The review text to process.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    return ' '.join(
        ps.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in stop
    )

In [69]:
# Tokenise, filter and stem every review.
# Series.map runs the (expensive) function exactly once per row; np.vectorize ran
# it twice on the first row to infer the output type and then built a fixed-width
# unicode array sized to the longest review.
dataframe['review'] = dataframe['review'].map(remove_stopwords_and_stem)

In [70]:
print(dataframe.loc[0, 'review'])

one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side


#### word2vec