In [21]:
import numpy as np
import regex as re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import string
import nltk as nlp

#### LOADING IMDB DATASET

In [2]:
# Load the IMDB reviews CSV that sits one directory above the notebook.
# A forward slash is accepted as a separator on Windows as well as POSIX,
# whereas a backslash separator only resolves on Windows.
dataframe = pd.read_csv('../IMDB Dataset.csv')

In [3]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
dataframe.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# (rows, columns) of the frame as loaded
dataframe.shape

(50000, 2)

#### CHECK FOR NULLS AND DUPLICATES

In [6]:
dataframe.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
# Count rows whose review text repeats an earlier row. Keying on 'review'
# matches the criterion the drop_duplicates(subset='review') cell uses, so the
# count reported here is exactly the number of rows that cell removes.
dataframe.duplicated(subset='review').sum()

418

In [8]:
# Drop every row whose review text repeats an earlier row, keeping the first
# occurrence. inplace=True mutates `dataframe` itself instead of returning a copy.
dataframe.drop_duplicates(subset='review', keep='first', inplace=True)

In [9]:
# (rows, columns) once the repeated reviews have been dropped
dataframe.shape

(49582, 2)

In [10]:
# Sanity check: no review text should repeat any more. Keyed on 'review' so it
# verifies the same criterion that drop_duplicates(subset='review') applied.
dataframe.duplicated(subset='review').sum()

0

In [11]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### REMOVING NON-WORD CHARACTERS FROM THE DATASET

In [12]:
def preprocess(text, pattern):
    """Remove every match of ``pattern`` from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``text`` with each match replaced by the empty string. The single
        special case is the pattern ``[\\.]+`` (one or more full stops), whose
        matches are replaced by ``'. '`` instead.

    Notes
    -----
    Because matches are replaced by ``''``, the text on either side of a
    removed match ends up joined together.
    """
    # Raw string literal: in a plain literal '\.' is an invalid escape sequence,
    # which newer Python versions warn about. The pattern text itself is unchanged.
    replacement = '. ' if pattern == r'[\.]+' else ''
    return re.sub(pattern, replacement, text)

In [13]:
# Strip HTML markup such as <br />. Series.apply hands each review to
# preprocess and returns a Series on the same index, instead of routing the
# column through np.vectorize (a convenience loop that yields a bare ndarray).
dataframe['review'] = dataframe['review'].apply(preprocess, args=(r'<[^>]*>',))

In [14]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
# Remove full stops and digits. The pattern is a raw string so that '\.' is
# not read as an (invalid) string escape; the regex itself is unchanged.
# NOTE(review): preprocess replaces matches with '', so text on either side of
# a removed run is joined (e.g. 'end.Next' -> 'endNext') — confirm this is intended.
dataframe['review'] = dataframe['review'].apply(preprocess, args=(r'[\.0-9]+',))
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [16]:
# Remove every character that is not a word character, whitespace or an
# apostrophe; the apostrophe is kept so contractions (e.g. "there's") survive.
# '+' rather than '*': the result is the same, but the regex no longer produces
# a zero-length match at every position of every review.
dataframe['review'] = dataframe['review'].apply(preprocess, args=(r"[^\w\s']+",))
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,Petter Mattei's Love in the Time of Money is a...,positive


In [17]:
# Normalise case so that e.g. "The" and "the" count as the same token.
lowercase_reviews = dataframe['review'].str.lower()
dataframe['review'] = lowercase_reviews
dataframe.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,petter mattei's love in the time of money is a...,positive


#### DOES IT MAKE SENSE TO REMOVE SOME WORDS TO REDUCE COMPUTATION?

In [18]:
# Fit a CountVectorizer with default settings on the cleaned reviews to see how
# large the vocabulary is before any stop-word removal.
count = CountVectorizer()
bag = count.fit_transform(dataframe['review'])  # bag-of-words result of fitting on the review column

In [19]:
len(count.vocabulary_)

206627

#### The vocabulary has over 2 lakh (200,000) distinct tokens, so it makes sense to remove very common words such as articles and prepositions

In [22]:
# TODO: apply TF-IDF weighting to the bag of words.
# Open question: should stop words be removed (using the NLTK corpus) before computing TF-IDF,
# or should TF-IDF be computed first and the stop words removed afterwards?

In [20]:
# Fetch NLTK's 'stopwords' package (`nlp` is the alias this notebook gives to nltk).
nlp.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True