In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import preprocessor as p
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
import re
import string
import inflect
from tqdm import tqdm

In [43]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\juanc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
#import database
df = pd.read_csv("../Database/IMDB Dataset.csv")

In [45]:
#preliminary analysis
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [46]:
#count each sentiment
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [47]:
#count average words in each sentiment
df['word_count'] = df['review'].apply(lambda x: len(str(x).split()))
df.groupby(['sentiment'])['word_count'].mean()

sentiment
negative    229.46456
positive    232.84932
Name: word_count, dtype: float64

In [48]:
#print reviews with emojis
df[df['review'].str.contains('[^\w\s#@/:%.,_-]')]

Unnamed: 0,review,sentiment,word_count
0,One of the other reviewers has mentioned that ...,positive,307
1,A wonderful little production. <br /><br />The...,positive,162
2,I thought this was a wonderful way to spend ti...,positive,166
3,Basically there's a family where a little boy ...,negative,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230
...,...,...,...
49995,I thought this movie did a down right good job...,positive,194
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,112
49997,I am a Catholic taught in parochial elementary...,negative,230
49998,I'm going to have to disagree with the previou...,negative,212


# **Data Preprocessing**

In [49]:
#replace <br> with breakline and remove other html tags
df['review'] = df['review'].str.replace('<br />', ' ')

In [50]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def cleanText(txt):
    txt = p.clean(txt)
    txt = remove_punct(txt).lower()
    return txt

def remove_stopwords(text):
    text = [word for word in text if word not in stopwords.words('english')]
    return text

def lemmatization(text):
    # Lemmatization - From plural to single + Base form of a word (example better-> good) with WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

def steamming(text):
    # Stemming (example: beautiful -> beauty) with PorterStemmer
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(word) for word in text])
    return text

def numeric_to_words(text):
    # Initialize inflect engine
    p = inflect.engine()
    
    #replace numbers with words, CONVERT NUMERALS INTO ORDINALS
    text = [p.number_to_words(word) if word.isdigit() else word for word in text]

    #remove ordinal numbers
    text = [word for word in text if not word.isdigit()]
    return text

In [51]:
#clean text function
def clean_text(text):
    text = cleanText(text)
    text = text.split()
    text = remove_stopwords(text)
    text = lemmatization(text)
    text = numeric_to_words(text)
    text = ' '.join(text)
    return text

In [52]:
#create new column for cleaned text use stqdm to show progress

tqdm.pandas()
df['cleaned_review'] = df['review'].progress_apply(lambda x: clean_text(x))

  0%|          | 0/50000 [00:00<?, ?it/s]

100%|██████████| 50000/50000 [58:03<00:00, 14.35it/s]  


In [53]:
#display dataframe
df.head()

Unnamed: 0,review,sentiment,word_count,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,307,one reviewer mentioned watching oz episode you...
1,A wonderful little production. The filming t...,positive,162,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,166,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,138,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230,petter matteis love time money visually stunni...


In [54]:
#save cleaned dataframe
df.to_csv("../Database/IMDB Dataset Cleaned.csv", index=False)