In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

In [2]:
address = '.\IMDB_Dataset.csv'

In [3]:
imdb = pd.read_csv(address)

In [4]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    #text = BeautifulSoup(text).get_text()
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    #text = text.split()

    return text

In [6]:
imdb_trimmed = imdb.dropna().drop_duplicates().reset_index(drop=True)

In [7]:
imdb_trimmed.review = imdb_trimmed.review.apply(lambda x: text_to_word_list(x))

In [8]:
imdb_trimmed = imdb_trimmed.copy()

In [9]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def prep_word (text):
    sw = set(stopwords.words("english"))
    word_tk = word_tokenize(text)
    words_nonstop = [w for w in word_tk if not w in sw]
    port_stem = PorterStemmer()
    lem = WordNetLemmatizer()
    lemm_words = []
    stemmed_word = []
    for w in words_nonstop:
        #stemmed_word.append(port_stem.stem(w))
        lemm_words.append(lem.lemmatize(w))
    return lemm_words

In [11]:
imdb_trimmed.review = imdb_trimmed.review.apply(lambda x: prep_word(x))

In [12]:
imdb_trimmed['review2'] = [' '.join(l) for l in imdb_trimmed['review']]


In [13]:
imdb_trimmed.head()

Unnamed: 0,review,sentiment,review2
0,"[one, reviewer, mentioned, watching, 1, oz, ep...",positive,one reviewer mentioned watching 1 oz episode h...
1,"[wonderful, little, production, filming, techn...",positive,wonderful little production filming technique ...
2,"[thought, wonderful, way, spend, time, hot, su...",positive,thought wonderful way spend time hot summer we...
3,"[basically, family, little, boy, jake, think, ...",negative,basically family little boy jake think zombie ...
4,"[petter, mattei, love, time, money, visually, ...",positive,petter mattei love time money visually stunnin...


In [14]:
import nltk 
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\STaine\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [16]:
imdb_trimmed['scores'] = imdb_trimmed['review2'].apply(lambda x: sia.polarity_scores(x))

In [17]:
imdb_trimmed.head()

Unnamed: 0,review,sentiment,review2,scores
0,"[one, reviewer, mentioned, watching, 1, oz, ep...",positive,one reviewer mentioned watching 1 oz episode h...,"{'neg': 0.31, 'neu': 0.576, 'pos': 0.115, 'com..."
1,"[wonderful, little, production, filming, techn...",positive,wonderful little production filming technique ...,"{'neg': 0.078, 'neu': 0.658, 'pos': 0.264, 'co..."
2,"[thought, wonderful, way, spend, time, hot, su...",positive,thought wonderful way spend time hot summer we...,"{'neg': 0.15, 'neu': 0.533, 'pos': 0.317, 'com..."
3,"[basically, family, little, boy, jake, think, ...",negative,basically family little boy jake think zombie ...,"{'neg': 0.239, 'neu': 0.626, 'pos': 0.135, 'co..."
4,"[petter, mattei, love, time, money, visually, ...",positive,petter mattei love time money visually stunnin...,"{'neg': 0.031, 'neu': 0.695, 'pos': 0.274, 'co..."


In [18]:
imdb_trimmed['compound'] = imdb_trimmed['scores'].apply(lambda x: x['compound'])

In [19]:
imdb_trimmed['comp_score'] = imdb_trimmed['compound'].apply(lambda c: 1 if c >= 0 else 0)

In [20]:
imdb_trimmed['sentiment'] = imdb_trimmed['sentiment'].map({'positive':1,'negative':0})

In [21]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score

In [22]:
f1 = f1_score(imdb_trimmed['sentiment'],imdb_trimmed['comp_score'],pos_label=1)
accuracy = accuracy_score(imdb_trimmed['sentiment'],imdb_trimmed['comp_score'])

print(f"Validation F1 Score  : {f1} and Accuracy Score {accuracy}")

Validation F1 Score  : 0.7285039740552166 and Accuracy Score 0.674135775079666
