### **Sentiment Analysis Of IMDB Dataset**

In [7]:
import pandas as pd

In [8]:
df_imdb=pd.read_csv("IMDB Dataset.csv")

In [9]:
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## **Preprocessing the data**

In [10]:
#replacing punctuation with space

df_imdb["review_processed"]=df_imdb["review"].str.replace("[^a-zA-Z0-9]", " ")


  df_imdb["review_processed"]=df_imdb["review"].str.replace("[^a-zA-Z0-9]", " ")


In [11]:
#reordering the columns

df_imdb=df_imdb[["review","review_processed","sentiment"]]

In [12]:
df_imdb["review_processed"]

0        One of the other reviewers has mentioned that ...
1        A wonderful little production   br    br   The...
2        I thought this was a wonderful way to spend ti...
3        Basically there s a family where a little boy ...
4        Petter Mattei s  Love in the Time of Money  is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot  bad dialogue  bad acting  idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I m going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review_processed, Length: 50000, dtype: object

In [13]:
#make entire text lowercase

# Iterate through each row and make the text lowercase
for index, row in df_imdb.iterrows():
    df_imdb.at[index, "review_processed"] = row["review_processed"].lower()


#df_imdb['review_processed'] = [row.lower() for row in df_imdb['review_processed']]


#df_imdb["review_processed"] = df_imdb["review_processed"].apply(lambda x: x.lower())


In [14]:
df_imdb["review_processed"]

0        one of the other reviewers has mentioned that ...
1        a wonderful little production   br    br   the...
2        i thought this was a wonderful way to spend ti...
3        basically there s a family where a little boy ...
4        petter mattei s  love in the time of money  is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot  bad dialogue  bad acting  idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i m going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review_processed, Length: 50000, dtype: object

In [15]:
#removing stopwords

import nltk
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
stop_words = stopwords.words("english")

add_words = ['movie','br','go','film','ugh','one','make','even','see','movies','get','makes','making','time','watch','character', 'like', 'good','well','would','really', 'show', 'look']

stop_words.extend(add_words)

def remove_stopwords(rev):
  review_tokenize=word_tokenize(rev)
  rev_new = " ".join([i for i in review_tokenize  if i not in stop_words])
  return rev_new

df_imdb['review_processed'] = [remove_stopwords(r) for r in df_imdb['review_processed']]

## Lemmatization

In [21]:
nltk.download("wordnet")
nltk.download('omw-1.4')
nltk.download("averaged_perceptron_tagger")
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [22]:
lemmatizer = WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def lemmatize_sentence(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  # output will be a list of tuples -> [(word,detailed_tag)]
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged) # output -> [(word,shallow_tag)]
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


df_imdb['review_processed'] = df_imdb['review_processed'].apply(lambda x: lemmatize_sentence(x))

## **Bag of words model**

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=2500)
X = tfidf.fit_transform(df_imdb.review_processed).toarray()
y = df_imdb.sentiment.map({'positive' : 1, 'negative' : 0}).values
featureNames = tfidf.get_feature_names_out()



In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=777)

### **Building the ML model**

In [26]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

y_pred = dt.predict(X_test)
y_pred

array([0, 1, 0, ..., 1, 1, 0])

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("The model accuracy is", accuracy )

[[3538 1415]
 [1378 3669]]
The model accuracy is 0.7207


In [28]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.7243115190997927

In [29]:
##featureImportance.sort_values(by='Importance')
featureImportance = pd.DataFrame({i : j for i,j in zip(dt.feature_importances_,featureNames)}.items(),columns = ['Importance','word'])
featureImportance.sort_values(by='Importance',ascending=False)

Unnamed: 0,Importance,word
113,0.128606,bad
1374,0.043308,waste
558,0.038051,great
108,0.021526,awful
427,0.014453,excellent
...,...,...
824,0.000050,multiple
596,0.000050,henry
954,0.000049,positive
37,0.000033,al
