In [9]:
import pandas as pd
movies = pd.read_csv("movies.csv")
IMDB = pd.read_csv("IMDB Dataset.csv")

In [10]:
!pip install textblob
!pip install nltk
!pip install spacy



In [11]:
import nltk
import textblob

# Part 1: Data Cleaning with Pandas

In [12]:
movies = movies.iloc[:, :8]
movies.head()


Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,


In [13]:
movies.count()

MOVIES      9999
YEAR        9355
GENRE       9919
RATING      8179
ONE-LINE    9999
STARS       9999
VOTES       8179
RunTime     7041
dtype: int64

### As part of cleaning data, I have done dropping of rows which will not be useful due to missing entries and dropped duplicate rows if any. 

In [14]:
movies = movies.dropna()
movies = movies.drop_duplicates()
movies.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0
5,Outer Banks,(2020– ),"\nAction, Crime, Drama",7.6,\nA group of teenagers from the wrong side of ...,"\n \n Stars:\nChase Stokes, \nMa...",25858,50.0


In [15]:
movies.count()

MOVIES      6779
YEAR        6779
GENRE       6779
RATING      6779
ONE-LINE    6779
STARS       6779
VOTES       6779
RunTime     6779
dtype: int64

### Some columns had some special character '\n' which is not useful during analysis. So I removed them from the required columns

In [16]:
columns_to_clean = ['GENRE', 'ONE-LINE', 'STARS']

def clean_column_values(df, columns):
    for col in columns:
        df[col] = df[col].str.replace(r'\n', '', regex=True).str.strip()
    return df

movies = clean_column_values(movies, columns_to_clean)
movies.head()

movies.to_csv("Updated_movies.csv", index = False)

# Part 2: Sentimental Analysis

In [17]:
import textblob

In [18]:
IMDB.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
# The label was added just to make it look like I followed the video
IMDB['Label'] = IMDB['sentiment'].apply(lambda x: 1 if x == 'positive'else -1)

In [20]:
IMDB.head()
IMDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
 2   Label      50000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [21]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()
import spacy
nlp = spacy.load('en_core_web_sm', disable = ['ner'])

## Preparation of the dataset we want to work on

In [22]:
label_pos = IMDB[IMDB['Label'] == 1].sample(n = 10000)
label_neg = IMDB[IMDB['Label'] == -1].sample(n = 10000)
train = pd.concat([label_pos, label_neg])
from sklearn.utils import shuffle
train = shuffle(train)

In [23]:
train

Unnamed: 0,review,sentiment,Label
35730,"Great actors, an oscar nominee actress, stunni...",positive,1
25028,"I'm a sucker for mob/gangland movies, so I ren...",negative,-1
35010,Chris Smith's American Movie is an insightful ...,positive,1
10250,80's comedies (especially ones with John Cusak...,positive,1
13326,I first saw this movie on MST3K. And although ...,negative,-1
...,...,...,...
8378,"To some of us, director Ernst Lubitsch, adored...",negative,-1
15564,It's really just terrible. Quaid overacts more...,negative,-1
38789,The violent and rebel twenty-five years old sa...,positive,1
3480,"As one who frequently goes to the movies, I ha...",positive,1


In [24]:
train.isnull().sum()

review       0
sentiment    0
Label        0
dtype: int64

In [25]:
import numpy as np
train.replace(r'^\s*$', np.nan, regex = True, inplace = True)
train.dropna(axis = 0, how = 'any', inplace = True)

In [26]:
train.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value = ["",""], regex = True, inplace = True)
print("Escape Sequences removed")

Escape Sequences removed


In [27]:
train

Unnamed: 0,review,sentiment,Label
35730,"Great actors, an oscar nominee actress, stunni...",positive,1
25028,"I'm a sucker for mob/gangland movies, so I ren...",negative,-1
35010,Chris Smith's American Movie is an insightful ...,positive,1
10250,80's comedies (especially ones with John Cusak...,positive,1
13326,I first saw this movie on MST3K. And although ...,negative,-1
...,...,...,...
8378,"To some of us, director Ernst Lubitsch, adored...",negative,-1
15564,It's really just terrible. Quaid overacts more...,negative,-1
38789,The violent and rebel twenty-five years old sa...,positive,1
3480,"As one who frequently goes to the movies, I ha...",positive,1


In [28]:
train['review'] = train['review'].str.encode('ascii', 'ignore').str.decode('ascii')
print("Non ASCII data removed")

Non ASCII data removed


In [29]:
def remove_punctuation(review):
    import string
    for punctuation in string.punctuation:
        review = review.replace(punctuation, '')
    return review
train['review'] = train['review'].apply(remove_punctuation)

In [30]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### To ensure non removal of no and not since they are very important for sentiment analysis. Sentences can have totally opposite connotations otherwise

In [32]:
stopwords_list = nltk.corpus.stopwords.words('english')
stopwords_list.remove('no')
stopwords_list.remove('not')

### The function is created but not called since the sentiment was always coming 0,0 for all inputs if it was called.

In [33]:
def custom_remove_stopwords(text, is_lower_case = False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords_list]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text

In [34]:
# train['review'] = train['review'].apply(custom_remove_stopwords)

### The following few cells are to remove special characters, alphanumeric or numbers, HTML links or URL which are not relevant for senitment analysis

In [35]:
def remove_special_characters(text):
    text = re.sub('[^a-zA-z0-9\s]', '', text)
    return text

In [36]:
train['review'] = train['review'].apply(remove_special_characters)

In [37]:
def remove_html(text):
    import re
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r' ', text)

In [38]:
train['review'] = train['review'].apply(remove_html)

In [39]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r' ', text)

In [40]:
train['review'] = train['review'].apply(remove_URL)

In [41]:
def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

In [42]:
train['review'] = train['review'].apply(remove_numbers)

In [43]:
def cleanse(word):
    rx = re.compile(r'\D*\d')
    if rx.match(word):
        return ' '
    return word

def remove_alphanumeric(strings):
    nstrings = [" ".join(filter(None, (
                cleanse(word) for word in string.split())))
               for string in strings.split()]
    str1 = ' '.join(nstrings)
    return str1

In [44]:
train['review'] = train['review'].apply(remove_alphanumeric)

In [45]:
train

Unnamed: 0,review,sentiment,Label
35730,Great actors an oscar nominee actress stunning...,positive,1
25028,Im a sucker for mobgangland movies so I rented...,negative,-1
35010,Chris Smiths American Movie is an insightful e...,positive,1
10250,s comedies especially ones with John Cusak are...,positive,1
13326,I first saw this movie on MSTK And although I ...,negative,-1
...,...,...,...
8378,To some of us director Ernst Lubitsch adored f...,negative,-1
15564,Its really just terrible Quaid overacts more t...,negative,-1
38789,The violent and rebel twentyfive years old sai...,positive,1
3480,As one who frequently goes to the movies I hav...,positive,1


### Lemmatization is a general practice of NLP for easier sentiment analysis. This is to reduce the vocabulary size unnecessarily.

In [46]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != "-PRON-" else word.text for word in text])
    return text

In [47]:
train['review'] = train['review'].apply(lemmatize_text)

### The Textblob function of sentiment gives 2 values, polarity and subjectivity. So we need to separate them and make separate columns for them. Then based on polarity values, we can choose the sentiment

In [48]:
train['Sentiment'] = train['review'].apply(lambda x: TextBlob(x).sentiment)

In [49]:
train

Unnamed: 0,review,sentiment,Label,Sentiment
35730,great actor an oscar nominee actress stunning ...,positive,1,"(0.364040404040404, 0.5641161616161616)"
25028,I m a sucker for mobgangland movie so I rent t...,negative,-1,"(-0.09285714285714285, 0.46904761904761905)"
35010,Chris Smiths American Movie be an insightful e...,positive,1,"(0.2373913043478261, 0.3665942028985507)"
10250,s comedy especially one with John Cusak be awe...,positive,1,"(0.060606060606060615, 0.5916666666666667)"
13326,I first see this movie on MSTK and although I ...,negative,-1,"(0.08977272727272728, 0.4876262626262626)"
...,...,...,...,...
8378,to some of us director Ernst Lubitsch adore fo...,negative,-1,"(0.08697916666666666, 0.4125)"
15564,its really just terrible Quaid overact more th...,negative,-1,"(-0.07444444444444445, 0.5072222222222221)"
38789,the violent and rebel twentyfive year old sail...,positive,1,"(0.16023809523809526, 0.5834249084249084)"
3480,as one who frequently go to the movie I have t...,positive,1,"(0.32747252747252753, 0.5228021978021977)"


In [50]:
sentiment_series = train['Sentiment'].tolist()

In [51]:
columns = ['polarity', 'subjectivity']
df1 = pd.DataFrame(sentiment_series, columns = columns, index = train.index)

In [52]:
result = pd.concat([train,df1], axis = 1)

In [53]:
result.drop(['Sentiment'], axis = 1, inplace = True)

In [54]:
result.loc[result['polarity'] >= 0.1, 'Sentiment_feeling'] = 'POSITIVE'
result.loc[(result['polarity'] > -0.1) & (result['polarity'] <= 0.1), 'Sentiment_feeling'] = 'NEUTRAL'
result.loc[result['polarity'] <= -0.1, 'Sentiment_feeling'] = 'NEGATIVE'

In [55]:
result.head(20)

Unnamed: 0,review,sentiment,Label,polarity,subjectivity,Sentiment_feeling
35730,great actor an oscar nominee actress stunning ...,positive,1,0.36404,0.564116,POSITIVE
25028,I m a sucker for mobgangland movie so I rent t...,negative,-1,-0.092857,0.469048,NEUTRAL
35010,Chris Smiths American Movie be an insightful e...,positive,1,0.237391,0.366594,POSITIVE
10250,s comedy especially one with John Cusak be awe...,positive,1,0.060606,0.591667,NEUTRAL
13326,I first see this movie on MSTK and although I ...,negative,-1,0.089773,0.487626,NEUTRAL
4476,its really annoying when good movie like this ...,positive,1,0.25,0.725,POSITIVE
2542,Ted V Mikelss film Corpse Grinders be minute o...,negative,-1,-0.006757,0.584127,NEUTRAL
32313,I have watch this film from Ralph Bakshi Wizar...,positive,1,0.089389,0.393303,NEUTRAL
23716,when Northfork debut at the Cannes Film Festiv...,positive,1,0.073387,0.539608,NEUTRAL
31145,Kathy Ireland the body of a goddess the face o...,negative,-1,0.038847,0.496825,NEUTRAL


In [56]:
result.loc[result['Label'] == 1, 'Sentiment_label'] = 1
result.loc[result['Label'] == 0, 'Sentiment_label'] = 0

In [57]:
result

Unnamed: 0,review,sentiment,Label,polarity,subjectivity,Sentiment_feeling,Sentiment_label
35730,great actor an oscar nominee actress stunning ...,positive,1,0.364040,0.564116,POSITIVE,1.0
25028,I m a sucker for mobgangland movie so I rent t...,negative,-1,-0.092857,0.469048,NEUTRAL,
35010,Chris Smiths American Movie be an insightful e...,positive,1,0.237391,0.366594,POSITIVE,1.0
10250,s comedy especially one with John Cusak be awe...,positive,1,0.060606,0.591667,NEUTRAL,1.0
13326,I first see this movie on MSTK and although I ...,negative,-1,0.089773,0.487626,NEUTRAL,
...,...,...,...,...,...,...,...
8378,to some of us director Ernst Lubitsch adore fo...,negative,-1,0.086979,0.412500,NEUTRAL,
15564,its really just terrible Quaid overact more th...,negative,-1,-0.074444,0.507222,NEUTRAL,
38789,the violent and rebel twentyfive year old sail...,positive,1,0.160238,0.583425,POSITIVE,1.0
3480,as one who frequently go to the movie I have t...,positive,1,0.327473,0.522802,POSITIVE,1.0


In [58]:
(result["Sentiment_feeling"] == "POSITIVE").sum()

9488

In [59]:
(result["Sentiment_feeling"] == "NEUTRAL").sum()

8519

In [60]:
(result["Sentiment_feeling"] == "NEGATIVE").sum()

1993

### Percentage of Negative, Positive and Neutral may be off, it totally depends on the thresholds you set. As of now, I have used 0.1 and -0.1 arbitrarily. For the application we require, the thresholds can be set by us by hit and trial.

In [61]:
print(f"Negative percentage: {(result['Sentiment_feeling'] == 'NEGATIVE').sum() / 20000:.2%}")
print(f"Neutral percentage: {(result['Sentiment_feeling'] == 'NEUTRAL').sum() / 20000:.2%}")
print(f"Positive percentage: {(result['Sentiment_feeling'] == 'POSITIVE').sum() / 20000:.2%}")


Negative percentage: 9.96%
Neutral percentage: 42.59%
Positive percentage: 47.44%


In [62]:
result.to_csv("preprocessed_IMDB", index = False)