##### This. notebook compares all three models with logistic regression (model taking imbalanced data, undersampling data, considering class weights) for the movie subtitles data 

##### Importing all necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pickle
import re
from string import punctuation 
import csv

In [2]:
#importing libraries for models and nlp tasks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
import nltk
import nltk.data
from string import punctuation 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

##### Loading the data, models and supporting files

In [4]:
df_title=pd.read_csv("../data/raw/movies_titles.csv")
df_sub=pd.read_csv("../data/raw/movies_subtitles.csv")

In [33]:
tfidf_vectorizer=pickle.load(open('../tfidfvectors/tfidf_vect.pkl','rb'))
tfidf_vectorizer_under=pickle.load(open('../tfidfvectors/tfidf_vect_undersampling.pkl','rb'))
tfidf_vectorizer_imb=pickle.load(open('../tfidfvectors/tfidf_vect_imb.pkl','rb'))
tfidf_vectorizer_cw=pickle.load(open('../tfidfvectors/tfidf_vect_classweights.pkl','rb'))

In [34]:
test_model_lr=pickle.load(open('../models/lr_mn.pkl','rb'))
test_model_lr_under=pickle.load(open('../models/lr_mn_neutral.pkl','rb'))
test_model_lr_imb=pickle.load(open('../models/lr_mn_imb.pkl','rb'))
test_model_lr_cw=pickle.load(open('../models/lr_mn_classweights.pkl','rb'))

In [35]:
emotion = pd.read_csv('../labels_prediction/emotions.csv')
emotion_neutral = pd.read_csv('../labels_prediction/emotions_neutral.csv')

dic_emotions=emotion.to_dict('series')
dic_emotions_neutral=emotion_neutral.to_dict('series')

print(dic_emotions['emotion'])
print(dic_emotions_neutral['emotion'])

0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
Name: emotion, dtype: object
0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
6     neutral
Name: emotion, dtype: object


##### Preprocessing the data

In [12]:
df_title.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0
3,False,,3600000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.mgm.com/title_title.do?title_star=L...,451,tt0113627,en,Leaving Las Vegas,"Ben Sanderson, an alcoholic Hollywood screenwr...",...,1995-10-27,49800000.0,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,I Love You... The Way You Are.,Leaving Las Vegas,False,7.1,365.0
4,False,,29500000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,63,tt0114746,en,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",...,1995-12-29,168840000.0,129.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The future is history.,Twelve Monkeys,False,7.4,2470.0


In [13]:
df_sub.head()

Unnamed: 0,start_time,end_time,text,imdb_id
0,58.559,61.602,"BOY: All right, everyone!\nThis... is a stick-up!",tt0114709
1,61.687,63.354,Don't anybody move!,tt0114709
2,64.398,66.482,"Now, empty that safe!",tt0114709
3,68.318,71.612,"Ooh-hoo-hoo!\nMoney, money, money! (KISSING)",tt0114709
4,71.697,74.031,"Stop it! Stop it,\nyou mean, old potato!",tt0114709


In [14]:
df_sub.groupby(['imdb_id']).count()

Unnamed: 0_level_0,start_time,end_time,text
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0002130,1049,1049,1049
tt0005044,1338,1338,1338
tt0005059,3471,3471,3471
tt0005060,1457,1457,1457
tt0007338,2834,2834,2772
...,...,...,...
tt6176078,3771,3771,3771
tt6210808,2476,2476,2476
tt6212346,1515,1515,1515
tt6582384,246,246,246


In [15]:
imdb=df_title.loc[df_title['title'] == "Harry Potter and the Philosopher's Stone"]['imdb_id']
imdb

747    tt0241527
Name: imdb_id, dtype: object

In [36]:
df_harry=df_sub.loc[df_sub['imdb_id']==imdb[747]]

In [37]:
df_harry

Unnamed: 0,start_time,end_time,text,imdb_id
2639074,81.700,87.000,"I should've known that you would\nbe here, Pro...",tt0241527
2639075,97.900,101.400,"Good evening, Professor Dumbledore.",tt0241527
2639076,103.599,106.899,"Are the rumors true, Albus?",tt0241527
2639077,106.900,112.299,"I'm afraid so, professor.\nThe good and the bad.",tt0241527
2639078,112.299,115.799,- And the boy?\n- Hagrid is bringing him.,tt0241527
...,...,...,...,...
3259946,8976.299,8980.899,"I do. But your cousin don't, do he?",tt0241527
3259947,8986.100,8989.498,"Feels strange to be going home,\ndoesn't it?",tt0241527
3259948,8989.500,8991.699,I'm not going home.,tt0241527
3259949,8991.700,8994.000,Not really.,tt0241527


In [38]:
def text_cleaning(text):
   
    text = re.sub(r"[^A-Za-z]", " ", text)
    
    
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.split()
    text = [w for w in text if not w in stopwords]
    text = " ".join(text)
        
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)
    text=text.lower()
    
    return text 

In [39]:
df_harry=df_harry[~df_harry.text.str.contains('♪')]

df_harry['cleaned_text'] = df_harry['text'].apply(lambda x: text_cleaning(x))

df_harry = df_harry[df_harry['cleaned_text'].map(len) > 0]


In [40]:
df_harry

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text
2639074,81.700,87.000,"I should've known that you would\nbe here, Pro...",tt0241527,i known would professor mcgonagall
2639075,97.900,101.400,"Good evening, Professor Dumbledore.",tt0241527,good evening professor dumbledore
2639076,103.599,106.899,"Are the rumors true, Albus?",tt0241527,are rumor true albus
2639077,106.900,112.299,"I'm afraid so, professor.\nThe good and the bad.",tt0241527,i afraid professor the good bad
2639078,112.299,115.799,- And the boy?\n- Hagrid is bringing him.,tt0241527,and boy hagrid bringing
...,...,...,...,...,...
3259946,8976.299,8980.899,"I do. But your cousin don't, do he?",tt0241527,i but cousin
3259947,8986.100,8989.498,"Feels strange to be going home,\ndoesn't it?",tt0241527,feels strange going home
3259948,8989.500,8991.699,I'm not going home.,tt0241527,i going home
3259949,8991.700,8994.000,Not really.,tt0241527,not really


##### Testing models

In [41]:
test_tfidf = tfidf_vectorizer.transform(df_harry['cleaned_text'])
test_tfidf_under = tfidf_vectorizer_under.transform(df_harry['cleaned_text'])
test_tfidf_imb = tfidf_vectorizer_imb.transform(df_harry['cleaned_text'])
test_tfidf_cw = tfidf_vectorizer_cw.transform(df_harry['cleaned_text'])

ytest_pred=test_model_lr.predict(test_tfidf)
ytest_pred_under=test_model_lr_under.predict(test_tfidf_under)
ytest_pred_imb=test_model_lr_imb.predict(test_tfidf_imb)
ytest_pred_cw=test_model_lr_cw.predict(test_tfidf_cw)

In [42]:
df_harry['predicted_label']=ytest_pred
df_harry['predicted_label_under']=ytest_pred_under
df_harry['predicted_label_imb']=ytest_pred_imb
df_harry['predicted_label_cw']=ytest_pred_cw

In [43]:
df_harry

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw
2639074,81.700,87.000,"I should've known that you would\nbe here, Pro...",tt0241527,i known would professor mcgonagall,1,6,6,6
2639075,97.900,101.400,"Good evening, Professor Dumbledore.",tt0241527,good evening professor dumbledore,1,6,6,6
2639076,103.599,106.899,"Are the rumors true, Albus?",tt0241527,are rumor true albus,1,6,6,6
2639077,106.900,112.299,"I'm afraid so, professor.\nThe good and the bad.",tt0241527,i afraid professor the good bad,4,6,6,6
2639078,112.299,115.799,- And the boy?\n- Hagrid is bringing him.,tt0241527,and boy hagrid bringing,1,6,6,6
...,...,...,...,...,...,...,...,...,...
3259946,8976.299,8980.899,"I do. But your cousin don't, do he?",tt0241527,i but cousin,0,6,6,6
3259947,8986.100,8989.498,"Feels strange to be going home,\ndoesn't it?",tt0241527,feels strange going home,5,5,4,5
3259948,8989.500,8991.699,I'm not going home.,tt0241527,i going home,4,6,6,6
3259949,8991.700,8994.000,Not really.,tt0241527,not really,1,6,6,6


In [46]:
df_harry['predicted_emotion'] = df_harry['predicted_label'].map(dic_emotions['emotion'])
df_harry['predicted_emotion_under'] = df_harry['predicted_label_under'].map(dic_emotions_neutral['emotion'])
df_harry['predicted_emotion_imb'] = df_harry['predicted_label_imb'].map(dic_emotions_neutral['emotion'])
df_harry['predicted_emotion_cw'] = df_harry['predicted_label_cw'].map(dic_emotions_neutral['emotion'])



In [47]:
df_harry # Harry potter

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
2639074,81.700,87.000,"I should've known that you would\nbe here, Pro...",tt0241527,i known would professor mcgonagall,1,6,6,6,joy,neutral,neutral,neutral
2639075,97.900,101.400,"Good evening, Professor Dumbledore.",tt0241527,good evening professor dumbledore,1,6,6,6,joy,neutral,neutral,neutral
2639076,103.599,106.899,"Are the rumors true, Albus?",tt0241527,are rumor true albus,1,6,6,6,joy,neutral,neutral,neutral
2639077,106.900,112.299,"I'm afraid so, professor.\nThe good and the bad.",tt0241527,i afraid professor the good bad,4,6,6,6,fear,neutral,neutral,neutral
2639078,112.299,115.799,- And the boy?\n- Hagrid is bringing him.,tt0241527,and boy hagrid bringing,1,6,6,6,joy,neutral,neutral,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3259946,8976.299,8980.899,"I do. But your cousin don't, do he?",tt0241527,i but cousin,0,6,6,6,sadness,neutral,neutral,neutral
3259947,8986.100,8989.498,"Feels strange to be going home,\ndoesn't it?",tt0241527,feels strange going home,5,5,4,5,surprise,surprise,fear,surprise
3259948,8989.500,8991.699,I'm not going home.,tt0241527,i going home,4,6,6,6,fear,neutral,neutral,neutral
3259949,8991.700,8994.000,Not really.,tt0241527,not really,1,6,6,6,joy,neutral,neutral,neutral


In [48]:
df_harry.groupby(['predicted_emotion']).count()

Unnamed: 0_level_0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
predicted_emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
anger,264,264,264,264,264,264,264,264,264,264,264,264
fear,63,63,63,63,63,63,63,63,63,63,63,63
joy,2754,2754,2754,2754,2754,2754,2754,2754,2754,2754,2754,2754
love,15,15,15,15,15,15,15,15,15,15,15,15
sadness,780,780,780,780,780,780,780,780,780,780,780,780
surprise,33,33,33,33,33,33,33,33,33,33,33,33


In [50]:
df_harry.groupby(['predicted_emotion_under']).count() 

Unnamed: 0_level_0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_imb,predicted_emotion_cw
predicted_emotion_under,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
anger,12,12,12,12,12,12,12,12,12,12,12,12
fear,27,27,27,27,27,27,27,27,27,27,27,27
joy,24,24,24,24,24,24,24,24,24,24,24,24
love,15,15,15,15,15,15,15,15,15,15,15,15
neutral,3780,3780,3780,3780,3780,3780,3780,3780,3780,3780,3780,3780
sadness,15,15,15,15,15,15,15,15,15,15,15,15
surprise,36,36,36,36,36,36,36,36,36,36,36,36


In [51]:
df_harry.groupby(['predicted_emotion_imb']).count() 

Unnamed: 0_level_0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_cw
predicted_emotion_imb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
anger,51,51,51,51,51,51,51,51,51,51,51,51
fear,48,48,48,48,48,48,48,48,48,48,48,48
joy,198,198,198,198,198,198,198,198,198,198,198,198
love,9,9,9,9,9,9,9,9,9,9,9,9
neutral,3453,3453,3453,3453,3453,3453,3453,3453,3453,3453,3453,3453
sadness,120,120,120,120,120,120,120,120,120,120,120,120
surprise,30,30,30,30,30,30,30,30,30,30,30,30


In [52]:
df_harry.groupby(['predicted_emotion_cw']).count() 

Unnamed: 0_level_0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb
predicted_emotion_cw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
anger,30,30,30,30,30,30,30,30,30,30,30,30
fear,30,30,30,30,30,30,30,30,30,30,30,30
joy,30,30,30,30,30,30,30,30,30,30,30,30
love,9,9,9,9,9,9,9,9,9,9,9,9
neutral,3744,3744,3744,3744,3744,3744,3744,3744,3744,3744,3744,3744
sadness,33,33,33,33,33,33,33,33,33,33,33,33
surprise,33,33,33,33,33,33,33,33,33,33,33,33


In [54]:
df_harry['text'][0:10]

  df_harry['text'][0:10]


2639074    I should've known that you would\nbe here, Pro...
2639075                  Good evening, Professor Dumbledore.
2639076                          Are the rumors true, Albus?
2639077     I'm afraid so, professor.\nThe good and the bad.
2639078            - And the boy?\n- Hagrid is bringing him.
2639079    Do you think it wise to trust Hagrid\nwith som...
2639080    Ah, Professor, I would trust Hagrid\nwith my l...
2639081    Professor Dumbledore, sir.\nProfessor McGonagall.
2639082          - No problems, I trust, Hagrid?\n- No, sir.
2639083    Little tyke fell asleep just\nas we were flyin...
Name: text, dtype: object

##### To confirm the results from the models, checking the emotion of the movie subtitle manually.

In [56]:
df_harry.to_csv("../data/processed/manual_testing_harry.csv",header=False)

In [64]:
df_harry.iloc[[54]] # sadness is the correct emotion

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
2639128,411.699,414.3,Do you miss your family?,tt0241527,do miss family,0,6,6,6,sadness,neutral,neutral,neutral


In [65]:
df_harry.iloc[[55]] # This is also sad. all the predictions are wrong.

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
2639129,416.899,421.798,I see. That's me as well.\nI never knew my par...,tt0241527,i see that well i never knew parent either,1,6,6,6,joy,neutral,neutral,neutral


In [74]:
df_harry.iloc[[150]]

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
2639224,1130.599,1135.398,Never insult Albus Dumbledore...,tt0241527,never insult albus dumbledore,1,6,6,6,joy,neutral,neutral,neutral


In [75]:
df_harry.iloc[[151]]

Unnamed: 0,start_time,end_time,text,imdb_id,cleaned_text,predicted_label,predicted_label_under,predicted_label_imb,predicted_label_cw,predicted_emotion,predicted_emotion_under,predicted_emotion_imb,predicted_emotion_cw
2639225,1135.4,1137.7,...in front of me.,tt0241527,front,3,6,6,6,anger,neutral,neutral,neutral


The above two rows, if combined together should be angry emotion. When splittled the emotion is not captured correctly from any model. 