In [89]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import spacy
from nltk.stem import WordNetLemmatizer

In [90]:
# Featured reviews dataframe
reviews_df = pd.read_csv('/home/arisbethlaguna/COMP_4447/final_project/Featured_Reviews_with_tmdbid.csv')
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4652 entries, 0 to 4651
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tmdbid           4652 non-null   int64 
 1   Lookup ID        4652 non-null   object
 2   URL              4652 non-null   object
 3   Featured Review  4652 non-null   object
dtypes: int64(1), object(3)
memory usage: 145.5+ KB


In [91]:
# Movie overview dataframe
TMDB_df = pd.read_csv('/home/arisbethlaguna/COMP_4447/final_project/OMDB API Data.csv')
TMDB_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13005 entries, 0 to 13004
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdbID                 13005 non-null  object 
 1   TT URL                 13005 non-null  object 
 2   Opening Revenue        13005 non-null  object 
 3   RL URL                 13005 non-null  object 
 4   Number of Theaters     13005 non-null  object 
 5   tmdbID                 13005 non-null  int64  
 6   url                    13005 non-null  object 
 7   budget                 12973 non-null  float64
 8   original_title         12973 non-null  object 
 9   title                  12973 non-null  object 
 10  overview               12972 non-null  object 
 11  production_company     13005 non-null  object 
 12  belongs_to_collection  13005 non-null  object 
 13  omdb title             12999 non-null  object 
 14  year                   12999 non-null  object 
 15  Ra

In [92]:
# Function to remove stopwords and apply lemmatization
def stopwords_lemmatize(text):
    if not pd.isna(text):
        stop_words = set(stopwords.words('english'))
        words = nltk.word_tokenize(text.lower())
        lemmatizer = WordNetLemmatizer()
        filtered_words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
        return ' '.join(filtered_words)
    return text  # Return NaN values as they are

# Apply the stopwords_lemmatize function to the appropriate column
reviews_df['review_NLG'] = reviews_df['Featured Review'].apply(stopwords_lemmatize)
TMDB_df['overview_NLG'] = TMDB_df['overview'].apply(stopwords_lemmatize)


In [93]:
# Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores
def sentiment_score(text):
    if isinstance(text, str): 
        sentiment = analyzer.polarity_scores(text)
        compound_score = sentiment['compound']
        return compound_score
    else:
        return None  # Return None for non-string values

# Apply the calculate_sentiment function to the 'NLG' column
reviews_df['review_Sentiment'] = reviews_df['review_NLG'].apply(sentiment_score)
TMDB_df['overview_Sentiment'] = TMDB_df['overview_NLG'].apply(sentiment_score)

In [94]:
TMDB_df.head(5)

Unnamed: 0,imdbID,TT URL,Opening Revenue,RL URL,Number of Theaters,tmdbID,url,budget,original_title,title,...,imdbRating,imdbVotes,Type,DVD,BoxOffice,Production,Website,Response,overview_NLG,overview_Sentiment
0,tt0338751,https://www.boxofficemojo.com/title/tt00338751/?ref_=bo_se_r_1,"$858,021",https://www.boxofficemojo.com/release/rl742753793/weekend?ref_=bo_tt_gr#table,40,2567,https://api.themoviedb.org/3/movie/2567?api_key=448fb8084e6c46cb8447cda0cd773e3a,110000000.0,The Aviator,The Aviator,...,7.5,376165,movie,19 Apr 2016,"$102,610,330",,,True,"biopic depicting life filmmaker aviation pioneer howard hughes 1927 1947 , time became successful film producer aviation magnate , simultaneously growing unstable due severe obsessive-compulsive disorder .",-0.3182
1,tt0293508,https://www.boxofficemojo.com/title/tt00293508/?ref_=bo_se_r_1,"$4,001,890",https://www.boxofficemojo.com/release/rl1214809601/weekend?ref_=bo_tt_gr#table,622,9833,https://api.themoviedb.org/3/movie/9833?api_key=448fb8084e6c46cb8447cda0cd773e3a,70000000.0,The Phantom of the Opera,The Phantom of the Opera,...,7.2,128266,movie,27 Dec 2011,"$51,293,931",,,True,"deformed since birth , bitter man known phantom life sewer underneath paris opera house . fall love obscure chorus singer christine , privately tutor terrorizing rest opera house demanding christine given lead role .",-0.5423
2,tt0363473,https://www.boxofficemojo.com/title/tt00363473/?ref_=bo_se_r_1,"$45,264",https://www.boxofficemojo.com/release/rl3326379521/weekend?ref_=bo_tt_gr#table,6,6478,https://api.themoviedb.org/3/movie/6478?api_key=448fb8084e6c46cb8447cda0cd773e3a,25000000.0,Beyond the Sea,Beyond the Sea,...,6.7,15400,movie,01 Oct 2016,"$6,318,709",,,True,"based life career legendary entertainer , bobby darin , biopic move back forth childhood adulthood , tell tale life .",0.3818
3,tt0361127,https://www.boxofficemojo.com/title/tt00361127/?ref_=bo_se_r_1,"$53,985",https://www.boxofficemojo.com/release/rl494568961/weekend?ref_=bo_tt_gr#table,6,9692,https://api.themoviedb.org/3/movie/9692?api_key=448fb8084e6c46cb8447cda0cd773e3a,0.0,The Woodsman,The Woodsman,...,7.2,34905,movie,14 Dec 2016,"$1,576,231",,,True,paedophile return hometown 12 year prison attempt start new life .,-0.5106
4,tt0385267,https://www.boxofficemojo.com/title/tt00385267/?ref_=bo_se_r_1,"$151,750",https://www.boxofficemojo.com/release/rl91588097/weekend?ref_=bo_tt_gr#table,3,1901,https://api.themoviedb.org/3/movie/1901?api_key=448fb8084e6c46cb8447cda0cd773e3a,0.0,In Good Company,In Good Company,...,6.5,56162,movie,07 May 2015,"$45,806,659",,,True,"dan foreman seasoned advertisement sale executive high-ranking publication corporate takeover result placed naive supervisor carter duryea , half age . matter made worse dan 's new supervisor becomes romantically involved daughter 18 year-old college student alex .",-0.3182


In [95]:
pd.set_option('display.max_colwidth', None)
TMDB_df['overview_Sentiment']

0       -0.3182
1       -0.5423
2        0.3818
3       -0.5106
4       -0.3182
          ...  
13000   -0.0258
13001    0.3400
13002    0.5719
13003    0.5994
13004    0.0000
Name: overview_Sentiment, Length: 13005, dtype: float64

In [96]:
reviews_df.head(5)

Unnamed: 0,tmdbid,Lookup ID,URL,Featured Review,review_NLG,review_Sentiment
0,9833,9833-the-phantom-of-the-opera,https://www.themoviedb.org/movie/9833-the-phantom-of-the-opera,SENSATIONAL,sensational,0.0
1,9692,9692-the-woodsman,https://www.themoviedb.org/movie/9692-the-woodsman,Great psychological study,great psychological study,0.6249
2,9953,9953-love-song-for-bobby-long-a,https://www.themoviedb.org/movie/9953-love-song-for-bobby-long-a,Literature-quoting lazy drama about a teen girl and two older boozers living in a hovel in New Orleans,literature-quoting lazy drama teen girl two older boozer living hovel new orleans,-0.3612
3,15045,15045-fat-albert,https://www.themoviedb.org/movie/15045-fat-albert,Hey hey hey!,hey hey hey !,0.0
4,1853,1853-in-the-realms-of-the-unreal,https://www.themoviedb.org/movie/1853-in-the-realms-of-the-unreal,"This is a 2004 documentary film by Jessica Yu introducing the life and work of Henry Darger. A major figure of ""outsider art"", Darger's work was only discovered after his death when his landlords found thousands of pages of text and paintings in his room. Through his long life, he was known to his few friends and associates only as a janitor in a local hospital. Secretly, he wrote a massive manuscript chronicling the rebellion of girl slaves in a fantasy world, painted with watercolors of the heroines and battles.","2004 documentary film jessica yu introducing life work henry darger . major figure `` outsider art '' , darger 's work discovered death landlord found thousand page text painting room . long life , known friend associate janitor local hospital . secretly , wrote massive manuscript chronicling rebellion girl slave fantasy world , painted watercolor heroine battle .",-0.0258


In [97]:
pd.set_option('display.max_colwidth', None)
reviews_df['review_Sentiment']

0       0.0000
1       0.6249
2      -0.3612
3       0.0000
4      -0.0258
         ...  
4647    0.3818
4648   -0.4404
4649   -0.8360
4650    0.0000
4651    0.7906
Name: review_Sentiment, Length: 4652, dtype: float64

In [102]:
# Inner merge to only show data that has both a review
inner_merge_df = pd.merge(TMDB_df, reviews_df[['tmdbid', 'Featured Review', 'review_NLG', 'review_Sentiment']], left_on='tmdbID', right_on='tmdbid', how='inner')

In [105]:
inner_merge_df.drop('tmdbid', axis=1, inplace=True)

In [106]:
inner_merge_df.sample(3)

Unnamed: 0,imdbID,TT URL,Opening Revenue,RL URL,Number of Theaters,tmdbID,url,budget,original_title,title,...,DVD,BoxOffice,Production,Website,Response,overview_NLG,overview_Sentiment,Featured Review,review_NLG,review_Sentiment
1318,tt1763264,https://www.boxofficemojo.com/title/tt01763264/?ref_=bo_se_r_1,"$47,800",https://www.boxofficemojo.com/release/rl2741470721/weekend?ref_=bo_tt_gr#table,53,123109,https://api.themoviedb.org/3/movie/123109?api_key=448fb8084e6c46cb8447cda0cd773e3a,2900000.0,No One Lives,No One Lives,...,09 Nov 2016,"$74,918",,,True,gang ruthless highway killer kidnap wealthy couple traveling cross country shockingly discover thing seem .,-0.5423,"A pleasant surprise from WWE Studios. NO ONE LIVES is a tight, nasty genre gem from Japanese schlockmeister Ryuhei Kitamura.","pleasant surprise wwe studio . one life tight , nasty genre gem japanese schlockmeister ryuhei kitamura .",0.2023
333,tt0442933,https://www.boxofficemojo.com/title/tt00442933/?ref_=bo_se_r_1,"$27,515,871",https://www.boxofficemojo.com/release/rl2789508609/weekend?ref_=bo_tt_gr#table,3153,2310,https://api.themoviedb.org/3/movie/2310?api_key=448fb8084e6c46cb8447cda0cd773e3a,70000000.0,Beowulf,Beowulf,...,01 Aug 2013,"$82,280,579",,,True,"6th-century scandinavian warrior named beowulf embarks mission slay man-like ogre , grendel .",0.0,"It was just too much...but to be fair I saw it in 3D. The none 3D version may have actually been better, but honestly I have no way of knowing.","much ... fair saw 3d . none 3d version may actually better , honestly way knowing .",0.802
1647,tt0093300,https://www.boxofficemojo.com/title/tt00093300/?ref_=bo_se_r_1,"$7,154,890",https://www.boxofficemojo.com/release/rl3613033985/weekend?ref_=bo_tt_gr#table,1606,580,https://api.themoviedb.org/3/movie/580?api_key=448fb8084e6c46cb8447cda0cd773e3a,23000000.0,Jaws: The Revenge,Jaws: The Revenge,...,11 Sep 2015,"$20,763,013",,,True,"another deadly shark attack , ellen brody decides enough new england 's amity island move caribbean join son , michael , family . great white shark followed , hungry life .",0.4939,"Michael Caine must have needed another new swimming pool, otherwise what could have possessed him to turn up for this terribly poor sequel. We all start off with ""Deputy Sean Brady"" (Mitchell Anderson) sent to clear up some seaborne blockage before the fishing boats return. Needless to say, he encounters the distant cousin of his late father's menacing pal and is soon little more than tooth pickings. This hastens the arrival of the pretty but extremely bland Lance Guest as older brother ""Michael"" and guess what, the shark seems to have him on his sonar, too. Despite the fairly charismatic effor...read the rest.","michael caine must needed another new swimming pool , otherwise could possessed turn terribly poor sequel . start `` deputy sean brady '' ( mitchell anderson ) sent clear seaborne blockage fishing boat return . needle say , encounter distant cousin late father 's menacing pal soon little tooth picking . hastens arrival pretty extremely bland lance guest older brother `` michael '' guess , shark seems sonar , . despite fairly charismatic effor ... read rest .",-0.2263


In [99]:
left_merge_df = pd.merge(TMDB_df, reviews_df[['tmdbid', 'Featured Review', 'review_NLG', 'review_Sentiment']], left_on='tmdbID', right_on='tmdbid', how='left')

In [None]:
left_merge_df.drop('tmdbid', axis=1, inplace=True)

In [109]:
left_merge_df.to_csv('OMDB API Data_NLP.csv', index=False)

In [None]:
inner_merge_df.to_csv('OMDB API Data_review_NLP.csv')