In [2]:
#!pip3 install pymongo
import csv 
import json, requests
import pandas as pd
import time
import numpy as np
import psycopg2, os

## Populate New York Times Movie Reviews

In [3]:
from pymongo import MongoClient
# Connect to the local MongoDB server; MongoClient("localhost:27017") is an equivalent form.
client = MongoClient(host='localhost', port=27017)
# Database that holds every collection written by this notebook.
db = client['apan5400']

In [4]:
# Load the New York Times API key from a local text file and prepare the two
# URL fragments that the review-fetching cell concatenates around each title.
# The context manager closes the file as soon as the key has been read.
with open('nyt_api_key.txt') as key_file:
    # Strip surrounding whitespace: a trailing newline left by a text editor
    # would otherwise be appended to the request URL.
    nyt_key = key_file.read().strip()
base_url = 'https://api.nytimes.com/svc/movies/v2/reviews/search.json?query='
api_key = '&api-key=' + nyt_key

In [5]:
# Load the IMDB reviews; the `movie` column holds the title each review belongs to.
imdb = pd.read_csv('movieReviews_IMDB.csv')
detail_title = imdb['movie']

# Unique titles in first-seen order (dict keys preserve insertion order).
titles = list(dict.fromkeys(detail_title))

In [6]:
# Collection that stores the raw New York Times review search responses.
collection = db['moviereview_nyt']

In [7]:
# Query the New York Times review search API once per IMDB title and store
# each successful JSON response as one document in `collection`.
from urllib.parse import quote_plus

for title in titles:
    # Percent-encode the title: characters such as '&', '#' or '+' would
    # otherwise be read as URL syntax and change or truncate the query.
    url = base_url + quote_plus(title) + api_key
    # A timeout keeps one stalled connection from hanging the whole loop.
    resp = requests.get(url, timeout=30)
    if resp.ok:
        collection.insert_one(resp.json())
    else:
        # Report failed lookups instead of storing the error payload as if
        # it were a review result.
        print("Skipping", repr(title), "- HTTP status", resp.status_code)
    time.sleep(3)  # pause between calls to reduce error responses

In [8]:
# Flatten the stored NYT responses into parallel lists: one entry per review.
detail_title, detail_review = [], []

for doc in collection.find():
    results = doc.get("results")
    if results is None:
        # Response without a usable "results" list; nothing to extract.
        continue
    for result in results:
        detail_title.append(result["display_title"])
        detail_review.append(result["summary_short"])

# Every row extracted here comes from the New York Times.
detail_source = ["NYT"] * len(detail_title)

df = pd.DataFrame(
    {
        'movie': detail_title,
        'user_review': detail_review,
        'source': detail_source,
    }
)
print(df)

                                      movie  \
0                     The Dark Knight Rises   
1                           The Dark Knight   
2                                 Inception   
3                                  Parasite   
4                                     Seven   
..                                      ...   
884                             The Wailing   
885                       Nocturnal Animals   
886  The Conjuring: The Devil Made Me Do It   
887                         The Conjuring 2   
888                           The Conjuring   

                                           user_review source  
0    Christopher Nolan’s “Dark Knight Rises” wraps ...    NYT  
1    Pitched at the divide between art and industry...    NYT  
2    In Christopher Nolan’s “Inception” a specialis...    NYT  
3    In Bong Joon Ho’s new film, a destitute family...    NYT  
4     Murders based on seven deadly sins. Deadly dull.    NYT  
..                                                 

In [9]:
# Mark empty review strings as missing so that dropna() can remove them.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the selected column is a chained in-place update, which does not modify
# `df` when pandas Copy-on-Write is active.
df['user_review'] = df['user_review'].replace('', np.nan)
print(df)

                               movie  \
0              The Dark Knight Rises   
1                    The Dark Knight   
2                          Inception   
3                           Parasite   
4                              Seven   
..                               ...   
456                      The Wailing   
457                Nocturnal Animals   
458             John Wick: Chapter 2   
459                   Predestination   
460  The Hunger Games: Catching Fire   

                                           user_review source  
0    Christopher Nolan’s “Dark Knight Rises” wraps ...    NYT  
1    Pitched at the divide between art and industry...    NYT  
2    In Christopher Nolan’s “Inception” a specialis...    NYT  
3    In Bong Joon Ho’s new film, a destitute family...    NYT  
4     Murders based on seven deadly sins. Deadly dull.    NYT  
..                                                 ...    ...  
456  Na Hong-jin’s movie about demonic possession i...    NYT  
457  Th

In [10]:
# Keep only the rows that actually carry review text.
df = df.dropna(subset=['user_review'])
print(df)

                               movie  \
0              The Dark Knight Rises   
1                    The Dark Knight   
2                          Inception   
3                           Parasite   
4                              Seven   
..                               ...   
456                      The Wailing   
457                Nocturnal Animals   
458             John Wick: Chapter 2   
459                   Predestination   
460  The Hunger Games: Catching Fire   

                                           user_review source  
0    Christopher Nolan’s “Dark Knight Rises” wraps ...    NYT  
1    Pitched at the divide between art and industry...    NYT  
2    In Christopher Nolan’s “Inception” a specialis...    NYT  
3    In Bong Joon Ho’s new film, a destitute family...    NYT  
4     Murders based on seven deadly sins. Deadly dull.    NYT  
..                                                 ...    ...  
456  Na Hong-jin’s movie about demonic possession i...    NYT  
457  Th

In [11]:
# Persist the cleaned NYT reviews as CSV, leaving out the row index.
df.to_csv(path_or_buf='movieReviews_NYT.csv', index=False)

## Sentiment Analysis (Positive/Negative Reviews)
Reference: https://hub.packtpub.com/how-to-perform-sentiment-analysis-using-python-tutorial/

In [14]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [15]:
def extract_features(word_list):
    """Return a feature dict that maps every word in `word_list` to True.

    Repeated words collapse into a single key; keys keep first-seen order.
    """
    return dict.fromkeys(word_list, True)

In [16]:
# File ids of the positive and negative reviews in the NLTK movie_reviews corpus.
# Defined unconditionally: the following cells use both names without any guard,
# so hiding them behind `if __name__ == '__main__'` only risks a NameError.
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')

In [17]:
# Build one (feature dict, label) pair per corpus file, keeping the two
# classes in separate lists so they can be split independently later.
features_positive = []
for fileid in positive_fileids:
    words = movie_reviews.words(fileids=[fileid])
    features_positive.append((extract_features(words), 'Positive'))

features_negative = []
for fileid in negative_fileids:
    words = movie_reviews.words(fileids=[fileid])
    features_negative.append((extract_features(words), 'Negative'))

In [18]:
# Train/test split point per class: the first 80% of each list is for training.
threshold_factor = 0.8
threshold_positive, threshold_negative = (
    int(threshold_factor * len(features))
    for features in (features_positive, features_negative)
)

In [19]:
# Training set: everything before each class threshold; test set: the rest.
features_train = [
    *features_positive[:threshold_positive],
    *features_negative[:threshold_negative],
]
features_test = [
    *features_positive[threshold_positive:],
    *features_negative[threshold_negative:],
]
print("\nNumber of training datapoints:", len(features_train))
print("Number of test datapoints:", len(features_test))


Number of training datapoints: 1600
Number of test datapoints: 400


In [20]:
# Fit a Naive Bayes classifier on the training pairs, then report its
# accuracy on the held-out test pairs.
classifier = NaiveBayesClassifier.train(features_train)
test_accuracy = nltk.classify.util.accuracy(classifier, features_test)
print("\nAccuracy of the classifier:", test_accuracy)


Accuracy of the classifier: 0.735


In [21]:
# Predict a lower-cased sentiment label for every NYT review: split the text
# on whitespace, turn the tokens into features, and keep the most probable
# label of the classifier's probability distribution.
sentiment = [
    classifier.prob_classify(extract_features(review.split())).max().lower()
    for review in df.user_review
]

In [22]:
# Attach the predicted labels as a new `sentiment` column.
df = df.assign(sentiment=sentiment)
print(df)

                               movie  \
0              The Dark Knight Rises   
1                    The Dark Knight   
2                          Inception   
3                           Parasite   
4                              Seven   
..                               ...   
456                      The Wailing   
457                Nocturnal Animals   
458             John Wick: Chapter 2   
459                   Predestination   
460  The Hunger Games: Catching Fire   

                                           user_review source sentiment  
0    Christopher Nolan’s “Dark Knight Rises” wraps ...    NYT  positive  
1    Pitched at the divide between art and industry...    NYT  positive  
2    In Christopher Nolan’s “Inception” a specialis...    NYT  positive  
3    In Bong Joon Ho’s new film, a destitute family...    NYT  negative  
4     Murders based on seven deadly sins. Deadly dull.    NYT  positive  
..                                                 ...    ...       ...  
4

## Combine IMDB + NYT Movie Reviews

In [23]:
# Predict a lower-cased sentiment label for every IMDB review with the same
# whitespace tokenisation used for the NYT reviews.
sentiment = [
    classifier.prob_classify(extract_features(review.split())).max().lower()
    for review in imdb.user_review
]
# One 'IMDB' source tag per classified review.
source = ['IMDB'] * len(sentiment)

imdb['sentiment'] = sentiment
imdb['source'] = source

In [24]:
# Stack the IMDB reviews underneath the NYT reviews, keeping each frame's
# original index and column order. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
df = pd.concat([df, imdb], sort=False)
print(df)

                               movie  \
0              The Dark Knight Rises   
1                    The Dark Knight   
2                          Inception   
3                           Parasite   
4                              Seven   
..                               ...   
495  The Hunger Games: Catching Fire   
496           Jo Nesbø's Headhunters   
497           Jo Nesbø's Headhunters   
498                    The Conjuring   
499                    The Conjuring   

                                           user_review source sentiment  
0    Christopher Nolan’s “Dark Knight Rises” wraps ...    NYT  positive  
1    Pitched at the divide between art and industry...    NYT  positive  
2    In Christopher Nolan’s “Inception” a specialis...    NYT  positive  
3    In Bong Joon Ho’s new film, a destitute family...    NYT  negative  
4     Murders based on seven deadly sins. Deadly dull.    NYT  positive  
..                                                 ...    ...       ...  
4

## Adding sentiment scores

In [25]:
# Load the per-review negative-sentiment scores, using the first CSV column
# as the index. The file is read once; an earlier read without index_col was
# overwritten immediately and only repeated the I/O.
movies = pd.read_csv('negativeSentimentOfMovies.csv', index_col=0)

In [26]:
# Display the frame loaded from negativeSentimentOfMovies.csv for inspection.
movies

Unnamed: 0_level_0,movie,user_review,sentiment,negative sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,The Dark Knight,"If someone else acted as Joker, I would give t...",negative,0.138268
1,The Dark Knight,Totally one of the greatest movie titles ever ...,positive,0.003493
2,Inception,"The central idea of ""Inception"" is an interest...",negative,0.392330
3,Inception,"My 3rd time watching this movie! Yet, it still...",positive,-1.000000
4,Parasite,There are a few suspenseful scenes and a decen...,negative,0.560443
...,...,...,...,...
495,Jo Nesbø's Headhunters,Headhunters is FANTASTIC. If you hate subtitle...,positive,0.062805
496,The Conjuring,Adequate and decent horror picture in which a ...,negative,-1.000000
497,The Conjuring,I'm an avid horror fan. Lately I've been think...,positive,0.077141
498,Disconnect,"I found this film, directed by Henry-Alex Rubi...",negative,0.030692


In [27]:
# Left-join the negative-sentiment scores onto the combined reviews, matching
# rows on movie title and predicted sentiment; unmatched reviews get NaN.
df = df.merge(
    movies[['movie', 'sentiment', 'negative sentiment']],
    on=['movie', 'sentiment'],
    how='left',
)

In [28]:
# Display the merged reviews, now including the `negative sentiment` column.
df

Unnamed: 0,movie,user_review,source,sentiment,negative sentiment
0,The Dark Knight Rises,Christopher Nolan’s “Dark Knight Rises” wraps ...,NYT,positive,
1,The Dark Knight,Pitched at the divide between art and industry...,NYT,positive,0.003493
2,Inception,In Christopher Nolan’s “Inception” a specialis...,NYT,positive,-1.000000
3,Parasite,"In Bong Joon Ho’s new film, a destitute family...",NYT,negative,0.560443
4,Seven,Murders based on seven deadly sins. Deadly dull.,NYT,positive,
...,...,...,...,...,...
927,The Hunger Games: Catching Fire,"Ever since I first saw it in theaters, Catchin...",IMDB,positive,0.006867
928,Jo Nesbø's Headhunters,"The great German word fremdschämen means ""seco...",IMDB,negative,0.588086
929,Jo Nesbø's Headhunters,Headhunters is FANTASTIC. If you hate subtitle...,IMDB,positive,0.062805
930,The Conjuring,Adequate and decent horror picture in which a ...,IMDB,positive,0.077141


In [29]:
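# Be careful: the commented-out call below removes documents from `col`; only uncomment it when a reset is intended.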
#col.remove()

## Store to MongoDB

In [30]:
# Target collection for the combined NYT + IMDB reviews.
col = db['movie_reviews']

# One dict per DataFrame row, inserted in a single batch.
data_dict = df.to_dict(orient='records')
col.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x7fa74128d940>

In [31]:
# To erase all data from a collection (if needed), uncomment:
# collection.delete_many({})

# Number of documents currently stored in `col`.
document_total = col.count_documents({})
print(document_total)

932


In [31]:
# Read the TMDB movie data from tmdb_dt.csv.
tmdb = pd.read_csv('tmdb_dt.csv')
print(tmdb)
movie_title = tmdb['title']
# Keep only the first row for each distinct title.
tmdb = tmdb.drop_duplicates(subset='title')
print(tmdb)

                                        genres      id  \
0           ['action', 'adventure', 'fantasy']   19995   
1           ['adventure', 'fantasy', 'action']     285   
2             ['action', 'adventure', 'crime']  206647   
3                 ['action', 'crime', 'drama']   49026   
4    ['action', 'adventure', 'sciencefiction']   49529   
..                                         ...     ...   
495           ['adventure', 'comedy', 'drama']    9447   
496         ['fantasy', 'action', 'adventure']  274854   
497   ['thriller', 'action', 'sciencefiction']    8870   
498      ['adventure', 'fantasy', 'animation']    9992   
499                  ['documentary', 'family']   36970   

                                              keywords original_language  \
0               ['cultureclash', 'future', 'spacewar']                en   
1               ['ocean', 'drugabuse', 'exoticisland']                en   
2               ['spy', 'basedonnovel', 'secretagent']                en   

## Store to MongoDB for Movie Recommendation

In [32]:
# Collection that holds the TMDB rows used for movie recommendation.
collection = db['tmdb']

In [33]:
# One dict per de-duplicated TMDB row.
data_dict_tmdb = tmdb.to_dict(orient='records')

# Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
# insert_many() is the supported bulk insert. `.inserted_ids` keeps the cell
# output as the list of new ObjectIds that insert() used to return.
db.tmdb.insert_many(data_dict_tmdb).inserted_ids

  db.tmdb.insert(data_dict_tmdb)


[ObjectId('610b1ca2711735c4ba85c71c'),
 ObjectId('610b1ca2711735c4ba85c71d'),
 ObjectId('610b1ca2711735c4ba85c71e'),
 ObjectId('610b1ca2711735c4ba85c71f'),
 ObjectId('610b1ca2711735c4ba85c720'),
 ObjectId('610b1ca2711735c4ba85c721'),
 ObjectId('610b1ca2711735c4ba85c722'),
 ObjectId('610b1ca2711735c4ba85c723'),
 ObjectId('610b1ca2711735c4ba85c724'),
 ObjectId('610b1ca2711735c4ba85c725'),
 ObjectId('610b1ca2711735c4ba85c726'),
 ObjectId('610b1ca2711735c4ba85c727'),
 ObjectId('610b1ca2711735c4ba85c728'),
 ObjectId('610b1ca2711735c4ba85c729'),
 ObjectId('610b1ca2711735c4ba85c72a'),
 ObjectId('610b1ca2711735c4ba85c72b'),
 ObjectId('610b1ca2711735c4ba85c72c'),
 ObjectId('610b1ca2711735c4ba85c72d'),
 ObjectId('610b1ca2711735c4ba85c72e'),
 ObjectId('610b1ca2711735c4ba85c72f'),
 ObjectId('610b1ca2711735c4ba85c730'),
 ObjectId('610b1ca2711735c4ba85c731'),
 ObjectId('610b1ca2711735c4ba85c732'),
 ObjectId('610b1ca2711735c4ba85c733'),
 ObjectId('610b1ca2711735c4ba85c734'),
 ObjectId('610b1ca2711735