In [112]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
from imdbUtils import *
import builtins

pd.options.display.max_colwidth=500

In [113]:
# API call to select:
## feature films
## which are rated atleast 4.0
## having atleast 50,000 votes
## in the Thriller genre
## sorted by user rating
## limit to 250 movies
url = '''https://www.imdb.com/search/title/?title_type=feature&user_rating=4.0,10.0
&num_votes=50000,&genres=thriller&view=simple&sort=user_rating,desc&count=250'''

# get the soup object for main api url
movies_soup = getSoup(url)

In [114]:
# find all a-tags with class:None
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# filter the a-tags to get just the titles
movie_tags = [tag.attrs['href'] for tag in movie_tags 
              if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')]

In [115]:
print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 500 movie titles
Displaying 10 titles


['/title/tt0468569/',
 '/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt1375666/',
 '/title/tt6751668/',
 '/title/tt6751668/',
 '/title/tt0114369/',
 '/title/tt0114369/',
 '/title/tt0102926/',
 '/title/tt0102926/']

In [116]:
# remove duplicate links
movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]

There are a total of 250 movie titles
Displaying 10 titles


['/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt6751668/',
 '/title/tt0114369/',
 '/title/tt0102926/',
 '/title/tt0482571/',
 '/title/tt0407887/',
 '/title/tt0114814/',
 '/title/tt0110413/',
 '/title/tt0054215/']

In [117]:
# movie links
base_url = "https://www.imdb.com"
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt0468569/reviews',
 'https://www.imdb.com/title/tt1375666/reviews',
 'https://www.imdb.com/title/tt6751668/reviews',
 'https://www.imdb.com/title/tt0114369/reviews',
 'https://www.imdb.com/title/tt0102926/reviews',
 'https://www.imdb.com/title/tt0482571/reviews',
 'https://www.imdb.com/title/tt0407887/reviews',
 'https://www.imdb.com/title/tt0114814/reviews',
 'https://www.imdb.com/title/tt0110413/reviews',
 'https://www.imdb.com/title/tt0054215/reviews']

In [118]:
# get a list of soup objects
movie_soups = [getSoup(link) for link in movie_links]

# get all 500 movie review links
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]

movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

500
There are a total of 500 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw6513945/',
 'https://www.imdb.com/review/rw6457886/',
 'https://www.imdb.com/review/rw2285156/',
 'https://www.imdb.com/review/rw4692192/',
 'https://www.imdb.com/review/rw5512155/',
 'https://www.imdb.com/review/rw5195256/',
 'https://www.imdb.com/review/rw0960802/',
 'https://www.imdb.com/review/rw0370669/',
 'https://www.imdb.com/review/rw3476006/',
 'https://www.imdb.com/review/rw1198894/']

In [108]:
movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

19000
There are a total of 19000 individual movie reviews
Displaying 10 reviews


['h', 't', 't', 'p', 's', ':', '/', '/', 'w', 'w']

In [120]:
print(str(len(movie_review_list)))

500


In [121]:
# get review text from the review link
review_texts = [getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_review_list]

# label each review with negative or positive
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list)//2))

# construct a dataframe
df = pd.DataFrame({'movie': movie_titles, 'user_review': review_texts, 'sentiment': review_sentiment})

In [122]:
df.head()

Unnamed: 0,movie,user_review,sentiment
0,The Dark Knight,"If someone else acted as Joker, I would give the movie 7-8 stars. The majority of people ended up loving the villain more than the hero, and that rarely happends in movies.Rest in peace Heath Ledger.",negative
1,The Dark Knight,"Totally one of the greatest movie titles ever made. Everything was great, filming, acting, story. Nothing to complain about",positive
2,Inception,"The central idea of ""Inception"" is an interesting one: technology exists to enter other people's dreams in order to steal their most private secrets or to implant new ideas. With Christopher Nolan of ""Memento"" fame as writer and director, this should have been a smart, compelling movie. Unfortunately, ""Inception"" is a bloated failure.The root problem is that Nolan replaces the fascinating surrealism of dream worlds with lengthy outtakes from James Bond movies. For example, early in the film,...",negative
3,Inception,"My 3rd time watching this movie! Yet, it still stunned my mind, kept me enjoyed its every moment and left me with many thoughts afterward.For someone like me, who've rarely slept without dream, it's so exciting watching how Christopher Nolan had illustrated every single characteristic of dream on the big screen. As it's been done so sophisticatedly, I do believe the rumour that Nolan had spent 10 years to finish the script of Inception. In my opinion, it's been so far the greatest achievemen...",positive
4,Parasite,"There are a few suspenseful scenes and a decent plot, along with comedic dialogue, but that's it. It's not a ""metaphorical masterpiece"" that some critics and users are saying. I feel as though too many people are trying to be expert critics like on that episode of South Park where the entire town writes crazy yelp reviews for all the restaurants.",negative


In [124]:
# save the dataframe to a csv file.
df.to_csv('movieReviews_IMDB.csv', index=False)