# IMDB Review Scraper

This is an interactive scraper. It is robust but not 100% automated; however, it can easily be converted into a script.

I used XPath expressions to locate elements on the website.

In [1]:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
## Set Firefox's `intl.accept_languages` preference to US English so that pages
## are requested in English. It doesn't change Firefox's own visual language.

options = webdriver.FirefoxOptions()
options.set_preference('intl.accept_languages', 'en-US')

## Start the browser session used by every later cell.
## Firefox is used in this project, but this can easily be changed to another browser.
## NOTE(review): presumably geckodriver.exe is expected in the working directory
## (or on PATH) — verify. `executable_path` is deprecated in Selenium 4 in favour
## of a `Service` object — confirm against the installed Selenium version.
driver = webdriver.Firefox(executable_path="geckodriver.exe",options=options)

In [3]:
## Links to the IMDB "User Reviews" section of every movie to scrape.

review_links = [
    "https://www.imdb.com/title/tt4633694/reviews?ref_=tt_ov_rt",
    "https://www.imdb.com/title/tt0145487/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt0316654/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt0413300/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt0948470/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt1872181/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt2250912/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt6320628/reviews/?ref_=tt_ql_urv",
    "https://www.imdb.com/title/tt10872600/reviews/?ref_=tt_ql_urv",
]

## Control measure: the scraped movie names and the total review count shown
## by IMDB are collected here, so they can be compared with the scraped data
## at the end of the process.

review_counts = dict(Movie=[], Counter=[])

## Holds the review data: one independent, initially empty list per column.

review_set = {
    column: []
    for column in (
        "Rating",
        "Title",
        "Date",
        "Helpful_Vote",
        "Total_Vote",
        "Review",
        "Movie",
    )
}

## Visit every User Reviews page in review_links and scrape all of its reviews.
##
## Elements are located with find_element(s)(By.<strategy>, ...); the older
## find_element(s)_by_* helpers are deprecated and no longer provided by newer
## Selenium releases.

for link in review_links:
    driver.get(link)

    ## Once the page has loaded, keep clicking the "Load More" button until
    ## every review is on the page. A stable and fast internet connection helps.
    ##
    ## Each pass waits up to 10 seconds for the button to appear. On a timeout
    ## the page is checked for the "loaded all" marker: if exactly one is
    ## found, loading is complete; otherwise the wait is retried.
    ##
    ## As a safeguard against an infinite loop on a bad connection,
    ## times_continued counts these timeouts and the loop gives up on the 7th.

    times_continued = 0
    page = True
    while page:
        try:
            load_more = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded']/button[@id='load-more-trigger']")))
            load_more.click()
            ## Short pause before looking for the button again.
            time.sleep(1)

        except selenium.common.exceptions.TimeoutException:
            loaded_all = driver.find_elements(By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded-all']")

            if len(loaded_all) == 1:
                page = False
            else:
                times_continued += 1
                ## Retry while fewer than 7 timeouts have occurred; stop otherwise.
                page = times_continued < 7


    ## Find the spoiler / long-review expanders. Reviews that are neither
    ## spoilers nor long still carry an expander element, and clicking those
    ## can fail, which is why the click below is guarded by an exception handler.

    expanders = driver.find_elements(By.XPATH, "//div[@class='ipl-expander ']")

    ## Expand all the reviews; expanders that cannot be interacted with are skipped.

    for button in expanders:
        try:
            button.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            continue


    ## Ratings -- some reviews have no rating, so a missing ratings bar is
    ## recorded as "NA" to keep this list aligned with the other columns.

    item_contents = driver.find_elements(By.XPATH, "//div[@class='lister-item-content']")

    for item in item_contents:
        try:
            review_set["Rating"].append(item.find_element(By.CLASS_NAME, "ipl-ratings-bar").text)
        except selenium.common.exceptions.NoSuchElementException:
            review_set["Rating"].append("NA")


    ## Review Titles

    review_set["Title"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='lister-item-content']/a[@class='title']"))

    ## Review Dates

    review_set["Date"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='display-name-date']/span[@class='review-date']"))

    ## Helpfulness and Movie Name

    movie_name = driver.find_element(By.XPATH, "//h3[@itemprop='name']/a").text

    ## The element text is split on spaces; word 0 is stored as the helpful-vote
    ## count and word 3 as the total-vote count.
    ## NOTE(review): assumes the text starts with "<helpful> out of <total> ..." — confirm.
    ## The movie name is appended once per review so the columns stay aligned.

    for i in driver.find_elements(By.XPATH, "//div[@class='actions text-muted']"):
        temp_list = i.text.split(" ")
        review_set["Helpful_Vote"].append(temp_list[0])
        review_set["Total_Vote"].append(temp_list[3])
        review_set["Movie"].append(movie_name)

    ## Reviews

    review_set["Review"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='text show-more__control']"))

    ## Review Total -- the review count text shown in the page header.

    review_total = driver.find_element(By.XPATH, "//div[@class='header']/div/span").text

    ## Record the movie name and IMDB's own total in the control dictionary.

    review_counts["Movie"].append(movie_name)
    review_counts["Counter"].append(review_total)


In [4]:
## Second control measure:
## every column list in the dictionary must have the same length.
## If the printed lengths differ, something went wrong during scraping.

for column_values in review_set.values():
    print(len(column_values))

21228
21228
21228
21228
21228
21228
21228


In [5]:
## After confirming that every list has the same length,
## convert the dictionary into a pandas DataFrame (one column per key)
## and display it as the cell output.

movie_reviews = pd.DataFrame(review_set)
movie_reviews

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie
0,10/10,Fantastic...but possibly overwhelming.,26 March 2019,54,71,"""Spider-Man: Into the Spider-Verse"" is a fanta...",Spider-Man: Into the Spider-Verse
1,8/10,How on God's green Earth did this work so well???,21 December 2018,198,279,The worlds of superhero movies and superhero c...,Spider-Man: Into the Spider-Verse
2,10/10,Stan Lee Is Smiling Right Now,17 December 2018,773,947,A movie worthy of Stan Lee's approval. Incredi...,Spider-Man: Into the Spider-Verse
3,10/10,So Much More Than I Expected!,19 November 2019,38,50,"So many have commented, so I will be brief. Fr...",Spider-Man: Into the Spider-Verse
4,10/10,Game Changer,21 December 2018,373,517,Have you ever sat through a film and you knew ...,Spider-Man: Into the Spider-Verse
...,...,...,...,...,...,...,...
21223,10/10,Marvel did the best spider ever,23 December 2021,0,7,Love what Marvel/Sony just did.\n\nVery good i...,Spider-Man: No Way Home
21224,10/10,spider man no way home,17 December 2021,0,4,This film is amazing! The best Spider-Man film...,Spider-Man: No Way Home
21225,10/10,Spiderman spiderman,14 January 2022,0,2,"One of the Marvel's masterpiece, Super Excelle...",Spider-Man: No Way Home
21226,9/10,Amazing!,19 December 2021,0,0,The originals have always been my fav Spiderma...,Spider-Man: No Way Home


In [6]:
## Print each movie name next to the total review count that IMDB reported.
## These totals are then compared with the DataFrame's value_counts output.

for movie, total in zip(review_counts["Movie"], review_counts["Counter"]):
    print(f"{movie}  : {total}")

Spider-Man: Into the Spider-Verse  : 2,148 Reviews
Spider-Man  : 2,352 Reviews
Spider-Man 2  : 1,645 Reviews
Spider-Man 3  : 2,258 Reviews
The Amazing Spider-Man  : 1,525 Reviews
The Amazing Spider-Man 2  : 1,349 Reviews
Spider-Man: Homecoming  : 1,570 Reviews
Spider-Man: Far from Home  : 2,333 Reviews
Spider-Man: No Way Home  : 6,067 Reviews


In [7]:
## Count the scraped reviews per movie so they can be compared with IMDB's totals.
## There can be small differences between the counters,
## so don't panic if your data frame has 1-5 fewer reviews than expected.
## My guess is that some reviews are hidden.

movie_reviews.value_counts("Movie")

Movie
Spider-Man: No Way Home              6065
Spider-Man                           2345
Spider-Man: Far from Home            2333
Spider-Man 3                         2257
Spider-Man: Into the Spider-Verse    2146
Spider-Man 2                         1644
Spider-Man: Homecoming               1567
The Amazing Spider-Man               1524
The Amazing Spider-Man 2             1347
dtype: int64

In [8]:
## Checking the whole data frame (column dtypes and non-null counts)
## to see if something is wrong.

movie_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21228 entries, 0 to 21227
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        21228 non-null  object
 1   Title         21228 non-null  object
 2   Date          21228 non-null  object
 3   Helpful_Vote  21228 non-null  object
 4   Total_Vote    21228 non-null  object
 5   Review        21228 non-null  object
 6   Movie         21228 non-null  object
dtypes: object(7)
memory usage: 1.1+ MB


In [9]:
## Thousand separators can be confusing and can change depending on your locale.
## Show the rows whose Total_Vote text is longer than 4 characters, i.e. the
## longest values, to see what kind of separator (if any) is in use.

movie_reviews.loc[movie_reviews["Total_Vote"].str.len().gt(4)]

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie
6163,3/10,A Let Down.,4 May 2007,789,1374,"There are some things that work really well, l...",Spider-Man 3
6185,5/10,"I'm so sorry, Spidey...",6 May 2007,1018,1489,As I was walking down the stairs and out of th...,Spider-Man 3
9956,5/10,"Too much, too unbalanced and a waste of talent",25 April 2014,643,1037,I can't say I went in to the theater with high...,The Amazing Spider-Man 2
11560,5/10,Spit in the face of spider man fans,6 July 2017,782,1330,Spoilers and review coming from a Spider-Man f...,Spider-Man: Homecoming
13584,1/10,Great movie if you're 8 yrs old,13 September 2019,715,1124,"We're a long way from Endgame, and it shows. T...",Spider-Man: Far from Home
15163,,Loved every second,15 December 2021,1481,1943,This Spiderman is really fantastic. It captiva...,Spider-Man: No Way Home
15165,10/10,Phenomenal conclusion,15 December 2021,942,1370,Spider-Man: No Way Home is a phenomenal conclu...,Spider-Man: No Way Home
15167,10/10,Somehow better than Endgame,18 December 2021,1381,2167,It's hard to discuss this movie and not get in...,Spider-Man: No Way Home
15169,9/10,Just Awesome !,16 December 2021,615,1003,It was very enjoyable to watch in the cinema. ...,Spider-Man: No Way Home


In [10]:
## Just to be safe, strip both possible thousand separators (dots and commas)
## from the Total_Vote variable.
## regex=False makes pandas treat "." as a literal dot rather than a regex
## pattern, and avoids the FutureWarning about the changing default.

movie_reviews["Total_Vote"] = (
    movie_reviews["Total_Vote"]
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Total_Vote"] = movie_reviews.Total_Vote.str.replace(".", "")


In [11]:
## Strip both possible thousand separators (dots and commas)
## from the Helpful_Vote variable.
## regex=False makes pandas treat "." as a literal dot rather than a regex
## pattern, and avoids the FutureWarning about the changing default.

movie_reviews["Helpful_Vote"] = (
    movie_reviews["Helpful_Vote"]
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Helpful_Vote"] = movie_reviews.Helpful_Vote.str.replace(".", "")


In [12]:
## As a control measure, convert the Helpful_Vote and Total_Vote variables
## to integers. If this raises an exception, something went wrong.
## Note: the converted frame is only passed to .info(); it is not assigned
## back, so movie_reviews itself is left unchanged by this cell.

movie_reviews.astype({"Helpful_Vote" : "int64", "Total_Vote" : "int64"}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21228 entries, 0 to 21227
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        21228 non-null  object
 1   Title         21228 non-null  object
 2   Date          21228 non-null  object
 3   Helpful_Vote  21228 non-null  int64 
 4   Total_Vote    21228 non-null  int64 
 5   Review        21228 non-null  object
 6   Movie         21228 non-null  object
dtypes: int64(2), object(5)
memory usage: 1.1+ MB


I also kept the `Rating` variable as it is. I am only cleaning things that can become a real problem when the data is read back, such as thousand separators.

Other variables are kept as objects because they are text data. 

Lastly, I save the data as a CSV file and re-read it to check whether there are any problems.

In [13]:
## Write the reviews to a CSV file, leaving out the DataFrame index.
movie_reviews.to_csv("file-name.csv", index=False)

In [14]:
## Re-read the saved file and inspect its dtypes and non-null counts.
## It is natural to have missing values in the Rating column because
## IMDB lets reviewers post reviews without a rating.
## NOTE(review): the scraper stores "NA" for those reviews, which read_csv
## presumably parses as a missing value by default — verify.

pd.read_csv("file-name.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        2170 non-null   object
 1   Title         2229 non-null   object
 2   Date          2229 non-null   object
 3   Helpful_Vote  2229 non-null   int64 
 4   Total_Vote    2229 non-null   int64 
 5   Review        2229 non-null   object
 6   Movie         2229 non-null   object
dtypes: int64(2), object(5)
memory usage: 122.0+ KB


In [13]:
## Shut down the Selenium WebDriver to free its resources.
## quit() closes every browser window and ends the session, so a separate
## close() call beforehand is not needed.

driver.quit()