# IMDB Review Scraper

This is an interactive scraper. It is robust but not 100% automated; however, it can easily be converted into a script.

I used XPATHs in order to find elements on the website.

In [1]:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
## Ask Firefox to request pages in US English (the `intl.accept_languages` preference).
## This only affects the language of the loaded pages; it does not change Firefox's own UI language.

options = webdriver.FirefoxOptions()
options.set_preference('intl.accept_languages', 'en-US')

## I am using Firefox in this project, but this can easily be changed to another browser.
## "geckodriver.exe" is a relative path, so the driver binary is looked up from the working directory.
## NOTE(review): newer Selenium releases reportedly drop `executable_path` in favour of a
## Service object -- confirm against the installed Selenium version.
driver = webdriver.Firefox(executable_path="geckodriver.exe",options=options)

In [3]:
## IMDB "User Reviews" page links go into this list (one URL per movie).

review_links = [
    "https://www.imdb.com/title/tt1615147/reviews?ref_=tt_ov_rt",
]

## Control-measure dictionary: the scraped movie names and the total review
## count shown by IMDB go here, so they can be compared with what was
## actually scraped at the end of the process.

review_counts = {"Movie" : [], "Counter" : []}

## The dictionary that holds the review data: one list per column,
## one entry per review.

review_set = {
    "Rating" : [],
    "Title" : [],
    "Date" : [],
    "Helpful_Vote" : [],
    "Total_Vote" : [],
    "Review" : [],
    "Movie" : [],
}

## NOTE: every element lookup below uses find_element(s)(By.<strategy>, value).
## The find_element(s)_by_* shortcuts are not available in recent Selenium 4
## releases, while the By-based form works on both Selenium 3 and 4.

for link in review_links:
    driver.get(link)

    ## Once the User Reviews page is open, keep clicking "load more" until
    ## every review is on the page.
    ##
    ## Be sure to have a stable and fast internet connection.
    ##
    ## Each pass waits up to 10 seconds for the "load more" button. When the
    ## wait times out we check for the "loaded-all" marker: if it is there,
    ## everything is loaded. If it is not, the timeout is counted and the loop
    ## gives up on the 7th timeout, as a safeguard against an infinite loop
    ## on a bad connection.

    times_continued = 0
    page = True
    while page:
        try:
            load_more = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded']/button[@id='load-more-trigger']")))
            load_more.click()
            # Short pause between clicks before waiting for the button again.
            time.sleep(1)

        except selenium.common.exceptions.TimeoutException:
            loaded_all = driver.find_elements(By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded-all']")

            if len(loaded_all) == 1:
                # The "loaded-all" marker is present: nothing left to load.
                page = False
            else:
                times_continued += 1
                if times_continued >= 7:
                    # Too many timeouts without the marker: stop waiting.
                    page = False


    ## Spoiler / long-review expanders. Even reviews that are neither spoilers
    ## nor long have an expander element, and those cannot be clicked, which
    ## is why the "not interactable" exception is skipped here.

    expanders = driver.find_elements(By.XPATH, "//div[@class='ipl-expander ']")

    ## Expanding all the comments

    for button in expanders:
        try:
            button.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            continue


    ## Ratings -- some reviews don't have a rating; those get the "NA" placeholder.

    item_contents = driver.find_elements(By.XPATH, "//div[@class='lister-item-content']")

    for item in item_contents:
        try:
            review_set["Rating"].append(item.find_element(By.CLASS_NAME, "ipl-ratings-bar").text)
        except selenium.common.exceptions.NoSuchElementException:
            review_set["Rating"].append("NA")


    ## Review Titles

    review_set["Title"].extend(
        title.text
        for title in driver.find_elements(By.XPATH, "//div[@class='lister-item-content']/a[@class='title']")
    )

    ## Review Dates

    review_set["Date"].extend(
        date.text
        for date in driver.find_elements(By.XPATH, "//div[@class='display-name-date']/span[@class='review-date']")
    )

    ## Helpfulness and Movie Name

    movie_name = driver.find_element(By.XPATH, "//h3[@itemprop='name']/a").text

    for actions in driver.find_elements(By.XPATH, "//div[@class='actions text-muted']"):
        # The text is split on single spaces; word 0 is stored as the helpful
        # vote count and word 3 as the total vote count.
        vote_words = actions.text.split(" ")
        review_set["Helpful_Vote"].append(vote_words[0])
        review_set["Total_Vote"].append(vote_words[3])
        review_set["Movie"].append(movie_name)

    ## Reviews

    review_set["Review"].extend(
        body.text
        for body in driver.find_elements(By.XPATH, "//div[@class='text show-more__control']")
    )

    ## Review Total (the counter text shown in the page header)

    review_total = driver.find_element(By.XPATH, "//div[@class='header']/div/span").text

    ## Fill the control-measure dictionary mentioned above.

    review_counts["Movie"].append(movie_name)
    review_counts["Counter"].append(review_total)


In [4]:
## Second control measure: print the length of every list in review_set.
## All of the numbers must be equal; if they are not, something went wrong
## during scraping.

for column_values in review_set.values():
    print(len(column_values))

349
349
349
349
349
349
349


In [5]:
## Once every list has been confirmed to have the same length,
## the dictionary is converted into a pandas DataFrame
## (one column per key of review_set).

movie_reviews = pd.DataFrame(review_set)
movie_reviews

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie
0,9/10,the unofficial wall street sequel,8 September 2011,142,179,While I am a big fan of Oliver Stone and I did...,Margin Call
1,8/10,First-Time Filmmaker Deftly Handles the Financ...,23 October 2011,118,141,Having been the victim of corporate downsizing...,Margin Call
2,10/10,"11 years on, you realise little has changed.",10 December 2019,27,29,"A superb, low-key dramatisation of the initial...",Margin Call
3,,Perfect visualization of recent financial cris...,17 October 2011,79,104,I saw this film as part of the Ghent filmfesti...,Margin Call
4,10/10,Quietly gripping morality tale - a near perfec...,23 October 2011,394,460,Saw this last night. Set at a Wall Street firm...,Margin Call
...,...,...,...,...,...,...,...
344,7/10,"Frankly a little dull, but conveying a (likely...",30 December 2016,0,2,"I had read good reviews of this film, and comb...",Margin Call
345,5/10,Terrifying,1 February 2013,0,3,This movie is terrifying. There's this guy wit...,Margin Call
346,6/10,Great acting but not much excitement,26 June 2022,0,1,The best way that I can describe this movie is...,Margin Call
347,1/10,Shame on Netflix!,10 March 2022,1,22,Shame on Netflix for airing a program that sta...,Margin Call


In [6]:
## Print each movie name next to the total review count reported by IMDB.
## These numbers are then compared with the value_counts of the DataFrame.

scraped_movies = review_counts["Movie"]
imdb_counters = review_counts["Counter"]

for movie, counter in zip(scraped_movies, imdb_counters):
    print(movie, " :", counter)

Margin Call  : 349 Reviews


In [7]:
## The two counters can differ slightly, so don't panic if the data frame
## has 1-5 fewer reviews than the total IMDB reports.
## My guess is that some reviews are hidden.

movie_reviews.value_counts("Movie")

Movie
Margin Call    349
dtype: int64

In [8]:
## Checking the whole data frame (row count, non-null counts, dtypes)
## to see whether anything is wrong.

movie_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        349 non-null    object
 1   Title         349 non-null    object
 2   Date          349 non-null    object
 3   Helpful_Vote  349 non-null    object
 4   Total_Vote    349 non-null    object
 5   Review        349 non-null    object
 6   Movie         349 non-null    object
dtypes: object(7)
memory usage: 19.2+ KB


In [9]:
## Thousand separators can be confusing and may change depending on the locale.
## Show the rows whose Total_Vote text is longer than 4 characters, i.e. the
## longest values, to see which separator (if any) is being used.

has_long_total = movie_reviews.Total_Vote.str.len() > 4
movie_reviews[has_long_total]

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie


In [10]:
## Just to be safe, remove every dot and comma (possible thousand separators)
## from the Total_Vote column.
## regex=False is passed explicitly so that "." is always a literal dot and
## never a regex wildcard, whatever the default of `regex` is in the installed
## pandas version; it also avoids the FutureWarning about that default.

movie_reviews["Total_Vote"] = (
    movie_reviews.Total_Vote
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Total_Vote"] = movie_reviews.Total_Vote.str.replace(".", "")


In [11]:
## Remove every dot and comma (possible thousand separators) from the
## Helpful_Vote column.
## regex=False is passed explicitly so that "." is always a literal dot and
## never a regex wildcard, whatever the default of `regex` is in the installed
## pandas version; it also avoids the FutureWarning about that default.

movie_reviews["Helpful_Vote"] = (
    movie_reviews.Helpful_Vote
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Helpful_Vote"] = movie_reviews.Helpful_Vote.str.replace(".", "")


In [12]:
## Control measure: cast the Helpful_Vote and Total_Vote columns to integers.
## If the cast raises an exception, something went wrong earlier.
## The converted frame is only inspected with .info(); it is not assigned
## back to movie_reviews.

vote_dtypes = dict.fromkeys(("Helpful_Vote", "Total_Vote"), "int64")
movie_reviews.astype(vote_dtypes).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        349 non-null    object
 1   Title         349 non-null    object
 2   Date          349 non-null    object
 3   Helpful_Vote  349 non-null    int64 
 4   Total_Vote    349 non-null    int64 
 5   Review        349 non-null    object
 6   Movie         349 non-null    object
dtypes: int64(2), object(5)
memory usage: 19.2+ KB


I also kept the `Rating` variable as it is. I am only cleaning things that can become a real problem in the data reading process, such as thousand separators.

Other variables are kept as objects because they are text data. 

Lastly, I save the data as a CSV file and re-read it to check whether there are any problems.

In [13]:
## Save the reviews to a CSV file; index=False leaves the row index out of the file.
movie_reviews.to_csv("file-name.csv", index=False)

In [14]:
## Re-read the saved file and inspect it.
## It is natural to have missing values in the Rating column because
## IMDB lets reviewers post reviews without a rating.
## (Presumably the "NA" placeholders written by the scraper are parsed as
## missing values by read_csv -- confirm.)
## NOTE(review): the saved output of this cell reports 2229 rows, which does not
## match the 349-row frame built above -- it looks like it comes from a
## different run; confirm.

pd.read_csv("file-name.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        2170 non-null   object
 1   Title         2229 non-null   object
 2   Date          2229 non-null   object
 3   Helpful_Vote  2229 non-null   int64 
 4   Total_Vote    2229 non-null   int64 
 5   Review        2229 non-null   object
 6   Movie         2229 non-null   object
dtypes: int64(2), object(5)
memory usage: 122.0+ KB


In [13]:
## Closing the Selenium WebDriver so that the browser and its resources
## are released properly.
## NOTE(review): quit() presumably already closes every window and ends the
## session, which would make the preceding close() redundant -- confirm.

driver.close()
driver.quit()