# IMDB Review Scraper

This is an interactive scraper. It is robust but not 100% automated; however, it can easily be converted into a script.

I used XPATHs in order to find elements on the website.

In [1]:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
## Ask websites for English content through Firefox's accept-language preference.
## This does not change the display language of Firefox itself.

options = webdriver.FirefoxOptions()
options.set_preference("intl.accept_languages", "en-US")

## Firefox is used in this project, but another browser can easily be swapped in.
## NOTE(review): newer Selenium 4 releases deprecate `executable_path` in favour
## of a Service object -- confirm against the installed Selenium version.
driver = webdriver.Firefox(
    options=options,
    executable_path="geckodriver.exe",
)

In [4]:
## URLs of the IMDB "User Reviews" pages to scrape, one entry per movie.

review_links = [
    "https://www.imdb.com/title/tt1392190/reviews?ref_=tt_ov_rt",
]

## Control measure: for every scraped movie, its name and the review total text
## shown on the page are stored here, so they can be compared with the
## scraped rows after the loop.

review_counts = {"Movie" : [], "Counter" : []}

## One list per output column. Element i of every list belongs to the same
## review, so all lists must end up with the same length.

review_set = {
    "Rating" : [],
    "Title" : [],
    "Date" : [],
    "Helpful_Vote" : [],
    "Total_Vote" : [],
    "Review" : [],
    "Movie" : [],
}

## Scrape every review page in review_links.

for link in review_links:
    driver.get(link)

    ## Keep clicking "load more" until every review is on the page.
    ## Be sure to have a stable and fast internet connection.
    ##
    ## Each iteration waits up to 10 seconds for the "load more" button to be
    ## clickable (visible and enabled), not merely present in the DOM, so the
    ## click cannot hit a button that is still hidden.
    ##
    ## When the wait times out, the loop stops if the page shows its
    ## "loaded-all" marker. Otherwise it retries; as a safeguard against an
    ## infinite loop on a bad connection it gives up on the 7th timeout.

    times_continued = 0
    page = True
    while page:
        try:
            load_more = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded']/button[@id='load-more-trigger']")))
            load_more.click()
            # Short pause after the click before looking for the button again.
            time.sleep(1)

        except selenium.common.exceptions.TimeoutException:
            loaded_all = driver.find_elements(By.XPATH, "//div[@class='ipl-load-more ipl-load-more--loaded-all']")

            if len(loaded_all) == 1:
                page = False
            else:
                times_continued += 1
                if times_continued >= 7:
                    page = False


    ## Spoiler / long-review expanders. Even reviews without a spoiler or a
    ## long text have an expander element, and those cannot be clicked, which
    ## is why the "not interactable" exception is ignored below.

    expanders = driver.find_elements(By.XPATH, "//div[@class='ipl-expander ']")

    ## Expanding all the comments

    for button in expanders:
        try:
            button.click()
        except selenium.common.exceptions.ElementNotInteractableException:
            continue


    ## Ratings -- some reviews don't have a rating, so "NA" is stored for them
    ## to keep the Rating list aligned with the other columns.

    item_contents = driver.find_elements(By.XPATH, "//div[@class='lister-item-content']")

    for item in item_contents:
        try:
            review_set["Rating"].append(item.find_element(By.CLASS_NAME, "ipl-ratings-bar").text)
        except selenium.common.exceptions.NoSuchElementException:
            review_set["Rating"].append("NA")


    ## Review Titles

    review_set["Title"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='lister-item-content']/a[@class='title']"))

    ## Review Dates

    review_set["Date"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='display-name-date']/span[@class='review-date']"))

    ## Helpfulness and Movie Name

    movie_name = driver.find_element(By.XPATH, "//h3[@itemprop='name']/a").text

    for i in driver.find_elements(By.XPATH, "//div[@class='actions text-muted']"):
        # Assumes the text starts with "<helpful> out of <total> ...", so the
        # 1st and 4th space-separated tokens are the two vote counts
        # -- TODO confirm against the live page.
        temp_list = i.text.split(" ")
        review_set["Helpful_Vote"].append(temp_list[0])
        review_set["Total_Vote"].append(temp_list[3])
        # One movie name per review row.
        review_set["Movie"].append(movie_name)

    ## Reviews

    review_set["Review"].extend(i.text for i in driver.find_elements(By.XPATH, "//div[@class='text show-more__control']"))

    ## Review Total (the text shown in the page header)

    review_total = driver.find_element(By.XPATH, "//div[@class='header']/div/span").text

    ## Store the control values for this movie.

    review_counts["Movie"].append(movie_name)
    review_counts["Counter"].append(review_total)


In [5]:
## Second control measure: print the length of every list in review_set.
## All printed numbers must be equal; if they are not,
## something went wrong during scraping.

for column_values in review_set.values():
    print(len(column_values))

2229
2229
2229
2229
2229
2229
2229


In [6]:
## After confirming that every list has the same length,
## convert the dictionary into a pandas DataFrame
## (each key of review_set becomes a column) and display it.

movie_reviews = pd.DataFrame(review_set)
movie_reviews

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie
0,8/10,Well made and mindlessly entertaining....,10 September 2015,29,65,"""Mad Max: Fury Road"" is not the sort of film I...",Mad Max: Fury Road
1,10/10,Pure action spectacle,24 February 2016,26,46,MAD MAX: FURY ROAD is the finest piece of pure...,Mad Max: Fury Road
2,,Unlike Anything You've Previously Seen,1 January 2016,12,31,Mad Max: Fury Road (2015)\n\n**** (out of 4)\n...,Mad Max: Fury Road
3,9/10,unrelenting action,2 January 2016,11,25,Max (Tom Hardy) is haunted by his past failure...,Mad Max: Fury Road
4,5/10,It Never Stops,12 October 2015,54,92,I have to say that I tried really hard to like...,Mad Max: Fury Road
...,...,...,...,...,...,...,...
2224,9/10,"Awesome acting, amazing action, a Mad Max film...",15 May 2015,27,56,WOW. My initial reaction. Absolutely incredibl...,Mad Max: Fury Road
2225,2/10,Mild-mannered Max,19 August 2018,6,9,Throughout the entirety of this movie I was be...,Mad Max: Fury Road
2226,4/10,Vastly Overrated,1 March 2016,14,25,If you are thinking about seeing this movie be...,Mad Max: Fury Road
2227,8/10,Master of visual blocking,22 October 2018,0,1,It's incredible how the director packed so muc...,Mad Max: Fury Road


In [18]:
## Print each scraped movie name next to the review total taken from IMDB.
## These totals are then compared with the value_counts of the DataFrame.

for movie, counter in zip(review_counts["Movie"], review_counts["Counter"]):
    print(movie, " :", counter)

Cashback  : 200 Reviews
Being John Malkovich  : 918 Reviews
Adaptation.  : 779 Reviews
Climax  : 478 Reviews
Irreversible  : 773 Reviews
Force Majeure  : 151 Reviews
Midsommar  : 3,632 Reviews
The Worst Person in the World  : 135 Reviews
The Father  : 937 Reviews
King Richard  : 404 Reviews


In [7]:
## The number of scraped rows per movie can differ slightly from the IMDB total,
## so don't panic if your data frame has 1-5 fewer reviews than expected.
## My guess is that some reviews are hidden.

movie_reviews.value_counts("Movie")

Movie
Mad Max: Fury Road    2229
dtype: int64

In [8]:
## Checking the whole data frame to see if something is wrong.

movie_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        2229 non-null   object
 1   Title         2229 non-null   object
 2   Date          2229 non-null   object
 3   Helpful_Vote  2229 non-null   object
 4   Total_Vote    2229 non-null   object
 5   Review        2229 non-null   object
 6   Movie         2229 non-null   object
dtypes: object(7)
memory usage: 122.0+ KB


In [9]:
## Thousands separators can be confusing and can change depending on your locale.
## Here I look at the rows whose Total_Vote text is longer than 4 characters
## to see what is going on.

movie_reviews[movie_reviews.Total_Vote.str.len() > 4]

Unnamed: 0,Rating,Title,Date,Helpful_Vote,Total_Vote,Review,Movie
356,10/10,An absolutely insane action film that will blo...,13 May 2015,906,1704,Mad Max Fury Road is George Miller's return to...,Mad Max: Fury Road
626,9/10,"A blockbuster that dark, gritty, brutal and bl...",14 May 2015,535,1019,I was left speechless when this finished. It c...,Mad Max: Fury Road
1095,1/10,On Par for Modern Hollywood Remakes (Meaning i...,24 May 2015,1080,2112,"First off, this is another movie in which the ...",Mad Max: Fury Road
1267,10/10,A masterpiece on a massive scale.,11 May 2015,935,1780,George Miller returns with a bang and takes th...,Mad Max: Fury Road
1927,1/10,Waste of time and money,29 May 2015,585,1173,"I saw the original mad max... it was violent, ...",Mad Max: Fury Road
2066,1/10,Absolutely hated it,27 May 2015,788,1547,I can't believe why people like this movie unl...,Mad Max: Fury Road


In [10]:
## Just to be safe, remove every dot and comma from the Total_Vote column.
## regex=False makes "." a literal dot. Interpreted as a regular expression it
## would match any character, and leaving the mode implicit triggers a pandas
## FutureWarning.

movie_reviews["Total_Vote"] = (
    movie_reviews.Total_Vote
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Total_Vote"] = movie_reviews.Total_Vote.str.replace(".", "")


In [11]:
## Remove every dot and comma from the Helpful_Vote column.
## regex=False makes "." a literal dot. Interpreted as a regular expression it
## would match any character, and leaving the mode implicit triggers a pandas
## FutureWarning.

movie_reviews["Helpful_Vote"] = (
    movie_reviews.Helpful_Vote
    .str.replace(".", "", regex=False)
    .str.replace(",", "", regex=False)
)

  movie_reviews["Helpful_Vote"] = movie_reviews.Helpful_Vote.str.replace(".", "")


In [12]:
## As a control measure, convert the Total_Vote and Helpful_Vote columns
## into integers. If this raises an exception, something went wrong.
## The converted frame is only used for .info() here; it is not assigned
## back, so movie_reviews itself is left unchanged by this cell.

movie_reviews.astype({"Helpful_Vote" : "int64", "Total_Vote" : "int64"}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        2229 non-null   object
 1   Title         2229 non-null   object
 2   Date          2229 non-null   object
 3   Helpful_Vote  2229 non-null   int64 
 4   Total_Vote    2229 non-null   int64 
 5   Review        2229 non-null   object
 6   Movie         2229 non-null   object
dtypes: int64(2), object(5)
memory usage: 122.0+ KB


I also kept the `Rating` variable as it is. I am only cleaning things that can become a real problem when the data is read back, such as thousands separators.

Other variables are kept as objects because they are text data. 

Lastly, I save the data as a CSV file and re-read it to check whether there are any problems.

In [13]:
## Write the data frame to a CSV file; index=False leaves the row index out of the file.
movie_reviews.to_csv("file-name.csv", index=False)

In [14]:
## It is natural to have missing values in the Rating column because
## IMDB lets reviewers post reviews without a rating.
## NOTE(review): presumably the "NA" placeholders stored during scraping are
## what read_csv reports as missing here -- confirm.

pd.read_csv("file-name.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Rating        2170 non-null   object
 1   Title         2229 non-null   object
 2   Date          2229 non-null   object
 3   Helpful_Vote  2229 non-null   int64 
 4   Total_Vote    2229 non-null   int64 
 5   Review        2229 non-null   object
 6   Movie         2229 non-null   object
dtypes: int64(2), object(5)
memory usage: 122.0+ KB


In [15]:
## Closing the Selenium WebDriver so that its resources are released properly.
## NOTE(review): quit() alone is expected to end the whole browser session,
## which would make the preceding close() redundant -- confirm before removing it.

driver.close()
driver.quit()