In [2]:
import pandas as pd #Using panda to create our dataframe
# Import Selenium and its sub libraries
import selenium 
from selenium import webdriver
from selenium.webdriver.common.by import By #locator strategies for find_element(s)
# Import BS4
import requests #needed to load the page for BS4
from bs4 import BeautifulSoup


In [2]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

#driver = webdriver.Chrome(ChromeDriverManager().install())

In [3]:
# Relative path to a local chromedriver executable
# (an alternative to downloading one through ChromeDriverManager).
PATH = './chromedriver.exe'

In [4]:
def _split_header(header_text, order_text):
    '''
    Split the text of one list-entry header into (title, year).

    header_text is the full header text (rank, title and year) and order_text
    is the rank prefix found in the same entry. The year is taken as the last
    six characters of the header; the title is what remains after removing the
    rank prefix and the last seven characters (a space plus the year).
    '''
    year_text = header_text[-6:]
    # count=1: only strip the leading rank, never an identical substring
    # that happens to appear inside the title itself.
    title_text = header_text.replace(order_text + ' ', '', 1)[:-7]
    return title_text, year_text


def _review_url(movie_url):
    '''Build the user-review page URL from a movie URL (the id is the 5th "/"-separated part).'''
    title_id = movie_url.split('/')[4]
    return 'https://www.imdb.com/title/' + title_id + '/reviews/?ref_=tt_ql_urv'


def _collect_movies(driver, url, max_movies):
    '''Open the movie list page and return a DataFrame with one row per movie that could be parsed.'''
    driver.get(url)
    # Implicit wait: element lookups poll for up to 1 second before failing.
    driver.implicitly_wait(1)

    names, years, links, review_links = [], [], [], []
    # Slicing (instead of indexing a fixed range) copes with pages that
    # list fewer than max_movies entries.
    for entry in driver.find_elements(By.CLASS_NAME, 'lister-item')[:max_movies]:
        try:
            header_text = entry.find_element(By.CLASS_NAME, 'lister-item-header').text
            order_text = entry.find_element(By.CLASS_NAME, 'lister-item-index').text
            movie_name, movie_year = _split_header(header_text, order_text)
            # The cleaned title doubles as the link text of the movie's hyperlink.
            movie_link = entry.find_element(By.LINK_TEXT, movie_name).get_attribute('href')
            review_link = _review_url(movie_link)
        except Exception:
            # Best effort: skip entries that do not match the expected layout.
            # (Exception, not a bare except, so Ctrl-C still stops the scraper.)
            continue

        names.append(movie_name)
        years.append(movie_year)
        links.append(movie_link)
        review_links.append(review_link)

    return pd.DataFrame(data={'Movie_name': names,
                              'Year': years,
                              'link': links,
                              'user_review': review_links,
                              })


def _scrape_reviews(driver, review_url, max_pages, max_reviews):
    '''Open one user-review page, expand it, and return the complete reviews as a DataFrame.'''
    driver.get(review_url)
    # Implicit wait: element lookups poll for up to 1 second before failing.
    driver.implicitly_wait(1)

    # The first page is already loaded, so at most max_pages - 1 clicks are needed.
    for _ in range(max_pages - 1):
        try:
            driver.find_element(By.ID, 'load-more-trigger').click()
        except Exception:
            # No (clickable) button left: the page is fully expanded.
            break

    user_names, review_titles, ratings, dates, bodies = [], [], [], [], []
    for container in driver.find_elements(By.CLASS_NAME, 'review-container')[:max_reviews]:
        try:
            # A review is kept only when every element is present; a missing
            # one raises and the whole review is skipped.
            review_title = container.find_element(By.CLASS_NAME, 'title').text
            # Some review bodies are hidden as spoilers, so read the
            # 'textContent' attribute rather than the visible text.
            review_body = container.find_element(By.CLASS_NAME, 'content').get_attribute("textContent").strip()
            review_rating = container.find_element(By.CLASS_NAME, 'rating-other-user-rating').text
            review_date = container.find_element(By.CLASS_NAME, 'review-date').text
            reviewer = container.find_element(By.CLASS_NAME, 'display-name-link').text
        except Exception:
            continue

        user_names.append(reviewer)
        review_titles.append(review_title)
        ratings.append(review_rating)
        dates.append(review_date)
        bodies.append(review_body)

    return pd.DataFrame(data={'User_name': user_names,
                              'Review title': review_titles,
                              'Review Rating': ratings,
                              'Review date': dates,
                              'Review_body': bodies,
                              })


def get_review(url, folder_name, max_movies=50, max_pages=50, max_reviews=100):
    '''
    Scrape user reviews for the movies listed on an IMDB search/list page.

    Parameters:
        url: address of the IMDB page that lists the movies.
        folder_name: prefix of the output files; the reviews of the n-th parsed
            movie are written to f'{folder_name}{n}.csv' (n starts at 1).
        max_movies: how many list entries to look at on the list page.
        max_pages: how many pages of reviews to expand per movie
            (the "load more" button is clicked at most max_pages - 1 times).
        max_reviews: how many review blocks to read per movie after expanding.
            Note that with the defaults far more reviews are loaded than read;
            raise max_reviews to keep more of them.

    Returns nothing; the result is the set of CSV files. Each file has the
    columns User_name, Review title, Review Rating, Review date, Review_body
    and Movie_name. List entries and reviews that cannot be parsed completely
    are skipped.
    '''
    # Resolve the chromedriver binary once and reuse the path for every browser.
    driver_path = ChromeDriverManager().install()

    # Step 1: collect title, year, link and review link of each movie.
    # NOTE(review): passing the driver path positionally is deprecated in
    # Selenium 4 - confirm against the installed version.
    driver = webdriver.Chrome(driver_path)
    try:
        movies = _collect_movies(driver, url, max_movies)
    finally:
        driver.quit()  # always close the browser, even if scraping failed

    # Step 2: visit each review page in a fresh browser and export its reviews.
    pairs = zip(movies['Movie_name'], movies['user_review'])
    for file_no, (movie_name, review_url) in enumerate(pairs, start=1):
        driver = webdriver.Chrome(driver_path)
        try:
            reviews = _scrape_reviews(driver, review_url, max_pages, max_reviews)
        finally:
            driver.quit()

        reviews['Movie_name'] = movie_name  # same movie name on every row
        # One file per movie so they can be checked or combined later.
        reviews.to_csv(f'{folder_name}{file_no}.csv')

In [None]:
# Scrape the reviews of the movies returned by the "top_100" group search.
# Output files are named top50_raw1.csv, top50_raw2.csv, ...
top50_raw_link = "https://www.imdb.com/search/title/?groups=top_100"
get_review(url=top50_raw_link, folder_name="top50_raw")

In [5]:
# Scrape the reviews of the movies returned by the "now-playing-us" group search.
# Output files are named nowplaying_raw1.csv, nowplaying_raw2.csv, ...
nowplaying_raw_link = "https://www.imdb.com/search/title/?groups=now-playing-us"
get_review(url=nowplaying_raw_link, folder_name="nowplaying_raw")



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/Amanda/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/Amanda/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/Amanda/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/Amanda/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/Amanda/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in c

In [None]:
#https://github.com/hungpham89/IMDb_scraper/blob/main/Imdb_user_review_scraper.ipynb
#https://hungpham89.medium.com/a-beginner-guide-for-scraping-data-from-imdb-for-user-reviews-using-selenium-and-beautifulsoup-c60e89a4ad1a

In [38]:
# Combine the per-movie review files into a single DataFrame.
# Files are numbered from 1: nowplaying_raw1.csv ... nowplaying_raw33.csv
REVIEW_FILE_PREFIX = 'nowplaying_raw'
N_REVIEW_FILES = 33

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all rows gathered so far on each iteration.
review_frames = [
    pd.read_csv(f'{REVIEW_FILE_PREFIX}{file_no}.csv')
    for file_no in range(1, N_REVIEW_FILES + 1)
]
# ignore_index=True renumbers the rows 0..n-1 across all movies.
imdb_review = pd.concat(review_frames, ignore_index=True)


In [39]:
# Preview: first five rows of the combined reviews, ordered by movie name.
imdb_review.sort_values(by="Movie_name").head(n=5)

Unnamed: 0.1,Unnamed: 0,User_name,Review title,Review Rating,Review date,Review_body,Movie_name
2326,86,The Creeper,The Beatles BEST Movie,10/10,31 December 2002,A Hard Day's Night is my Favorite of the Five ...,A Hard Day's Night
2327,87,aramis-112-804880,Fun Little Movie,10/10,16 August 2013,"""A Hard Day's Night"" is a pseudo-documentary o...",A Hard Day's Night
2266,26,anaconda-40658,The Ultimate Beatles Film.,10/10,21 July 2015,A Hard Day's Night (1964): Dir: Richard Lester...,A Hard Day's Night
2267,27,Wuchakk,Shut Up and Sing,6/10,15 February 2017,Directed by Richard Lester and written by Alun...,A Hard Day's Night
2268,28,christopher-underwood,it seems unfair that Paul gets lumbered with t...,7/10,4 December 2018,Its some considerable time since I had seen th...,A Hard Day's Night


In [40]:
# Save the combined reviews; with pandas' defaults the row index is written
# as the first (unnamed) column of the file.
imdb_review.to_csv(path_or_buf='imdb_reviews.csv')