# Scraping IMDB ratings for Movies and TV Shows 

1) This script imports amazon_prime_titles.csv and uses each entry's content type and movie/TV show title to scrape a rating for that title.
2) It uses Selenium to enter the movie/TV show title, together with the content type, into the Google search bar, and then scrapes the IMDb rating from the results page.

In [6]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time

# Read the Amazon Prime catalogue into a DataFrame.
file_path = 'amazon_prime_titles.csv'  # Update the path
data = pd.read_csv(filepath_or_buffer=file_path)

# Start a Chrome session backed by a locally installed chromedriver binary.
driver_path = (
    '/Users/rishisingh/Documents/Amazon_Prime_Dashboard/chromedriver-mac-arm64/chromedriver'
)  # Update the path
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service)

def get_google_rating(driver, title, content_type, wait_seconds=2):
    """Search Google for a title and return the rating shown next to "IMDb".

    Args:
        driver: Selenium WebDriver used to drive the browser.
        title: Title of the movie / TV show to search for.
        content_type: Text appended to the title in the search query
            (the caller passes the dataset's ``type`` column).
        wait_seconds: Seconds to pause after submitting the query so the
            results page can load. Defaults to 2.

    Returns:
        The text of the ``<span>`` that precedes the "IMDb" label on the
        results page, or ``None`` when the title is missing or the page does
        not contain the expected elements.
    """
    # A missing title (None / NaN from the CSV) would otherwise be searched
    # for as the literal text "None" / "nan" and could yield a bogus rating.
    if title is None or pd.isna(title):
        return None

    # Imported here so the block is self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    # Navigation errors (e.g. a dead browser session) are left to propagate,
    # as before, so a broken driver does not silently produce empty ratings.
    driver.get("https://www.google.com")

    try:
        # The search box lookup is inside the try: a consent or CAPTCHA page
        # has no "q" field, and that should cost one rating, not the batch.
        search_box = driver.find_element("name", "q")
        search_box.send_keys(f"{title} {content_type}")
        search_box.send_keys(Keys.RETURN)
        time.sleep(wait_seconds)  # Wait for the results page to load

        rating_element = driver.find_element(
            "xpath", '//span[contains(text(), "IMDb")]/preceding-sibling::span'
        )
        return rating_element.text
    except WebDriverException:
        # Best-effort scrape: any Selenium-level failure means "no rating".
        return None

# Batch configuration: rows are scraped in slices of `batch_size`, and the
# accumulated results are written to disk after every completed slice.
batch_size = 1000
start_index = 0  # Starting index
total_rows = min(len(data), 10000)  # Process up to 10,000 records

# Accumulates every processed batch; rewritten to the CSV after each batch.
all_data = pd.DataFrame()

try:
    while start_index < total_rows:
        end_index = min(start_index + batch_size, total_rows)
        print(end_index)
        batch_data = data.iloc[start_index:end_index].copy()
        # One Google lookup per row; rows without a rating get None.
        batch_data['imdb_rating'] = batch_data.apply(
            lambda row: get_google_rating(driver, row['title'], row['type']),
            axis=1,
        )

        # Append the batch data to the final DataFrame
        all_data = pd.concat([all_data, batch_data], ignore_index=True)

        print(f"Processed records from {start_index} to {end_index}.")

        # Save the intermediate results so completed batches survive a later failure
        all_data.to_csv('amazon_prime_titles_with_imdb_ratings2.csv', index=False)

        # Update start index for the next batch
        start_index = end_index
finally:
    # Always close the browser, even if scraping raises or is interrupted;
    # otherwise the Chrome / chromedriver processes are left running.
    driver.quit()

# Display the final DataFrame
print(all_data.head())

8000
Processed records from 7000 to 8000.
9000
Processed records from 8000 to 9000.
9668
Processed records from 9000 to 9668.
  show_id   type              title                     director  \
0   s7001  Movie      Secret Window                  David Koepp   
1   s7002  Movie         Seabiscuit                    Gary Ross   
2   s7003  Movie  Saturday The 13th  Ogundoju Ojo, Musa Raji DOP   
3   s7004  Movie          Sanctuary              Reid Nicewonder   
4   s7005  Movie  Rollerball (1975)               Norman Jewison   

                                                cast        country  \
0  Johnny Depp, John Turturro, Maria Bello, Timot...  United States   
1  Jeff Bridges, Tobey Maguire, Chris Cooper, Eli...            NaN   
2  Kemi Komolafe, Jacob Moses, Marilyn Okhonfo, T...            NaN   
3  Alexandra Willers, Michael Andrew Scott, Blake...            NaN   
4  James Caan, John Houseman, Ralph Richardson, M...            NaN   

  date_added  release_year rating dura