In [35]:
import requests
from bs4 import BeautifulSoup as Bea
import pandas as pd
import re
import json

class Scraper:
    @staticmethod
    def fetch_html(movie_id):
        """
        Fetch the HTML for the movie's IMDb page using the movie_id.
        """
        url = f"https://www.themoviedb.org/movie/{movie_id}"
        response = requests.get(url)
        response.raise_for_status()  # Raise an error if the request fails

        return Bea(response.content, "html.parser")

    def extract_tmdb_rating(self, soup):
        """
        Extract the TMDb rating from the HTML page.
        """
        try:
            # Locate the user_score_chart div and extract the data-percent attribute
            rating_div = soup.find("div", class_="user_score_chart")
            if rating_div:
                return rating_div.get("data-percent")
            return None
        except Exception as e:
            print(f"Error extracting TMDb rating: {e}")
            return None

    def update_csv_with_ratings(input_csv, output_csv):
        """
        Read the input CSV, scrape IMDb ratings, and save the updated CSV.
        """
        # Load the CSV into a DataFrame
        df = pd.read_csv(input_csv, dtype={
        'movieId': 'string',
        'imdbId': 'string',
        'tmdbId': 'string',

        })

        # Initialize the scraper
        scraper = Scraper()

        # Add a new column for IMDb ratings
        tmdb_ratings = []
        
        print(df['tmdbId'].values)

        for values in df['tmdbId'].values:
            #print(values)
            tmdb_id = values
            #print(f"Fetching IMDb rating for movie ID: {imdb_id}")
            
            try:
                # Fetch the HTML
                soup = scraper.fetch_html(tmdb_id)
            
                # Extract the rating
                rating = scraper.extract_tmdb_rating(soup)
                rating = int(rating) / 100

                print(f"Rating: {rating}, for movie ID: {tmdb_id}")
                tmdb_ratings.append(rating)
            except Exception as e:
                print(f"Error fetching data for TMDb ID {tmdb_id}: {e}")
                tmdb_ratings.append(None)

        df['tmdb_rating'] = tmdb_ratings

        # add the new column to the CSV
        df.to_csv(output_csv, index=False)


    # Call the function
Scraper.update_csv_with_ratings('../data/ml/small/links2.csv', '../data/ml/small/links2_with_ratings.csv')


<StringArray>
[ '2383', '53879',  '1396', '11570',  '1541', '17443',  '8469', '27995',
 '36739',   '925',
 ...
 '14030',  '2687', '19064', '25241', '28942', '10739', '12767', '22309',
 '13156',   '277']
Length: 2000, dtype: string
Error fetching data for TMDb ID 2383: 429 Client Error: Too Many Requests for url: https://www.themoviedb.org/movie/2383-l-ours
Error fetching data for TMDb ID 53879: 429 Client Error: Too Many Requests for url: https://www.themoviedb.org/movie/53879-love-is-a-many-splendored-thing
Error fetching data for TMDb ID 1396: 429 Client Error: Too Many Requests for url: https://www.themoviedb.org/movie/1396
Error fetching data for TMDb ID 11570: 429 Client Error: Too Many Requests for url: https://www.themoviedb.org/movie/11570-the-crimson-pirate
Error fetching data for TMDb ID 1541: 429 Client Error: Too Many Requests for url: https://www.themoviedb.org/movie/1541-thelma-louise
Error fetching data for TMDb ID 17443: 429 Client Error: Too Many Requests for url: http

KeyboardInterrupt: 