In [20]:
pip install pandas selenium mysqlclient

Note: you may need to restart the kernel to use updated packages.


In [14]:
import glob
import os
import time
from getpass import getpass
from urllib.parse import quote_plus

import mysql
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine


In [34]:
def _text_or_default(scope, by, selector, default, clean=None):
    """Return the text of the first element matching ``selector`` under ``scope``.

    Args:
        scope: Object exposing ``find_element`` (the driver or a located element).
        by: Locator strategy (a ``By`` constant).
        selector: Locator expression for that strategy.
        default: Value returned when no element matches.
        clean: Optional callable applied to the element text before returning.

    Returns:
        The (optionally cleaned) element text, or ``default`` when the lookup
        raises ``NoSuchElementException``.
    """
    try:
        text = scope.find_element(by, selector).text
    except NoSuchElementException:
        return default
    return clean(text) if clean is not None else text


def webscrapper(url, max_retries=5):
    """Scrape the movie list at ``url`` and group the records by genre.

    Opens the page in a new Chrome session, repeatedly clicks the button the
    log messages call 'Read More' until it can no longer be found, then reads
    name, rating, vote count and duration from every list item.

    Args:
        url: Address of the listing page to open.
        max_retries: Number of consecutive blocked/timed-out click attempts
            tolerated before the loading loop gives up. Previously the loop
            retried without any limit and could spin forever.

    Returns:
        dict mapping each genre name to a list of dicts with the keys
        "Movie Name", "Rating", "Votes", "Duration" and "Genre". An empty
        dict is returned when the page could not be processed.
    """
    load_more_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button'
    movie_items_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'
    page_genre_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[1]/div/div/div[2]/button[3]/span'

    # Initialize the WebDriver
    driver = webdriver.Chrome()

    try:
        # Open the page
        driver.get(url)
        driver.maximize_window()
        time.sleep(2)
        print(driver.title)

        # Click the button until it disappears, i.e. all data is loaded
        failed_attempts = 0
        while True:
            try:
                element = driver.find_element(By.XPATH, load_more_xpath)
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                time.sleep(1)
                element.click()
                print("Clicked 'Read More' button.")
                time.sleep(1)
            except NoSuchElementException:
                print("No 'Read More' button found. All data loaded.")
                break
            except ElementClickInterceptedException:
                print("Button is blocked by another element. Retrying...")
                failed_attempts += 1
            except TimeoutException:
                print("Operation timed out. Retrying...")
                failed_attempts += 1
            except Exception as e:
                print(f"Unexpected error: {e}")
                break
            else:
                # A successful click resets the consecutive-failure counter
                failed_attempts = 0
                continue

            # Only reached after a retryable failure: bound the retries so a
            # permanently blocked button cannot hang the notebook.
            if failed_attempts >= max_retries:
                print(f"Giving up on the 'Read More' button after {max_retries} consecutive failed attempts.")
                break
            time.sleep(2)

        print("Successfully retrieved all the data.")

        # The genre XPath is absolute (starts with //), so it matches the same
        # page-level element no matter which movie it is evaluated from.
        # Resolve it once instead of once per movie.
        genre = _text_or_default(driver, By.XPATH, page_genre_xpath, "Unknown", clean=str.strip)

        # Dictionary of genre -> list of movie records
        genre_data = {}

        # Locate all movie items
        movies = driver.find_elements(By.XPATH, movie_items_xpath)

        # Extract details for each movie
        for movie in movies:
            try:
                # Movie Name: drop the leading "<rank>. " prefix when present,
                # otherwise keep the full title instead of failing on it
                title = movie.find_element(By.CSS_SELECTOR, 'h3[class="ipc-title__text"]').text
                _, separator, remainder = title.partition(". ")
                name = remainder if separator else title

                # Ratings
                rating = _text_or_default(
                    movie, By.CSS_SELECTOR, "span[class='ipc-rating-star--rating']", "N/A", clean=str.strip
                )

                # Voting Counts (parentheses removed)
                votes = _text_or_default(
                    movie, By.CSS_SELECTOR, "span[class='ipc-rating-star--voteCount']", "N/A",
                    clean=lambda text: text.replace("(", "").replace(")", ""),
                )

                # Duration
                duration = _text_or_default(
                    movie, By.XPATH, './div/div/div/div[1]/div[2]/div[2]/span[2]', "N/A"
                )

                # Split genres and store the record under each of them
                for g in genre.split(", "):
                    genre_data.setdefault(g, []).append({
                        "Movie Name": name,
                        "Rating": rating,
                        "Votes": votes,
                        "Duration": duration,
                        "Genre": genre
                    })
            except Exception as e:
                # Best effort: one bad list item must not abort the whole page
                print(f"Error processing movie: {e}")

        return genre_data  # Return the full dataset after processing all movies

    except Exception as e:
        print(f"Error retrieving movie list: {e}")
        return {}  # Ensure function always returns a dictionary

    finally:
        driver.quit()  # Always release the browser session


In [36]:
 # Save data to CSV files

def genre_dataset(genre_data, output_dir="IMDB_2024_Genres_Data"):
    """Write one CSV file per genre into ``output_dir``.

    Args:
        genre_data: Mapping of genre name to a list of movie record dicts.
        output_dir: Folder that receives the CSV files; it is created when
            missing. Defaults to the folder name this notebook has always used.

    Returns:
        None. Each file is named ``<genre>.csv`` and its path is printed.
    """
    # Path separators and characters rejected in Windows file names are
    # replaced so a genre label can never escape output_dir or fail to save.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    # Make the directory (and any missing parents) using the os module.
    os.makedirs(output_dir, exist_ok=True)

    # Loop through the dictionary and write each genre to its own CSV file.
    for genre, movies in genre_data.items():
        if not movies:
            # An empty record list would produce a CSV without a header,
            # which pd.read_csv cannot load back.
            print(f"Skipping genre '{genre}': no movies to save")
            continue

        # Convert the list of records to a DataFrame.
        df = pd.DataFrame(movies)

        # File name is the (sanitized) genre name inside the output folder.
        file_name = os.path.join(output_dir, f"{str(genre).translate(unsafe_chars)}.csv")

        # Save as a CSV file without the index column.
        df.to_csv(file_name, index=False)

        # Display the file name which was created.
        print (f"Saved data for genre '{genre}' to '{file_name}'")

In [38]:
# IMDb search pages for 2024 feature films, one URL per genre
genre_urls = [
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=news",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=talk-show",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=game-show",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=war",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=western",
]

# Scrape each page in turn and persist whatever comes back; a failure on one
# URL is reported and the loop moves on to the next.
for genre_url in genre_urls:
    try:
        print(f"URL Processing: {genre_url}")
        movies_by_genre = webscrapper(genre_url)
        print(f"Data type returned: {type(movies_by_genre)}")  # Debugging check

        # Guard clause: anything that is not a non-empty dict is skipped
        if not movies_by_genre or not isinstance(movies_by_genre, dict):
            print(f"Skipping {genre_url} as no valid data was retrieved.")
            continue

        # Saving has its own handler so its errors are reported separately
        try:
            genre_dataset(movies_by_genre)
            print("Successfully stored")
        except Exception as dataset_error:
            print(f"Error saving dataset for {genre_url}: {dataset_error}")

    except Exception as e:
        print(f"Error processing {genre_url}: {e}")

print('✅ Successfully completed processing all genres!')


URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=news
Movie, Release date between 2024-01-01 and 2024-12-31, News (Sorted by Popularity Ascending)
No 'Read More' button found. All data loaded.
Successfully retrieved all the data.
Data type returned: <class 'dict'>
Saved data for genre 'News' to 'IMDB_2024_Genres_Data\News.csv'
Successfully stored
URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=talk-show
Movie, Release date between 2024-01-01 and 2024-12-31, Talk-Show (Sorted by Popularity Ascending)
No 'Read More' button found. All data loaded.
Successfully retrieved all the data.
Data type returned: <class 'dict'>
Saved data for genre 'Talk-Show' to 'IMDB_2024_Genres_Data\Talk-Show.csv'
Successfully stored
URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=game-show
Movie, Game-Show, Release date bet

In [40]:
# List the per-genre CSV files. The folder name must match the one created by
# genre_dataset exactly (case-sensitive file systems), and os.path.join keeps
# the pattern portable instead of hard-coding a Windows backslash.
# sorted() makes the listing order deterministic.
sorted(glob.glob(os.path.join('IMDB_2024_Genres_Data', '*.csv')))

['IMDB_2024_Genres_data\\Game-Show.csv',
 'IMDB_2024_Genres_data\\News.csv',
 'IMDB_2024_Genres_data\\Talk-Show.csv',
 'IMDB_2024_Genres_data\\War.csv',
 'IMDB_2024_Genres_data\\Western.csv']

In [42]:
# Collect the per-genre CSV files. The folder name matches the one created by
# genre_dataset (exact case), and os.path.join avoids a Windows-only separator.
# Sorting makes the row order of the combined frame reproducible.
genre_csv_files = sorted(glob.glob(os.path.join('IMDB_2024_Genres_Data', '*.csv')))
if not genre_csv_files:
    # pd.concat on an empty list fails with an unhelpful message; say what is missing.
    raise FileNotFoundError(
        "No genre CSV files found in 'IMDB_2024_Genres_Data'; run the scraping cell first."
    )

# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
df = pd.concat([pd.read_csv(one_file) for one_file in genre_csv_files], ignore_index=True)
df.to_csv('genre_df.csv', index=False)

In [44]:
# Print a summary of the combined frame: index range, per-column non-null
# counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie Name  239 non-null    object 
 1   Rating      148 non-null    float64
 2   Votes       148 non-null    object 
 3   Duration    178 non-null    object 
 4   Genre       239 non-null    object 
dtypes: float64(1), object(4)
memory usage: 9.5+ KB


In [97]:
# Reload the combined CSV, discard every row that has a missing value and
# renumber the remaining rows from zero.
new_df = (
    pd.read_csv('genre_df.csv')
    .dropna()
    .reset_index(drop=True)
)
new_df

Unnamed: 0,Movie Name,Rating,Votes,Duration,Genre


In [109]:
# Count the missing values remaining in each column
new_df.isna().sum()

Movie Name    0
Rating        0
Votes         0
Duration      0
Genre         0
dtype: int64

In [101]:
# Show the dtype pandas assigned to each column of the cleaned frame
new_df.dtypes

Movie Name     object
Rating        float64
Votes          object
Duration      float64
Genre          object
dtype: object

In [103]:
# Remove exact duplicate rows, then persist the cleaned frame without the index
new_df = new_df.drop_duplicates()
new_df.to_csv('genre_df_cleaned.csv', index=False)

In [46]:
# Load the cleaned dataset into the MySQL table `test`.
#
# Connection settings are read from environment variables so that no password
# is stored in the notebook; when MYSQL_PASSWORD is not set the password is
# requested interactively. The other settings default to the values this
# notebook has always used.
db_user = os.environ.get("MYSQL_USER", "root")
db_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")
db_host = os.environ.get("MYSQL_HOST", "localhost")
db_port = os.environ.get("MYSQL_PORT", "3306")
db_name = os.environ.get("MYSQL_DATABASE", "imdb_2024_genres")

# quote_plus escapes characters such as '@' or '/' that would otherwise
# corrupt the connection URL.
engine = create_engine(
    f"mysql+mysqldb://{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)

data = pd.read_csv('genre_df_cleaned.csv')

try:
    # engine.begin() commits on success, rolls back on error and always
    # returns the connection to the pool.
    with engine.begin() as conn:
        data.to_sql('test', conn, index=False, if_exists='replace')
finally:
    # Close all pooled connections held by this engine
    engine.dispose()