In [20]:
pip install pandas selenium mysqlclient

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time
import glob
import mysql
import pandas as pd
from selenium import webdriver
from sqlalchemy import create_engine
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException


In [3]:
def webscrapper(url):
    # Initialize the WebDriver
    driver = webdriver.Chrome()

    try:
        # Open IMDb page
        driver.get(url)
        driver.maximize_window()
        time.sleep(2)
        print(driver.title)

        # Attempt to click "Read More" until all data is loaded
        while True:
            try:
                element = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button')
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                time.sleep(1)
                element.click()
                print("Clicked 'Read More' button.")
                time.sleep(1)
            except NoSuchElementException:
                print("No 'Read More' button found. All data loaded.")
                break
            except ElementClickInterceptedException:
                print("Button is blocked by another element. Retrying...")
                time.sleep(2)
            except TimeoutException:
                print("Operation timed out. Retrying...")
                time.sleep(2)
            except Exception as e:
                print(f"Unexpected error: {e}")
                break

        print("Successfully retrieved all the data.")

        # Initialize a dictionary to store data by genre
        genre_data = {}

        # Locate all movie items
        movies = driver.find_elements(By.XPATH,'//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li')
        
        # Extract details for each movie
        for movie in movies:
            try:
                # Movie Name
                name = movie.find_element(By.CSS_SELECTOR, 'h3[class="ipc-title__text"]').text.split(". ", 1)[1]

                # Genre (Fixed selector)
                try:
                    genre = movie.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[1]/div/div/div[2]/button[3]/span').text.strip()
                except NoSuchElementException:
                    genre = "Unknown"

                # Ratings
                try:
                    rating = movie.find_element(By.CSS_SELECTOR, "span[class='ipc-rating-star--rating']").text.strip()
                except NoSuchElementException:
                    rating = "N/A"

                # Voting Counts
                try:
                    votes = movie.find_element(By.CSS_SELECTOR, "span[class='ipc-rating-star--voteCount']").text.replace("(", "").replace(")", "").strip()
                except NoSuchElementException:
                    votes = "N/A"

                # Duration
                try:
                    duration = movie.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[2]/span[2]').text.strip()
                except NoSuchElementException:
                    duration = "N/A"

                # Split genres and store data in a dictionary
                for g in genre.split(", "):
                    if g not in genre_data:
                        genre_data[g] = []
                    genre_data[g].append({
                        "Movie Name": name,
                        "Rating": rating,
                        "Votes": votes,
                        "Duration": duration,
                        "Genre": genre
                    })
            except Exception as e:
                print(f"Error processing movie: {e}")

        return genre_data  # Return the full dataset after processing all movies

    except Exception as e:
        print(f"Error retrieving movie list: {e}")
        return {}  # Ensure function always returns a dictionary

    finally:
        driver.quit()  # Quit the driver at the end


In [5]:
 # Save data to CSV files

def genre_dataset(genre_data):
    #Create a new folder for the datas.
    output_dir = "IMDB_2024_Genres_Data"
    
    #Make the directory using os module.
    os.makedirs(output_dir, exist_ok=True)

    #Loop through the dictionary data to csv files.
    for genre, movies in genre_data.items():
        #convert the python object to dataframe.
        df = pd.DataFrame(movies)
        
        #creation of file name as genre name and put it inside the new folder.
        file_name = os.path.join(output_dir, f"{genre}.csv")
        
        #save as csv file using pandas method.
        df.to_csv(file_name, index=False)
        
        #Display the file name which is created.
        print (f"Saved data for genre '{genre}' to '{file_name}'")

In [7]:
# List of IMDb genre-specific movie URLs
genre_urls = [
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=news",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=talk-show",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=game-show",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=war",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=western",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=action",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=comedy",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=drama",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=crime",
    "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=family"
]

# Loop through each genre URL and scrape data
for genre_url in genre_urls:
    try:
        print(f"URL Processing: {genre_url}")
        movies_by_genre = webscrapper(genre_url)
        print(f"Data type returned: {type(movies_by_genre)}")  # Debugging check
        
        if movies_by_genre and isinstance(movies_by_genre, dict):  # Ensure it's a non-empty dictionary
            try:
                genre_dataset(movies_by_genre)
                print(f"Successfully stored")
            except Exception as dataset_error:
                print(f"Error saving dataset for {genre_url}: {dataset_error}")
        else:
            print(f"Skipping {genre_url} as no valid data was retrieved.")

    except Exception as e:
        print(f"Error processing {genre_url}: {e}")

print('✅ Successfully completed processing all genres!')


URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=news
Movie, News, Release date between 2024-01-01 and 2024-12-31 (Sorted by Popularity Ascending)
No 'Read More' button found. All data loaded.
Successfully retrieved all the data.
Data type returned: <class 'dict'>
Saved data for genre 'News' to 'IMDB_2024_Genres_Data\News.csv'
Successfully stored
URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=talk-show
Movie, Release date between 2024-01-01 and 2024-12-31, Talk-Show (Sorted by Popularity Ascending)
No 'Read More' button found. All data loaded.
Successfully retrieved all the data.
Data type returned: <class 'dict'>
Saved data for genre 'Talk-Show' to 'IMDB_2024_Genres_Data\Talk-Show.csv'
Successfully stored
URL Processing: https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=game-show
Movie, Release date between 2024-0

In [41]:
#checking the folder
glob.glob('IMDB_2024_Genres_data\\*.csv')

['IMDB_2024_Genres_data\\Action.csv',
 'IMDB_2024_Genres_data\\Comedy.csv',
 'IMDB_2024_Genres_data\\Crime.csv',
 'IMDB_2024_Genres_data\\Drama.csv',
 'IMDB_2024_Genres_data\\Family.csv',
 'IMDB_2024_Genres_data\\Game-Show.csv',
 'IMDB_2024_Genres_data\\News.csv',
 'IMDB_2024_Genres_data\\Talk-Show.csv',
 'IMDB_2024_Genres_data\\War.csv',
 'IMDB_2024_Genres_data\\Western.csv']

In [43]:
#Combining the all genre file int single one and save it.
df = pd.concat([pd.read_csv(one_file) for one_file in glob.glob('IMDB_2024_Genres_data\\*.csv')],ignore_index=True)
df = df.reset_index(drop=True)
df.to_csv('genre_combined_df.csv', index=False)
df #12144 rows

Unnamed: 0,Movie Name,Rating,Votes,Duration,Genre
0,Gladiator II,6.6,190K,2h 28m,Action
1,Sonic the Hedgehog 3,7.0,40K,1h 50m,Action
2,Pushpa: The Rule - Part 2,6.2,49K,3h 21m,Action
3,The Killer's Game,5.7,13K,1h 44m,Action
4,Dune: Part Two,8.5,593K,2h 46m,Action
...,...,...,...,...,...
12139,Vadogga,,,1h 25m,Western
12140,The Ballad of Winchester Pete,,,,Western
12141,The Rattlesnake Twins,,,50m,Western
12142,"The Goods, Bads and Uglys",,,,Western


In [45]:
#Here checkig the file size and check for data types.
df.info()  

#474.5KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12144 entries, 0 to 12143
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie Name  12144 non-null  object 
 1   Rating      7207 non-null   float64
 2   Votes       7207 non-null   object 
 3   Duration    9099 non-null   object 
 4   Genre       12144 non-null  object 
dtypes: float64(1), object(4)
memory usage: 474.5+ KB


In [47]:
#Here we are going to read the file and drop the column which has NaN and delete the duplicates.
new_df = pd.read_csv('genre_combined_df.csv')
new_df.dropna(inplace = True,ignore_index=True)
new_df.drop_duplicates(inplace = True)
new_df.head()

Unnamed: 0,Movie Name,Rating,Votes,Duration,Genre
0,Gladiator II,6.6,190K,2h 28m,Action
1,Sonic the Hedgehog 3,7.0,40K,1h 50m,Action
2,Pushpa: The Rule - Part 2,6.2,49K,3h 21m,Action
3,The Killer's Game,5.7,13K,1h 44m,Action
4,Dune: Part Two,8.5,593K,2h 46m,Action


In [49]:
#Here we are checking one more time  
new_df.isnull().sum()

Movie Name    0
Rating        0
Votes         0
Duration      0
Genre         0
dtype: int64

In [51]:
#To view the Votes full data for conversion.
new_df["Votes"].to_string()

'0        190K\n1         40K\n2         49K\n3         13K\n4        593K\n5         35K\n6        148K\n7        462K\n8        224K\n9         91K\n10       266K\n11        16K\n12        28K\n13       208K\n14        21K\n15       153K\n16       127K\n17        41K\n18       2.4K\n19        22K\n20       149K\n21        94K\n22       132K\n23        11K\n24        81K\n25       145K\n26        53K\n27        44K\n28       117K\n29       5.1K\n30        30K\n31        11K\n32        14K\n33        97K\n34       7.7K\n35       163K\n36        90K\n37         9K\n38        51K\n39        88K\n40       106K\n41        428\n42        49K\n43       3.2K\n44        424\n45       6.3K\n46        17K\n47        22K\n48        65K\n49        63K\n50        19K\n51        31K\n52        11K\n53        810\n54        48K\n55       3.6K\n56        35K\n57        893\n58         1K\n59       3.2K\n60       6.8K\n61       2.7K\n62        12K\n63        79K\n64       1.7K\n65       2.1K\n66       

In [53]:
#Data cleaning

# Replace 'K' with 'e3' and 'M' with 'e6' to denote scientific notation
new_df['Votes'] = new_df['Votes'].str.replace('K','e3').str.replace('M','e6')
new_df["Votes"].to_string()

'0       190e3\n1        40e3\n2        49e3\n3        13e3\n4       593e3\n5        35e3\n6       148e3\n7       462e3\n8       224e3\n9        91e3\n10      266e3\n11       16e3\n12       28e3\n13      208e3\n14       21e3\n15      153e3\n16      127e3\n17       41e3\n18      2.4e3\n19       22e3\n20      149e3\n21       94e3\n22      132e3\n23       11e3\n24       81e3\n25      145e3\n26       53e3\n27       44e3\n28      117e3\n29      5.1e3\n30       30e3\n31       11e3\n32       14e3\n33       97e3\n34      7.7e3\n35      163e3\n36       90e3\n37        9e3\n38       51e3\n39       88e3\n40      106e3\n41        428\n42       49e3\n43      3.2e3\n44        424\n45      6.3e3\n46       17e3\n47       22e3\n48       65e3\n49       63e3\n50       19e3\n51       31e3\n52       11e3\n53        810\n54       48e3\n55      3.6e3\n56       35e3\n57        893\n58        1e3\n59      3.2e3\n60      6.8e3\n61      2.7e3\n62       12e3\n63       79e3\n64      1.7e3\n65      2.1e3\n66       

In [55]:
# Convert the 'Votes' column to numeric
new_df['Votes'] = pd.to_numeric(new_df['Votes'])
new_df["Votes"].to_string()

'0       190000.0\n1        40000.0\n2        49000.0\n3        13000.0\n4       593000.0\n5        35000.0\n6       148000.0\n7       462000.0\n8       224000.0\n9        91000.0\n10      266000.0\n11       16000.0\n12       28000.0\n13      208000.0\n14       21000.0\n15      153000.0\n16      127000.0\n17       41000.0\n18        2400.0\n19       22000.0\n20      149000.0\n21       94000.0\n22      132000.0\n23       11000.0\n24       81000.0\n25      145000.0\n26       53000.0\n27       44000.0\n28      117000.0\n29        5100.0\n30       30000.0\n31       11000.0\n32       14000.0\n33       97000.0\n34        7700.0\n35      163000.0\n36       90000.0\n37        9000.0\n38       51000.0\n39       88000.0\n40      106000.0\n41         428.0\n42       49000.0\n43        3200.0\n44         424.0\n45        6300.0\n46       17000.0\n47       22000.0\n48       65000.0\n49       63000.0\n50       19000.0\n51       31000.0\n52       11000.0\n53         810.0\n54       48000.0\n55       

In [57]:
# Convert to integer type if desired
new_df['Votes'] = new_df['Votes'].astype(int)

In [59]:
# Extract hours and minutes using string methods
Hours = new_df["Duration"].str.extract(r'(\d+)h').fillna(0).astype(int)
Minutes = new_df["Duration"].str.extract(r'(\d+)m').fillna(0).astype(int)

# Changing Duration column into Minutes
new_df["Duration"] = (Hours * 60) + Minutes

In [61]:
#Finally we are checking the datatype for Votes.
new_df.dtypes

Movie Name     object
Rating        float64
Votes           int32
Duration        int32
Genre          object
dtype: object

In [63]:
#checking the size of the dataset.
new_df.info()

#198.3KB file size is reduced.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6342 entries, 0 to 6341
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Movie Name  6342 non-null   object 
 1   Rating      6342 non-null   float64
 2   Votes       6342 non-null   int32  
 3   Duration    6342 non-null   int32  
 4   Genre       6342 non-null   object 
dtypes: float64(1), int32(2), object(2)
memory usage: 198.3+ KB


In [65]:
#Finally, saving the dataset. Now it is ready to push it in the database.
new_df.to_csv('genre_df_cleaned.csv', index=False)

In [67]:
# pushing the dataset to db
engine = create_engine("mysql+mysqldb://root:shan@localhost:3306/imdb_2024_genres")#root@localhost:3306
conn = engine.connect()
data =pd.read_csv('genre_df_cleaned.csv')
data.to_sql('movie data', engine, index = False, if_exists = 'replace')
conn.close()