In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

def scrape_imdb_movies(url, num_pages=2):
    """
    Scrapes movie data from IMDb search results.
    
    Args:
        url (str): The IMDb search URL to start from.
        num_pages (int): Number of result pages to scrape. 
                         Each page has ~50 movies. Reduce if testing.
    
    Returns:
        list: A list of dictionaries, each containing movie details.
    """
    movies_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")
        
        # Calculate the start parameter for pagination (IMDb uses 1, 51, 101...)
        start_param = (page - 1) * 50 + 1
        paginated_url = f"{url}&start={start_param}"
        
        try:
            response = requests.get(paginated_url, headers=headers, timeout=10)
            response.raise_for_status()  # Check for HTTP errors
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page}: {e}")
            break
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all movie containers on the page
        movie_containers = soup.find_all('div', class_='lister-item-content')
        
        if not movie_containers:
            print("No movie containers found. The page structure might have changed.")
            break
        
        for container in movie_containers:
            movie = {}
            
            # Extract Title
            title_elem = container.find('h3', class_='lister-item-header')
            movie['title'] = title_elem.a.text.strip() if title_elem and title_elem.a else 'N/A'
            
            # Extract Year and clean it
            year_elem = container.find('span', class_='lister-item-year')
            if year_elem:
                # Remove parentheses and any non-digit characters at start/end
                movie['year'] = year_elem.text.strip('() ')[:4]
            else:
                movie['year'] = 'N/A'
            
            # Extract Rating
            rating_elem = container.find('div', class_='ratings-imdb-rating')
            movie['rating'] = rating_elem.strong.text.strip() if rating_elem and rating_elem.strong else 'N/A'
            
            # Extract Director (and stars)
            # Note: On IMDb search results, this <p> tag contains Director and Stars
            director_elem = container.find('p', class_='')
            if director_elem:
                text_parts = director_elem.text.split('|')
                movie['director'] = text_parts[0].replace('Director:', '').replace('Directors:', '').strip() if len(text_parts) > 0 else 'N/A'
            else:
                movie['director'] = 'N/A'
            
            # Extract Storyline (Plot)
            storyline_elem = container.find_all('p', class_='text-muted')
            # The plot is usually the second 'text-muted' paragraph
            plot_text = 'N/A'
            if len(storyline_elem) > 1:
                plot_text = storyline_elem[1].text.strip()
            elif storyline_elem:
                plot_text = storyline_elem[0].text.strip()
            movie['storyline'] = plot_text
            
            movies_data.append(movie)
        
        # Be polite to IMDb's servers
        time.sleep(1)  
    
    print(f"Scraping complete. Found {len(movies_data)} movies.")
    return movies_data

def save_to_csv(movies_data, filename='imdb_movies_2024.csv'):
    """Saves the scraped movie data to a CSV file."""
    if not movies_data:
        print("No data to save.")
        return
    
    fieldnames = ['title', 'year', 'rating', 'director', 'storyline']
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(movies_data)
        print(f"Data successfully saved to '{filename}'")
    except IOError as e:
        print(f"Error writing to file: {e}")

# Main execution
if __name__ == "__main__":
    # The URL you provided for 2024 feature films
    imdb_url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31"
    
    # Scrape data (start with 2 pages = ~100 movies for testing)
    movies = scrape_imdb_movies(imdb_url, num_pages=2)
    
    # Save to CSV
    save_to_csv(movies, 'imdb_movies_2024.csv')

In [2]:
import pandas as pd


In [4]:
pd.read_csv('imdb_movies_2024.csv')

Unnamed: 0,Movie_Name,Storyline,Release_Month
0,The Beekeeper,A former operative of a powerful organization ...,1
1,A Real Pain,Mismatched cousins reunite for a tour through ...,1
2,Argylle,A reclusive author who writes espionage novels...,1
3,Love Lies Bleeding,Reclusive gym manager Lou falls hard for Jacki...,1
4,Land of Bad,A Delta Force team fights for survival as an A...,1
...,...,...,...
595,Biggest Heist Ever,"Morgan, a hipster rapper otherwise known as Ra...",12
596,Love at 39 Degrees,"Fatih and Kumru, a lawyer from different citie...",12
597,Alangu,Centres around the clashes between Kerala's po...,12
598,Vanvaas,Delves into the dynamics between an elderly fa...,12


In [3]:
import pandas as pd
pd.read_csv('imdb_movies_2024.csv')


Unnamed: 0,Movie_Name,Storyline,Year,Rating,Genres
0,Dune: Part Two,"This action,adventure,drama feature film relea...",2024,8.4,"Action,Adventure,Drama"
1,Deadpool & Wolverine,"This action,adventure,comedy feature film rele...",2024,7.5,"Action,Adventure,Comedy"
2,The Substance,"This drama,horror,sci-fi feature film released...",2024,7.2,"Drama,Horror,Sci-Fi"
3,Furiosa: A Mad Max Saga,"This action,adventure,sci-fi feature film rele...",2024,7.5,"Action,Adventure,Sci-Fi"
4,Gladiator II,"This action,adventure,drama feature film relea...",2024,6.5,"Action,Adventure,Drama"
...,...,...,...,...,...
21067,Cacophony the Animated Movie,"This action,adventure,animation feature film r...",2024,,"Action,Adventure,Animation"
21068,Paint It Red - Making Spiral,This documentary feature film released in 2024...,2024,,Documentary
21069,The Bobby Dunbar Changeling,This documentary feature film released in 2024...,2024,,Documentary
21070,Return to the Dark House of Mystery,This horror feature film released in 2024 star...,2024,,Horror


In [7]:
df = pd.read_csv("imdb_movies_2024.csv")

In [10]:

df.head()

Unnamed: 0,Movie_Name,Storyline,Year,Rating,Genres,Cleaned_Storyline,Processed_Storyline
0,Dune: Part Two,"This action,adventure,drama feature film relea...",2024,8.4,"Action,Adventure,Drama",this actionadventuredrama feature film release...,this actionadventuredrama feature film release...
1,Deadpool & Wolverine,"This action,adventure,comedy feature film rele...",2024,7.5,"Action,Adventure,Comedy",this actionadventurecomedy feature film releas...,this actionadventurecomedy feature film releas...
2,The Substance,"This drama,horror,sci-fi feature film released...",2024,7.2,"Drama,Horror,Sci-Fi",this dramahorrorscifi feature film released in...,this dramahorrorscifi feature film released st...
3,Furiosa: A Mad Max Saga,"This action,adventure,sci-fi feature film rele...",2024,7.5,"Action,Adventure,Sci-Fi",this actionadventurescifi feature film release...,this actionadventurescifi feature film release...
4,Gladiator II,"This action,adventure,drama feature film relea...",2024,6.5,"Action,Adventure,Drama",this actionadventuredrama feature film release...,this actionadventuredrama feature film release...


In [11]:
df.tail()

Unnamed: 0,Movie_Name,Storyline,Year,Rating,Genres,Cleaned_Storyline,Processed_Storyline
21067,Cacophony the Animated Movie,"This action,adventure,animation feature film r...",2024,,"Action,Adventure,Animation",this actionadventureanimation feature film rel...,this actionadventureanimation feature film rel...
21068,Paint It Red - Making Spiral,This documentary feature film released in 2024...,2024,,Documentary,this documentary feature film released in star...,this documentary feature film released stars p...
21069,The Bobby Dunbar Changeling,This documentary feature film released in 2024...,2024,,Documentary,this documentary feature film released in star...,this documentary feature film released stars t...
21070,Return to the Dark House of Mystery,This horror feature film released in 2024 star...,2024,,Horror,this horror feature film released in stars as ...,this horror feature film released stars return...
21071,Lolipop Gang,"This action,comedy,crime feature film released...",2024,,"Action,Comedy,Crime",this actioncomedycrime feature film released i...,this actioncomedycrime feature film released s...


In [None]:
df.shape ---"know rows & columns"

(21072, 7)

In [15]:
df.info() --- "detect missing values & data types"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21072 entries, 0 to 21071
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Movie_Name           21072 non-null  object 
 1   Storyline            21072 non-null  object 
 2   Year                 21072 non-null  int64  
 3   Rating               11315 non-null  float64
 4   Genres               21072 non-null  object 
 5   Cleaned_Storyline    21072 non-null  object 
 6   Processed_Storyline  21072 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.1+ MB


TypeError: bad operand type for unary -: 'str'

In [16]:
df.isnull().sum()

Movie_Name                0
Storyline                 0
Year                      0
Rating                 9757
Genres                    0
Cleaned_Storyline         0
Processed_Storyline       0
dtype: int64

In [None]:
df.head()
df.tail()
