STEP 1 : Data Scraping and Storage

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import csv

# Set up WebDriver
driver = webdriver.Chrome()
driver.maximize_window()

# IMDb 2024 movie URL
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31"

movie_data = []
target_movies = 1000  

def scrape_current_page():
    """Scrape all movies on the current page."""
    movies = driver.find_elements(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')
    
    for movie in movies:
        try:
            # Movie title
            title_elem = movie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text")
            title = title_elem.text
            if '.' in title.split()[0]:
                title = ' '.join(title.split()[1:])

            # Description
            try:
                desc = movie.find_element(By.CSS_SELECTOR, "div.ipc-html-content-inner-div").text
            except:
                desc = "N/A"

            # Rating (may not be available)
            try:
                rating = movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--rating").text
            except:
                rating = "N/A"

            # Vote count
            try:
                votes = movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--voteCount").text.strip('()')
            except:
                votes = "N/A"

            # Image (get URL, not text)
            try:
                image_elem = movie.find_element(By.CSS_SELECTOR, "img.ipc-image")
                image_url = image_elem.get_attribute("src")
            except:
                image_url = "N/A"

            # Duration and Year from metadata
            duration = "N/A"
            year = "N/A"
            try:
                metadata_items = movie.find_elements(By.CSS_SELECTOR, "span.dli-title-metadata-item")
                for item in metadata_items:
                    text = item.text.strip()
                    if 'h' in text or 'm' in text:  # Detect duration like "2h 1m"
                        duration = text
                    elif text.isdigit() and len(text) == 4:  # Detect year like "2024"
                        year = text
            except:
                pass
    


            if [title, desc, rating, votes, image_url, duration, year] not in movie_data:
                movie_data.append([title, desc, rating, votes, image_url, duration, year])

        except Exception as e:
            print(f"⚠️ Skipping a movie due to error: {str(e)[:100]}")

def click_load_more():
    """Click the 'Load More' button if present."""
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        
        load_more_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ipc-see-more__button'))
        )
        driver.execute_script("arguments[0].click();", load_more_button)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item:last-child'))
        )
        time.sleep(2)
        return True
    except (NoSuchElementException, TimeoutException):
        print("🔚 No more 'Load More' button or timeout.")
        return False
    except Exception as e:
        print(f"⚠️ Error clicking 'Load More': {str(e)[:100]}")
        return False

# Main execution
try:
    driver.get(url)
    time.sleep(3)

    while len(movie_data) < target_movies:
        prev = len(movie_data)
        scrape_current_page()
        new = len(movie_data) - prev
        print(f"✅ Scraped {new} new movies. Total: {len(movie_data)}")
        
        if len(movie_data) >= target_movies:
            break

        if not click_load_more():
            break

except Exception as e:
    print(f"❌ Error during scraping: {str(e)}")

finally:
    with open('imdb_movies_2024.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Movie Name', 'Storyline', 'Rating', 'Voting Count', 'Image URL', 'Duration', 'Year'])
        writer.writerows(movie_data)

    print(f"Saved data to imdb_movies_2024.csv with {len(movie_data)} entries.")
    driver.quit()

✅ Scraped 50 new movies. Total: 50
✅ Scraped 50 new movies. Total: 100
✅ Scraped 50 new movies. Total: 150
✅ Scraped 50 new movies. Total: 200
✅ Scraped 50 new movies. Total: 250
✅ Scraped 50 new movies. Total: 300
✅ Scraped 50 new movies. Total: 350
✅ Scraped 50 new movies. Total: 400
✅ Scraped 50 new movies. Total: 450
✅ Scraped 50 new movies. Total: 500
✅ Scraped 50 new movies. Total: 550
✅ Scraped 50 new movies. Total: 600
✅ Scraped 50 new movies. Total: 650
✅ Scraped 50 new movies. Total: 700
✅ Scraped 50 new movies. Total: 750
✅ Scraped 50 new movies. Total: 800
✅ Scraped 50 new movies. Total: 850
✅ Scraped 50 new movies. Total: 900
✅ Scraped 50 new movies. Total: 950
✅ Scraped 50 new movies. Total: 1000
Saved data to imdb_movies_2024.csv with 1000 entries.


In [2]:
import pandas as pd

data = pd.read_csv('imdb_movies_2024.csv')
print(data.head())  

           Movie Name                                          Storyline  \
0            Conclave  When Cardinal Lawrence is tasked with leading ...   
1            Babygirl  A high-powered CEO puts her career and family ...   
2     Fight or Flight  A mercenary takes on the job of tracking down ...   
3  A Complete Unknown  In 1961, an unknown 19-year-old Bob Dylan arri...   
4               Anora  A young stripper from Brooklyn meets and impul...   

   Rating Voting Count                                          Image URL  \
0     7.4        (184K  https://m.media-amazon.com/images/M/MV5BYjgxMD...   
1     5.9         (58K  https://m.media-amazon.com/images/M/MV5BMmUwOD...   
2     6.4         (11K  https://m.media-amazon.com/images/M/MV5BMmE4YT...   
3     7.4         (82K  https://m.media-amazon.com/images/M/MV5BYTA2NT...   
4     7.5        (192K  https://m.media-amazon.com/images/M/MV5BYThiN2...   

  Duration  Year  
0       2h  2024  
1   1h 54m  2024  
2   1h 42m  2024  
3   

In [3]:
data.shape

(1000, 7)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Movie Name    1000 non-null   object 
 1   Storyline     998 non-null    object 
 2   Rating        998 non-null    float64
 3   Voting Count  998 non-null    object 
 4   Image URL     1000 non-null   object 
 5   Duration      995 non-null    object 
 6   Year          1000 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 54.8+ KB


In [5]:
data.isnull().sum()

Movie Name      0
Storyline       2
Rating          2
Voting Count    2
Image URL       0
Duration        5
Year            0
dtype: int64

In [6]:
import nltk
nltk.data.path.append("C:/Users/slnle/AppData/Local/nltk_data")  # Your path from the error message

# Verify the path
print(nltk.data.path)

['C:\\Users\\slnle/nltk_data', 'c:\\Users\\slnle\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data', 'c:\\Users\\slnle\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data', 'c:\\Users\\slnle\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data', 'C:\\Users\\slnle\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'C:/Users/slnle/AppData/Local/nltk_data']


In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
print(stopwords.words('english')[:10])


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slnle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 1. NLTK Nuclear Option - Download EVERYTHING properly
def nuclear_nltk_download():
    print("Performing complete NLTK setup...")
    nltk_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
    os.makedirs(nltk_dir, exist_ok=True)
    
    # Set download directory
    nltk.download('popular', download_dir=nltk_dir)
    nltk.download('punkt', download_dir=nltk_dir)
    nltk.download('stopwords', download_dir=nltk_dir)
    nltk.download('wordnet', download_dir=nltk_dir)
    nltk.download('omw-1.4', download_dir=nltk_dir)  # Required for WordNet
    nltk.download('punkt_tab', download_dir=nltk_dir)  # Specific missing resource
    
    # Refresh paths
    nltk.data.path.append(nltk_dir)
    print("NLTK setup complete. Resources available at:", nltk_dir)

# Execute the nuclear option
nuclear_nltk_download()

# 2. Verify all resources are accessible
def verify_nltk():
    try:
        word_tokenize("test")  # Verify punkt
        stopwords.words('english')  # Verify stopwords
        WordNetLemmatizer()  # Verify wordnet
        return True
    except LookupError as e:
        print(f"Missing resource: {e}")
        return False

if not verify_nltk():
    print("Critical NLTK resources missing. Trying manual fix...")
    nuclear_nltk_download()
    if not verify_nltk():
        raise EnvironmentError("Failed to setup NLTK. Please check internet connection.")

# 3. Load and preprocess data
try:
    data = pd.read_csv('imdb_movies_2024.csv', on_bad_lines='skip', encoding='latin1')
except Exception as e:
    print(f"Error loading CSV: {e}")
    exit()

# 4. Text processing with ultimate safeguards
def clean_text(text):
    try:
        if not isinstance(text, str) or not text.strip():
            return ""
        
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
        
        # Tokenize with fallback
        try:
            tokens = word_tokenize(text)
        except:
            nuclear_nltk_download()
            tokens = word_tokenize(text)
            
        # Stopwords with fallback
        try:
            stop_words = set(stopwords.words('english'))
        except:
            nuclear_nltk_download()
            stop_words = set(stopwords.words('english'))
            
        tokens = [word for word in tokens if word not in stop_words]
        
        # Lemmatization with fallback
        try:
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        except:
            nuclear_nltk_download()
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
            
        # Capitalize the first letter of each sentence
        sentences = text.split('. ')
        sentences = [sentence.capitalize() for sentence in sentences]  # Capitalize first letter of each sentence
        cleaned_text = '. '.join(sentences)
        
        return cleaned_text
    except Exception as e:
        print(f"Error processing text: {str(e)[:100]}...")
        return ""

# 5. Process data with progress feedback
print("\nStarting text processing...")
data['Cleaned_Storyline'] = data['Storyline'].fillna('').apply(clean_text)

# 6. Verify and save results
print("\nProcessing complete. Sample results:")
print(data[['Movie Name', 'Storyline', 'Cleaned_Storyline']].head(3))
data.to_csv('cleaned_movies.csv', index=False)
print(f"\nSuccessfully processed {len(data)} records. Saved to 'cleaned_movies.csv'")


Performing complete NLTK setup...


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users\slnle\nltk_data...
[nltk_data]    |   Package movie_reviews is a

NLTK setup complete. Resources available at: C:\Users\slnle\nltk_data

Starting text processing...

Processing complete. Sample results:
        Movie Name                                          Storyline  \
0         Conclave  When Cardinal Lawrence is tasked with leading ...   
1         Babygirl  A high-powered CEO puts her career and family ...   
2  Fight or Flight  A mercenary takes on the job of tracking down ...   

                                   Cleaned_Storyline  
0  When cardinal lawrence is tasked with leading ...  
1  A highpowered ceo puts her career and family o...  
2  A mercenary takes on the job of tracking down ...  

Successfully processed 1000 records. Saved to 'cleaned_movies.csv'


In [9]:
df=pd.read_csv("cleaned_movies.csv")

In [10]:
df

Unnamed: 0,Movie Name,Storyline,Rating,Voting Count,Image URL,Duration,Year,Cleaned_Storyline
0,Conclave,When Cardinal Lawrence is tasked with leading ...,7.4,(184K,https://m.media-amazon.com/images/M/MV5BYjgxMD...,2h,2024,When cardinal lawrence is tasked with leading ...
1,Babygirl,A high-powered CEO puts her career and family ...,5.9,(58K,https://m.media-amazon.com/images/M/MV5BMmUwOD...,1h 54m,2024,A highpowered ceo puts her career and family o...
2,Fight or Flight,A mercenary takes on the job of tracking down ...,6.4,(11K,https://m.media-amazon.com/images/M/MV5BMmE4YT...,1h 42m,2024,A mercenary takes on the job of tracking down ...
3,A Complete Unknown,"In 1961, an unknown 19-year-old Bob Dylan arri...",7.4,(82K,https://m.media-amazon.com/images/M/MV5BYTA2NT...,2h 21m,2024,In an unknown yearold bob dylan arrives in ne...
4,Anora,A young stripper from Brooklyn meets and impul...,7.5,(192K,https://m.media-amazon.com/images/M/MV5BYThiN2...,2h 19m,2024,A young stripper from brooklyn meets and impul...
...,...,...,...,...,...,...,...,...
995,God's Not Dead: In God We Trust,"Amid political and spiritual turmoil, Reverend...",3.4,(1.3K,https://m.media-amazon.com/images/M/MV5BYTk0MG...,1h 36m,2024,Amid political and spiritual turmoil reverend ...
996,The Vampire Next Door,"Introverted Cameron, is desperately in love wi...",4.6,(778,https://m.media-amazon.com/images/M/MV5BZWJiOD...,1h 50m,2024,Introverted cameron is desperately in love wit...
997,Believer,A struggling writer finds herself at the cente...,4.6,(406,https://m.media-amazon.com/images/M/MV5BNjI5NW...,1h 48m,2024,A struggling writer finds herself at the cente...
998,Drowning Dry,To celebrate Lukas' victory at the martial art...,6.2,(441,https://m.media-amazon.com/images/M/MV5BMjhkZj...,1h 28m,2024,To celebrate lukas victory at the martial arts...


In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit features to 5000 most frequent words

# Fit and transform the cleaned storylines
tfidf_matrix = tfidf.fit_transform(df['Cleaned_Storyline'])

# Save TF-IDF vectorizer model and matrix to files
pickle.dump(tfidf, open('tfidf_model.pkl', 'wb'))
pickle.dump(tfidf_matrix, open('tfidf_matrix.pkl', 'wb'))

print("TF-IDF model and matrix saved successfully!")


TF-IDF model and matrix saved successfully!


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save similarity matrix
pickle.dump(cosine_sim, open('cosine_sim.pkl', 'wb'))

In [30]:
def recommend_movies(movie_name, cosine_sim_matrix, df, top_n=5):
    """
    Get top N similar movies based on cosine similarity of storylines.
    
    Args:
        movie_name (str): Name of the movie to find similar movies for
        cosine_sim_matrix (np.array): Precomputed cosine similarity matrix
        df (pd.DataFrame): DataFrame containing movie data
        top_n (int): Number of recommendations to return
        
    Returns:
        pd.DataFrame: Recommended movies or None if input movie not found
    """
    # Find movie indices (case-insensitive and partial match)
    matches = df[df['Movie Name'].str.contains(movie_name, case=False, regex=False)]
    
    if len(matches) == 0:
        print(f"Movie '{movie_name}' not found in dataset. Try another title.")
        print("Some available movies:", df['Movie Name'].head(10).tolist())
        return None
    
    # Use first match if multiple found
    movie_index = matches.index[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim_matrix[movie_index]))
    
    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N similar movies (skip the movie itself)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return df.iloc[movie_indices][['Movie Name', 'Storyline', 'Cleaned_Storyline']]

In [31]:
# Try with different variations until you find a match
recommendations = recommend_movies("Nightbitch", cosine_sim, data)  # Case-insensitive
if recommendations is not None:
    print(recommendations)


           Movie Name                                          Storyline  \
165              MadS  A teenager stops off to see his dealer to test...   
839      Take My Hand  At the peak of her career in London, an Austra...   
158           Frewaka  Follow a student of nursing palliative care, w...   
954           Invader  A young woman arrives in the Chicago suburbs a...   
336  We Bury the Dead  Ava, a desperate woman whose husband is missin...   

                                     Cleaned_Storyline  
165  A teenager stops off to see his dealer to test...  
839  At the peak of her career in london an austral...  
158  Follow a student of nursing palliative care wh...  
954  A young woman arrives in the chicago suburbs a...  
336  Ava a desperate woman whose husband is missing...  


In [32]:
# Check exact movie names in your dataset
print(data['Movie Name'].unique())

# Search for partial matches
print(data[data['Movie Name'].str.contains('incep', case=False)])

['Conclave' 'Babygirl' 'Fight or Flight' 'A Complete Unknown' 'Anora'
 'Nosferatu' 'The Assessment' 'Rust' 'Gladiator II' 'The Substance'
 'The Surfer' 'Speak No Evil' 'Queen of the Ring' 'Heretic' 'Friendship'
 'The Luckiest Man in America' 'Freaky Tales' 'Wicked' 'The Brutalist'
 'Beetlejuice Beetlejuice' 'The Order' 'Deadpool & Wolverine' 'Twisters'
 'A Real Pain' 'Mufasa: The Lion King' 'Dune: Part Two' 'Alien: Romulus'
 'Trap' 'Kraven the Hunter' 'Sharp Corner' 'Longlegs' 'The Wild Robot'
 'Parthenope' 'Civil War' 'Flow' 'Smile 2' 'It Ends with Us' 'Moana 2'
 'The Ministry of Ungentlemanly Warfare' 'The Penguin Lessons'
 'Winter Spring Summer or Fall' 'The Friend' 'The Apprentice'
 'We Live in Time' "I'm Still Here" 'Sonic the Hedgehog 3'
 'Venom: The Last Dance' 'Blink Twice' 'Juror #2'
 'Furiosa: A Mad Max Saga' 'The Life of Chuck' 'The Damned'
 'Paddington in Peru' 'The Count of Monte-Cristo'
 'The Haunted Apartment: Miss K' 'The Beekeeper' 'Cuckoo' 'Megalopolis'
 'The Fall Guy