In [27]:
import os
import re

import kagglehub
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the IMDB Top 250 dataset via kagglehub; the returned path is used
# below as the directory that holds the downloaded files
path = kagglehub.dataset_download("rajugc/imdb-top-250-movies-dataset")

# Show what the download directory contains
dataset_files = os.listdir(path)
print("Dataset files:", dataset_files)

# Read the movie table from the CSV inside that directory
csv_path = os.path.join(path, "IMDB Top 250 Movies.csv")
df = pd.read_csv(csv_path)

# Preview the first rows
print(df.head())

Dataset files: ['IMDB Top 250 Movies.csv']
   rank                      name  year  rating               genre  \
0     1  The Shawshank Redemption  1994     9.3               Drama   
1     2             The Godfather  1972     9.2         Crime,Drama   
2     3           The Dark Knight  2008     9.0  Action,Crime,Drama   
3     4     The Godfather Part II  1974     9.0         Crime,Drama   
4     5              12 Angry Men  1957     9.0         Crime,Drama   

  certificate run_time                                            tagline  \
0           R   2h 22m  Fear can hold you prisoner. Hope can set you f...   
1           R   2h 55m                         An offer you can't refuse.   
2       PG-13   2h 32m                                    Why So Serious?   
3           R   3h 22m       All the power on earth can't change destiny.   
4    Approved   1h 36m  Life Is In Their Hands -- Death Is On Their Mi...   

      budget  box_office                                           

In [30]:
# Normalise the column labels: trim surrounding whitespace, then lowercase
normalized_columns = df.columns.str.strip().str.lower()
df.columns = normalized_columns

# Show the resulting labels to verify the clean-up
print("Available columns after cleaning:", df.columns)


Available columns after cleaning: Index(['rank', 'name', 'year', 'rating', 'genre', 'certificate', 'run_time',
       'tagline', 'budget', 'box_office', 'casts', 'directors', 'writers'],
      dtype='object')


In [31]:
# Build the modelling frame in one non-mutating chain:
#   1. keep only the columns needed for the recommender,
#   2. drop rows where any of them is missing,
#   3. rename 'name' -> 'title' for consistency,
#   4. add a combined 'description' built from genre and tagline,
#   5. keep only the final columns,
#   6. reset the index so row labels equal row positions (the TF-IDF matrix
#      built from this frame is addressed by position).
# Using a chain instead of `inplace=True` / column assignment avoids
# modifying a frame that was derived from a selection of another frame.
df = (
    df[['name', 'genre', 'tagline']]
    .dropna(subset=['name', 'genre', 'tagline'])
    .rename(columns={'name': 'title'})
    .assign(description=lambda frame: frame['genre'] + " - " + frame['tagline'])
    [['title', 'genre', 'description']]
    .reset_index(drop=True)
)

# Print dataset shape and sample rows
print("Dataset shape after preprocessing:", df.shape)
print(df.head())

Dataset shape after preprocessing: (250, 3)
                      title               genre  \
0  The Shawshank Redemption               Drama   
1             The Godfather         Crime,Drama   
2           The Dark Knight  Action,Crime,Drama   
3     The Godfather Part II         Crime,Drama   
4              12 Angry Men         Crime,Drama   

                                         description  
0  Drama - Fear can hold you prisoner. Hope can s...  
1           Crime,Drama - An offer you can't refuse.  
2               Action,Crime,Drama - Why So Serious?  
3  Crime,Drama - All the power on earth can't cha...  
4  Crime,Drama - Life Is In Their Hands -- Death ...  


In [17]:
# Settings for the TF-IDF vectorizer applied to the movie descriptions
tfidf_params = {
    "stop_words": 'english',
    "ngram_range": (1, 2),   # unigrams and bigrams
    "max_df": 0.85,          # keep terms present in at most 85% of documents
    "min_df": 2,             # drop terms present in fewer than 2 documents
    "sublinear_tf": True,    # sublinear term-frequency scaling
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary from the descriptions and vectorize them in one step
descriptions = df['description']
tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (250, 282)


In [18]:
def expand_query(query):
    """
    Expands the user query with related terms for better matching.

    Every trigger word found in the query is kept as written and immediately
    followed by its related terms (e.g. "space" -> "space sci-fi
    intergalactic galaxy").

    Matching is case-insensitive and whole-word only, so "Space" is expanded
    while "spaceship" is left untouched instead of being split apart by a
    substring replacement. All triggers are handled in a single pass, so
    inserted terms are never themselves expanded.

    Parameters
    ----------
    query : str
        Free-text user query.

    Returns
    -------
    str
        The query with related terms inserted after each trigger word.
    """
    replacements = {
        "space": "sci-fi intergalactic galaxy",
        "thrilling": "action suspense intense",
        "comedic": "comedy humorous funny",
        "crime": "thriller heist law",
        "fight": "battle war combat",
        "alien": "extraterrestrial sci-fi"
    }

    # One alternation over all trigger words, anchored on word boundaries.
    trigger_pattern = re.compile(
        r"\b(" + "|".join(re.escape(word) for word in replacements) + r")\b",
        re.IGNORECASE,
    )

    def _append_related_terms(match):
        # Keep the user's original spelling/casing, then add the related terms.
        matched_word = match.group(0)
        return f"{matched_word} {replacements[matched_word.lower()]}"

    return trigger_pattern.sub(_append_related_terms, query)


In [21]:
def recommend_movies(user_query, df, vectorizer, tfidf_matrix, top_n=5,
                     boost_genres=('Action', 'Sci-Fi', 'Comedy'),
                     penalty_genres=('Animation', 'Biography', 'Romance'),
                     boost=0.15, penalty=0.1):
    """
    Recommends the top-N most similar movies based on the user query.

    The query is expanded with `expand_query`, vectorized with `vectorizer`,
    and compared to every row of `tfidf_matrix` by cosine similarity. A fixed
    bonus is then added for movies whose genre string contains any of
    `boost_genres`, and a fixed amount is subtracted for movies whose genre
    string contains any of `penalty_genres`.

    Parameters
    ----------
    user_query : str
        Free-text description of what the user wants.
    df : pandas.DataFrame
        Movie table with 'title', 'description' and 'genre' columns; row i
        must correspond to row i of `tfidf_matrix`.
    vectorizer
        Fitted vectorizer exposing `transform`, used to vectorize the query.
    tfidf_matrix
        Matrix of movie description vectors, one row per row of `df`.
    top_n : int, default 5
        Number of movies to return. 0 yields an empty result; values larger
        than the number of movies return all of them.
    boost_genres, penalty_genres : iterable of str
        Genre keywords (substring match against the 'genre' column).
    boost, penalty : float
        Amount added / subtracted for a boosted / penalized genre.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'description', 'genre' and 'similarity_score'
        (cosine similarity plus genre adjustment), best match first.

    Raises
    ------
    ValueError
        If `top_n` is negative or `df` and `tfidf_matrix` have different
        numbers of rows.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if tfidf_matrix.shape[0] != len(df):
        raise ValueError("df and tfidf_matrix must have the same number of rows")

    # Expand the user query to include related synonyms/terms
    expanded_query = expand_query(user_query)

    # Vectorize the query. No extra scaling is applied: cosine similarity is
    # invariant to the length of the query vector, so multiplying it by a
    # constant cannot change the scores.
    user_vector = vectorizer.transform([expanded_query])

    # Cosine similarity between the query and every movie description
    similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Positional boolean masks (plain NumPy arrays, so the arithmetic below is
    # by row position and independent of df's index labels).
    genres = df['genre']
    is_boosted = genres.apply(
        lambda genre: any(kw in genre for kw in boost_genres)
    ).astype(bool).to_numpy()
    is_penalized = genres.apply(
        lambda genre: any(kw in genre for kw in penalty_genres)
    ).astype(bool).to_numpy()

    # Apply the genre adjustment without mutating the similarity array in place
    adjustment = boost * is_boosted - penalty * is_penalized
    scores = similarities + adjustment

    # Best scores first. A stable sort on the negated scores keeps earlier
    # rows ahead on ties, and slicing from the front handles top_n == 0
    # (empty result) and top_n > len(df) (all rows) correctly.
    top_indices = (-scores).argsort(kind='stable')[:top_n]

    # Return the top recommended movies along with their adjusted score
    return df.iloc[top_indices][['title', 'description', 'genre']].assign(
        similarity_score=scores[top_indices]
    )

In [22]:
# Free-text description of what the user would like to watch
user_query = "I love thrilling action movies set in space, with a comedic twist."

# Rank the movies against the query, relying on the function's default top_n
recommendations = recommend_movies(
    user_query,
    df,
    tfidf_vectorizer,
    tfidf_matrix,
)
print("Top Recommendations:")
print(recommendations)

Unnamed: 0,title,description,genre,similarity_score
50,Alien,"Horror,Sci-Fi - In space no one can hear you s...","Horror,Sci-Fi",0.687342
28,Terminator 2: Judgment Day,"Action,Sci-Fi - It's nothing personal.","Action,Sci-Fi",0.600087
13,Inception,"Action,Adventure,Sci-Fi - Your mind is the sce...","Action,Adventure,Sci-Fi",0.505708
175,Blade Runner,"Action,Drama,Sci-Fi - A Futuristic Vision Perf...","Action,Drama,Sci-Fi",0.503562
29,Back to the Future,"Adventure,Comedy,Sci-Fi - He's the only kid ev...","Adventure,Comedy,Sci-Fi",0.49427
