In [29]:
import kagglehub
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download dataset from Kaggle
path = kagglehub.dataset_download("rajugc/imdb-top-250-movies-dataset")

# List dataset files
print("Dataset files:", os.listdir(path))

# Load the correct CSV file
df = pd.read_csv(os.path.join(path, "IMDB Top 250 Movies.csv"))

# Display first few rows
print(df.head())

Dataset files: ['IMDB Top 250 Movies.csv']
   rank                      name  year  rating               genre  \
0     1  The Shawshank Redemption  1994     9.3               Drama   
1     2             The Godfather  1972     9.2         Crime,Drama   
2     3           The Dark Knight  2008     9.0  Action,Crime,Drama   
3     4     The Godfather Part II  1974     9.0         Crime,Drama   
4     5              12 Angry Men  1957     9.0         Crime,Drama   

  certificate run_time                                            tagline  \
0           R   2h 22m  Fear can hold you prisoner. Hope can set you f...   
1           R   2h 55m                         An offer you can't refuse.   
2       PG-13   2h 32m                                    Why So Serious?   
3           R   3h 22m       All the power on earth can't change destiny.   
4    Approved   1h 36m  Life Is In Their Hands -- Death Is On Their Mi...   

      budget  box_office                                           

In [30]:
# Keep relevant columns: Title, Genre, and Tagline
df = df[['name', 'genre', 'tagline']].dropna()

# Combine genre and tagline to create a more meaningful "description"
df['description'] = df['genre'] + " - " + df['tagline']

# Rename for consistency
df = df[['name', 'description']]
df.columns = ['title', 'description']

# Display dataset shape and sample rows
print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (250, 2)


Unnamed: 0,title,description
0,The Shawshank Redemption,Drama - Fear can hold you prisoner. Hope can s...
1,The Godfather,"Crime,Drama - An offer you can't refuse."
2,The Dark Knight,"Action,Crime,Drama - Why So Serious?"
3,The Godfather Part II,"Crime,Drama - All the power on earth can't cha..."
4,12 Angry Men,"Crime,Drama - Life Is In Their Hands -- Death ..."


In [31]:
# Improve TF-IDF vectorizer with relevant vocabulary
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),  # Use unigrams & bigrams
    max_df=0.8,          # Ignore overly common words
    min_df=2,            # Ignore very rare words
    vocabulary=['action', 'thriller', 'sci-fi', 'space', 'adventure', 'comedy']
)

# Convert descriptions to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)  # (num_movies, num_unique_words)


TF-IDF Matrix Shape: (250, 6)


In [32]:
def expand_query(query):
    """
    Expands user query with relevant synonyms for better matching.
    """
    replacements = {
        "space": "sci-fi",
        "thrilling": "action",
        "comedic": "comedy",
        "crime": "thriller"
    }
    for word, synonym in replacements.items():
        query = query.replace(word, f"{word} {synonym}")  # Keep both words
    return query


In [33]:
def recommend_movies(user_query, df, vectorizer, tfidf_matrix, top_n=5):
    """
    Improved: Expands query and boosts action/sci-fi/comedy movies.
    """
    expanded_query = expand_query(user_query)  # Expand query
    user_vector = vectorizer.transform([expanded_query])

    similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Boost movies in relevant genres
    genre_boost = df['description'].apply(lambda x: 0.1 if 'Action' in x or 'Sci-Fi' in x or 'Comedy' in x else 0)
    similarities += genre_boost  # Add boost to similarity scores

    top_indices = similarities.argsort()[-top_n:][::-1]
    
    return df.iloc[top_indices][['title', 'description']].assign(similarity_score=similarities[top_indices])


In [34]:
# Example query
user_query = "I love thrilling action movies set in space, with a comedic twist."

# Get recommendations
recommendations = recommend_movies(user_query, df, tfidf_vectorizer, tfidf_matrix)

# Display results
recommendations


Unnamed: 0,title,description,similarity_score
153,Finding Nemo,"Animation,Adventure,Comedy - 71% of the Earth'...",0.812985
194,Sherlock Jr.,"Action,Comedy,Romance - every inch of footage ...",0.788344
159,"Lock, Stock and Two Smoking Barrels","Action,Comedy,Crime - A Disgrace to Criminals ...",0.788344
50,Alien,"Horror,Sci-Fi - In space no one can hear you s...",0.786677
124,Dangal,"Action,Biography,Drama - You think our girls a...",0.741897
