In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the IMDb Top 1000 movies dataset from a CSV file in the working directory
df = pd.read_csv("imdb_top_1000.csv")
# Preview the first rows to confirm the columns loaded as expected
df.head()


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# Check for missing values in the three text columns used to build the
# combined features (Director, Genre, Overview); other columns are not inspected.
print("🔍 Checking for missing values:")
print(df[['Director', 'Genre', 'Overview']].isnull().sum())


🔍 Checking for missing values:
Director    0
Genre       0
Overview    0
dtype: int64


In [4]:
# Replace any missing values in the text columns with empty strings
# to prevent errors during later processing.
df[['Director', 'Genre', 'Overview']] = (
    df[['Director', 'Genre', 'Overview']].fillna('')
)


In [5]:
# Build the text used for vectorization: director, genre and overview
# joined into one space-separated string per movie.
df['combined_features'] = df['Director'].str.cat(
    df[['Genre', 'Overview']], sep=' '
)


In [6]:
# To compare movies, we combine Director, Genre, and Overview into a single string.
# This helps the model understand the movie's content using TF-IDF.


In [7]:
# Convert the combined text into a TF-IDF feature matrix, one row per movie.
# stop_words='english' removes scikit-learn's built-in English stop words
# so very common words do not contribute to the similarity scores.
vectorizer = TfidfVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(df['combined_features'])


In [8]:
# TF-IDF (Term Frequency-Inverse Document Frequency) helps us convert movie descriptions into numeric vectors.
# We use stop_words='english' to remove common unhelpful words like 'the', 'is', etc.


In [9]:
# Compute the cosine similarity between every pair of rows in feature_matrix.
# cosine_sim[i][j] is the similarity of movie i to movie j, in df row order.
cosine_sim = cosine_similarity(feature_matrix)


In [10]:
# Cosine similarity gives us a score (from 0 to 1) showing how similar two movies are.
# Higher score = more similar content.


In [11]:
def recommend_movies(title, df, cosine_sim, top_n=5):
    """Return the titles most similar to ``title`` by cosine similarity.

    Parameters
    ----------
    title : str
        Movie title to look up in ``df['Series_Title']``. An exact match is
        preferred; if none exists, a case- and whitespace-insensitive match
        is tried so that input such as ``"the dark knight "`` still resolves.
    df : pandas.DataFrame
        Frame with a ``Series_Title`` column whose row order matches the
        rows/columns of ``cosine_sim``.
    cosine_sim : 2-D array-like
        Square similarity matrix; ``cosine_sim[i][j]`` is the similarity
        between the movies at row positions ``i`` and ``j`` of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first, never including the
        queried movie itself.
    str
        An error message when the title is not present in the dataset.
    """
    titles = df['Series_Title']

    # Work with row *positions*, not index labels: cosine_sim and .iloc are
    # positional, so labels would be wrong for any non-default index.
    matches = (titles == title).to_numpy().nonzero()[0]
    if matches.size == 0 and isinstance(title, str):
        # Fall back to a forgiving comparison for user-typed input.
        wanted = title.strip().casefold()
        normalized_titles = titles.astype(str).str.strip().str.casefold()
        matches = (normalized_titles == wanted).to_numpy().nonzero()[0]

    if matches.size == 0:
        return f"❌ Movie '{title}' not found in the dataset."

    # If the title occurs more than once, use the first occurrence.
    position = int(matches[0])

    # Exclude the queried movie explicitly. Dropping the first sorted entry is
    # not reliable: another movie can tie for the top score (identical
    # features) and sort ahead of it, which would recommend the movie itself.
    sim_scores = [
        (other, score)
        for other, score in enumerate(cosine_sim[position])
        if other != position
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [other for other, _ in sim_scores[:max(top_n, 0)]]
    return titles.iloc[top_positions].tolist()


In [12]:
# This function takes a movie title, finds its row in the dataset, and ranks the other movies
# by their precomputed cosine-similarity scores to it.
# It returns the top 5 most similar titles, or an error message if the title is not in the dataset.


In [13]:
print("🎬 Movie Recommender System (Content Based)")
print("Try movies like: The Dark Knight, Inception, Titanic, The Godfather, Interstellar\n")
# Strip stray leading/trailing whitespace so it does not defeat the title lookup.
user_input = input("Enter a movie name: ").strip()

results = recommend_movies(user_input, df, cosine_sim)

# recommend_movies returns a list of titles on success and an error string
# otherwise; only print the recommendations header when there are results.
if isinstance(results, list):
    print("\n📽️ Top 5 Recommended Movies:")
    for i, movie in enumerate(results, 1):
        print(f"{i}. {movie}")
else:
    print(f"\n{results}")


🎬 Movie Recommender System (Content Based)
Try movies like: The Dark Knight, Inception, Titanic, The Godfather, Interstellar


📽️ Top 5 Recommended Movies:
1. Call Me by Your Name
2. As Good as It Gets
3. The Notebook
4. Aliens
5. The Straight Story


In [14]:
# This part takes a movie name from the user and prints the top 5 similar movies.
# If the movie doesn't exist in the dataset, it shows an error message.
