In [None]:
# Project: Movie Recommendation System

In [6]:
# Content-Based Movie Recommendation System using TMDB 5000 Dataset

# Step 1: Import Libraries
import ast
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Load Dataset
# The dataset folder can be overridden with the TMDB_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original local kagglehub cache location.
DATA_DIR = Path(os.environ.get(
    'TMDB_DATA_DIR',
    'C:\\Users\\Saikat\\.cache\\kagglehub\\datasets\\tmdb\\tmdb-movie-metadata\\versions\\2',
))
movies = pd.read_csv(DATA_DIR / 'tmdb_5000_movies.csv')
credits = pd.read_csv(DATA_DIR / 'tmdb_5000_credits.csv')

# Step 3: Merge Datasets on 'title'
# NOTE(review): if two different movies share a title, this join pairs every
# movie row with every credits row of that title. Presumably a join on the
# numeric movie id would be safer -- confirm the id column names in both files.
movies = movies.merge(credits, on='title')

# Step 4: Keep Only Required Columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Step 5: Drop Missing Data
# Re-number the rows afterwards so that index labels equal row positions;
# later steps index a NumPy similarity matrix with values taken from this index.
movies = movies.dropna().reset_index(drop=True)

# Step 6: Convert JSON-like Strings to Python Lists
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is a string holding a Python/JSON-style literal (a sequence of
    dictionaries); it is parsed with `ast.literal_eval` and the names are
    returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Swap the raw literal strings in both columns for plain lists of names.
movies['genres'] = movies['genres'].map(convert)
movies['keywords'] = movies['keywords'].map(convert)

# Step 7: Get Top 3 Cast Members
def convert_cast(obj, top_n=3):
    """Return the names of the first `top_n` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        String holding a Python/JSON-style literal list of dictionaries, each
        with a 'name' key. Parsed with `ast.literal_eval`.
    top_n : int, default 3
        Maximum number of names to keep, counted from the start of the list.
        Values below zero are treated as zero.

    Returns
    -------
    list
        Up to `top_n` names, in the order they appear in `obj`.
    """
    # Slicing replaces the manual counter/break loop; entries past the limit
    # are never inspected, exactly as before.
    leading = ast.literal_eval(obj)[:max(top_n, 0)]
    return [member['name'] for member in leading]

# Reduce every cast entry to the list of names produced by convert_cast.
movies['cast'] = movies['cast'].map(convert_cast)

# Step 8: Extract Director's Name
def get_director(obj):
    """Return a one-element list with the first 'Director' name, or [] if none.

    `obj` is a string holding a literal list of dictionaries with 'job' and
    'name' keys; it is parsed with `ast.literal_eval` and scanned in order.
    """
    directors = (
        [member['name']]
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    )
    # next() stops at the first match; the default covers "no director listed".
    return next(directors, [])

movies['crew'] = movies['crew'].apply(get_director)

# Step 9: Split Overview into Words
movies['overview'] = movies['overview'].apply(str.split)

# Step 10: Remove Spaces in Multi-word Names
# Collapsing "Sam Worthington" to "SamWorthington" keeps each name as a single
# token when the tags are vectorized below.
for feature in ['genres', 'keywords', 'cast', 'crew']:
    movies[feature] = movies[feature].apply(lambda names: [name.replace(" ", "") for name in names])

# Step 11: Create a New 'tags' Column
# Every operand is a column of lists, so "+" concatenates the lists row by row.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Step 12: Create New DataFrame
# Rows were dropped earlier, so the inherited index may have gaps. Resetting it
# makes index labels equal row positions, which is what indexing the NumPy
# similarity matrix with a value taken from this index requires.
new = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

# Step 13: Convert Tags List to String
# Step 14: Convert Tags to Lowercase
new['tags'] = new['tags'].apply(" ".join).str.lower()

# Step 15: Vectorization
# Bag-of-words counts over the tags, limited to 5000 features, with
# scikit-learn's built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new['tags']).toarray()

# Step 16: Compute Cosine Similarity
# Pairwise similarity between all rows of `vectors`; row i corresponds to the
# i-th row (by position) of `new`.
similarity = cosine_similarity(vectors)

# Step 17: Movie Recommendation Function
def recommend(movie, top_n=5):
    """Return the titles most similar to `movie`, best match first.

    Parameters
    ----------
    movie : str
        Title to look up; matching against `new['title']` is case-insensitive.
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Maximum number of titles to return. Values below zero are treated as
        zero.

    Returns
    -------
    list of str
        Up to `top_n` titles ordered by descending similarity, or
        ["Movie not found"] when no title matches.
    """
    wanted = movie.lower()
    # Work with row POSITIONS, not index labels: `similarity` is a plain array,
    # so a label taken from a non-contiguous index would select the wrong row.
    hits = np.flatnonzero((new['title'].str.lower() == wanted).to_numpy())
    if hits.size == 0:
        return ["Movie not found"]
    position = int(hits[0])

    # A stable sort on the negated scores gives descending order while keeping
    # the original row order among ties.
    ranked = np.argsort(-np.asarray(similarity[position]), kind='stable')
    # Drop the query row explicitly rather than assuming it is ranked first;
    # another row with similarity 1.0 could otherwise push it into the results.
    picks = [int(p) for p in ranked if p != position][:max(top_n, 0)]
    return new['title'].iloc[picks].tolist()

# Step 18: Example Usage
if __name__ == "__main__":
    # Demo: print a header, then the recommendations for one known title.
    print("Recommended movies for 'Avatar':")
    avatar_matches = recommend('Avatar')
    print(avatar_matches)


Recommended movies for 'Avatar':
['Titan A.E.', 'Small Soldiers', 'Independence Day', "Ender's Game", 'Aliens vs Predator: Requiem']
