In [None]:
# STEP 1 – Import Libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [None]:
# STEP 2 – Load Data
# Reads each CSV under `data_path` into its own module-level DataFrame.

# Set path to your data folder
# (a plain string ending in "/" because the reads below build file paths by
# string concatenation)
data_path = "data/"

# Load datasets
credits = pd.read_csv(data_path + "credits.csv")
keywords = pd.read_csv(data_path + "keywords.csv")
links = pd.read_csv(data_path + "links.csv")
links_small = pd.read_csv(data_path + "links_small.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
movies_metadata = pd.read_csv(data_path + "movies_metadata.csv", low_memory=False)
ratings = pd.read_csv(data_path + "ratings.csv")
# NOTE(review): the other files follow a "<name>.csv" / "<name>_small.csv"
# pattern (links / links_small), but this one is "rating_small" next to
# "ratings" — possibly a typo for "ratings_small.csv"; confirm the file name on disk.
rating_small = pd.read_csv(data_path + "rating_small.csv")

In [None]:
# STEP 3 – Clean & Prepare Data
# Builds `movies`: one row per movie id, with the credits and keywords columns
# attached to the metadata.

# Convert IDs to numeric safely: values that are not valid numbers become NaN
# instead of raising.
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

# Drop metadata rows whose id could not be parsed
movies_metadata.dropna(subset=['id'], inplace=True)

# A left merge emits one output row per matching right-hand row, so any id that
# appears more than once in a frame would multiply that movie in `movies`
# (and later in the similarity matrix and the recommendations).
# Keep the first row per id on every side before joining.
metadata_unique = movies_metadata.drop_duplicates(subset='id')
credits_unique = credits.dropna(subset=['id']).drop_duplicates(subset='id')
keywords_unique = keywords.dropna(subset=['id']).drop_duplicates(subset='id')

# Merge additional features. validate='one_to_one' makes pandas raise if the
# key is unexpectedly repeated on either side instead of silently duplicating rows.
movies = (
    metadata_unique
    .merge(credits_unique, on='id', how='left', validate='one_to_one')
    .merge(keywords_unique, on='id', how='left', validate='one_to_one')
)

In [None]:
# STEP 4 – Create Content Features

# We'll combine genres, keywords, overview, cast, and crew into a single 'soup' text column for the vectorizer.

# Replace missing values before any string handling:
# the free-text overview becomes '', and the columns that are later parsed
# with ast.literal_eval become the literal text '[]'.
movies['overview'] = movies['overview'].fillna('')
for list_column in ('genres', 'keywords', 'cast', 'crew'):
    movies[list_column] = movies[list_column].fillna('[]')

import ast

def convert(obj):
    """Collect the 'name' fields of a stringified list of dicts into one string.

    Args:
        obj: Text of a Python literal such as "[{'id': 28, 'name': 'Action'}]".
            An already-parsed list or tuple is accepted as well. None, NaN,
            other non-text values and blank strings mean "no entries".

    Returns:
        str: The 'name' values joined by single spaces, in their original
        order; '' when there is nothing to extract. Elements that are not
        dicts, or dicts without a 'name' key, are skipped.

    Raises:
        ValueError, SyntaxError: If `obj` is a non-blank string that is not a
            valid Python literal (propagated from ast.literal_eval). This
            includes text that has already been converted once.
    """
    if isinstance(obj, str):
        if not obj.strip():
            return ''
        parsed = ast.literal_eval(obj)
    elif isinstance(obj, (list, tuple)):
        parsed = obj
    else:
        # None, float NaN, or any other non-text cell: nothing to parse.
        return ''

    if not isinstance(parsed, (list, tuple)):
        # A literal that is not a sequence (e.g. a number) carries no entries.
        return ''

    return ' '.join(
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    )

# Turn each stringified list column into a plain space-separated string
for list_column in ('genres', 'keywords', 'cast', 'crew'):
    movies[list_column] = movies[list_column].apply(convert)

# Create a text soup: genres, keywords, overview, cast and crew, in that order,
# joined with single spaces
soup = movies['genres']
for text_column in ('keywords', 'overview', 'cast', 'crew'):
    soup = soup + ' ' + movies[text_column]
movies['soup'] = soup

In [None]:
# STEP 5 – Vectorize Content Features

# Turn each movie's 'soup' text into a TF-IDF weighted row, ignoring
# scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['soup'])

# Compute similarity matrix:
# linear_kernel is a plain dot product; TfidfVectorizer L2-normalises rows by
# default (norm='l2'), so the dot product of two rows is their cosine similarity.
# NOTE(review): the result holds one score per pair of movies, so its size
# grows quadratically with len(movies) — confirm it fits in memory for the
# full dataset.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# STEP 6 – Create Movie Title Index

# Map each title to the row index of its first occurrence in `movies`.
# Series.drop_duplicates() compares the *values* (row indices, already all
# distinct), so it never removed repeated titles. De-duplicate on the index
# instead; otherwise looking up a repeated title returns several positions
# rather than a single one. Rows without a title are left out of the lookup.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[indices.index.notna() & ~indices.index.duplicated(keep='first')]

In [None]:
# STEP 7 – Collaborative Filtering Model (SVD)

# Fit SVD on 80% of the ratings, score it on the remaining 20%, and separately
# cross-validate on all ratings. Prepare ratings data for Surprise.

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train-test split (scikit-learn's splitter, so both parts are DataFrames)
trainset, testset = train_test_split(ratings, test_size=0.20, random_state=42)

# Wrap the training rows for Surprise
train_data = Dataset.load_from_df(trainset[['userId', 'movieId', 'rating']], reader)
trainset_full = train_data.build_full_trainset()

# Fit SVD (seeded so a rerun reproduces the same model)
algo = SVD(random_state=42)
algo.fit(trainset_full)

# Measure performance on the held-out rows, which the fitted model has not seen.
# algo.test expects (user id, item id, true rating) tuples.
holdout_rows = list(
    testset[['userId', 'movieId', 'rating']].itertuples(index=False, name=None)
)
holdout_predictions = algo.test(holdout_rows)
holdout_errors = np.array([p.r_ui - p.est for p in holdout_predictions])
print(f"Hold-out RMSE: {np.sqrt(np.mean(holdout_errors ** 2)):.4f}")
print(f"Hold-out MAE:  {np.mean(np.abs(holdout_errors)):.4f}")

# Optional cross-validation:
# cross_validate calls fit() on the estimator it receives for every fold, so it
# gets its own SVD instance; passing `algo` would overwrite the model fitted
# above with one trained on the last fold.
cv_results = cross_validate(
    SVD(random_state=42), data, measures=['RMSE', 'MAE'], cv=5, verbose=True
)

In [None]:
# STEP 8 – Define Recommendation Functions

# Function: Recommend Similar Movies

def get_recommendations(title, cosine_sim=cosine_sim, top_n=100):
    """Return the movies whose content is most similar to `title`.

    Args:
        title: Movie title to look up in the module-level `indices` Series.
        cosine_sim: Similarity matrix indexed by row position; `cosine_sim[i]`
            holds the scores of row i of `movies` against every row. Defaults
            to the matrix that exists when this function is defined.
        top_n: Maximum number of rows to return (default 100).

    Returns:
        pandas.DataFrame with columns 'title' and 'genres', ordered from most
        to least similar and never containing the query row itself. An empty
        frame with the same columns is returned when the title is unknown.
    """
    idx = indices.get(title)
    if idx is None:
        # An empty frame (not a list) lets callers use DataFrame methods such
        # as .head() or merge on the result without special-casing.
        return pd.DataFrame(columns=['title', 'genres'])

    # `indices` can hold several rows for a repeated title; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: ties keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly; it is not guaranteed to rank first when
    # another row has an identical score.
    ranked = ranked[ranked != idx][:top_n]

    return movies.iloc[ranked][['title', 'genres']]

# Function: Recommend by Genre
def recommend_by_genre(genre_name, top_n=100):
    """Return movies whose 'genres' text contains `genre_name`.

    Args:
        genre_name: Text to search for, case-insensitively. It is passed to
            Series.str.contains with its default settings, so it is
            interpreted as a regular expression.
        top_n: Maximum number of rows to return (default 100).

    Returns:
        pandas.DataFrame with columns 'title' and 'genres', in the row order
        of the module-level `movies` frame.
    """
    genre_mask = movies['genres'].str.contains(genre_name, case=False, na=False)
    return movies.loc[genre_mask, ['title', 'genres']].head(top_n)

# Function: Hybrid Recommendation

# If the user gives both movie title & genre:
def hybrid_recommendation(movie_title=None, genre=None):
    """Recommend movies from a title, a genre, or both.

    Args:
        movie_title: Title to find similar movies for, or None/'' to skip.
        genre: Genre text to filter by (case-insensitive), or None/'' to skip.

    Returns:
        pandas.DataFrame with columns 'title' and 'genres':
        - title and genre: movies similar to the title whose genres contain
          `genre`, most similar first (at most 100 rows);
        - title only: the result of get_recommendations(movie_title);
        - genre only: the result of recommend_by_genre(genre);
        - neither, or an unknown title: an empty frame.
    """
    empty = pd.DataFrame({"title": [], "genres": []})

    if movie_title:
        similar_movies = get_recommendations(movie_title)
        # get_recommendations may report an unknown title with a non-DataFrame
        # value (an empty list); normalise that to an empty frame.
        if not isinstance(similar_movies, pd.DataFrame):
            return empty
        if not genre:
            return similar_movies

        # Filter the similar movies by genre directly. Inner-joining them with
        # recommend_by_genre() would only keep titles that happen to be among
        # its first 100 rows, and would split 'genres' into genres_x/genres_y.
        genre_mask = similar_movies['genres'].str.contains(genre, case=False, na=False)
        return similar_movies[genre_mask].head(100)

    if genre:
        return recommend_by_genre(genre)

    return empty

In [None]:
# STEP 9 – User Input & Test

# Example query: supplying both a title and a genre exercises the combined
# branch of hybrid_recommendation.
title_input = "The Dark Knight"
genre_input = "Action"

recommendations = hybrid_recommendation(movie_title=title_input, genre=genre_input)
# Show only the first ten rows
print(recommendations.head(n=10))

In [None]:
# STEP 10 – Save Model (Optional)

import pickle

# Serialise the SVD model object to disk in binary mode.
# NOTE(review): `algo` is also handed to cross_validate in Step 7 — confirm
# which fit of the model is the one intended to be saved here.
with open("svd_model.pkl", mode="wb") as model_file:
    pickle.dump(algo, model_file)