In [7]:
# ==========================================
# 🎥 MOVIE RECOMMENDATION SYSTEM (TMDB DATA)
# ==========================================

# --- Imports ---
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# --- Load Data ---
# Both CSV files are read relative to the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# --- Merge both datasets on title ---
# NOTE(review): this join assumes titles are unique. If two films share a
# title, each would be paired with the other's credits as well -- confirm
# whether joining on the movie id would be safer for this dataset.
movies = movies.merge(credits, on='title')

# --- Keep only required columns ---
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# --- Drop missing values ---
# Re-number the rows after dropping: the similarity matrix built later is
# addressed by row position, so index labels must match positions (0..n-1).
# The non-inplace form also avoids mutating a slice of the merged frame.
movies = movies.dropna().reset_index(drop=True)

# --- Helper function to convert JSON-like text to list of names ---
def convert(text):
    """Return the 'name' of every entry in a stringified list of dicts.

    Args:
        text: Python-literal text such as "[{'id': 28, 'name': 'Action'}]".

    Returns:
        The 'name' values in their original order. Parsing is best-effort:
        if *text* cannot be parsed, or an entry has no 'name', the names
        collected up to that point are returned (possibly an empty list).
    """
    names = []
    try:
        for entry in ast.literal_eval(text):
            names.append(entry['name'])
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError,
            RecursionError, MemoryError):
        # Malformed input only: unlike a bare ``except``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass
    return names

# --- Function to get top 3 cast members ---
def fetch_cast(text, limit=3):
    """Return the names of the first *limit* entries in a stringified cast list.

    Args:
        text: Python-literal text holding a list of dicts with a 'name' key.
        limit: Maximum number of names to return (default 3).

    Returns:
        Up to *limit* names in their original order. Parsing is best-effort:
        on malformed input the names collected so far are returned
        (possibly an empty list).
    """
    names = []
    try:
        for member in ast.literal_eval(text):
            # Check before appending so that limit=0 yields an empty list.
            if len(names) >= limit:
                break
            names.append(member['name'])
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError,
            RecursionError, MemoryError):
        # Malformed input only: unlike a bare ``except``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass
    return names

# --- Function to fetch the names of all directors from the crew list ---
def fetch_director(text):
    """Return the names of all crew entries whose job is 'Director'.

    Args:
        text: Python-literal text holding a list of dicts with 'job' and
            'name' keys.

    Returns:
        The matching names in their original order (a film may have several
        directors). Parsing is best-effort: on malformed input the names
        collected so far are returned (possibly an empty list).
    """
    directors = []
    try:
        for member in ast.literal_eval(text):
            if member['job'] == 'Director':
                directors.append(member['name'])
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError,
            RecursionError, MemoryError):
        # Malformed input only: unlike a bare ``except``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass
    return directors

# --- Remove spaces between multi-word names ---
def collapse(L):
    """Return a new list with every space removed from each string in *L*."""
    return [name.replace(" ", "") for name in L]

# --- Parse each JSON-like column, then strip spaces inside every name ---
# collapse() turns multi-word names into single tokens, so the vectorizer
# below counts each name as one word.
movies['genres'] = movies['genres'].apply(convert).apply(collapse)
movies['keywords'] = movies['keywords'].apply(convert).apply(collapse)
movies['cast'] = movies['cast'].apply(fetch_cast).apply(collapse)
movies['crew'] = movies['crew'].apply(fetch_director).apply(collapse)

# --- Tokenize the overview on whitespace ---
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# --- Concatenate the per-movie word lists into one 'tags' list ---
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# --- Work on an independent copy holding only what the recommender needs ---
new = movies[['movie_id', 'title', 'tags']].copy()

# --- Flatten each word list into one lowercase, space-separated string ---
new['tags'] = new['tags'].apply(lambda words: " ".join(words).lower())

# --- Bag-of-words vectors: English stop words removed, vocabulary capped at 5000 ---
cv = CountVectorizer(stop_words='english', max_features=5000)
vector = cv.fit_transform(new['tags']).toarray()

# --- Pairwise cosine similarity between all movie vectors ---
similarity = cosine_similarity(vector)

# --- Recommendation function ---
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to *movie*.

    Reads the module-level ``new`` DataFrame and ``similarity`` matrix; row
    ``i`` of ``similarity`` corresponds to the ``i``-th row (by position) of
    ``new``.

    Args:
        movie: Exact title to look up in ``new['title']``.
        top_n: Number of recommendations to print (default 5).

    Returns:
        None. Results, or a not-found message, are printed.
    """
    # Look the title up by row *position*, not index label: labels can have
    # gaps (e.g. after rows were dropped), while ``similarity`` is positional.
    positions = (new['title'] == movie).values.nonzero()[0]
    if len(positions) == 0:
        print(f"❌ Movie '{movie}' not found in database!")
        return
    position = positions[0]

    # Rank every other movie by similarity; exclude the query row explicitly
    # instead of assuming it always sorts first.
    ranked = sorted(
        (pair for pair in enumerate(similarity[position]) if pair[0] != position),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"\n🎬 Recommended movies similar to '{movie}':\n")
    for row, _score in ranked[:top_n]:
        print(new.iloc[row].title)

# --- Example ---
recommend('Avatar')

# --- Save data and similarity matrix for deployment ---
# ``with`` guarantees each file is flushed and closed, even if pickling fails.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

print("\n✅ Model training and saving completed successfully!")



🎬 Recommended movies similar to 'Avatar':

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem

✅ Model training and saving completed successfully!
