In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
# Reaching this line means every import above resolved without raising.
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the datasets from the CSV files
credits_df = pd.read_csv("tmdb_5000_credits.csv")
movies_df = pd.read_csv("tmdb_5000_movies.csv")

# Merge them into a single dataframe based on the movie's ID.
# A title is not a safe join key: two different films that share a title would
# be cross-matched, pairing one film's overview/genres with the other film's
# cast and crew and adding extra rows. The credits copy of 'title' is dropped
# first so the merged frame keeps a single, unsuffixed 'title' column.
# NOTE(review): assumes the movies file stores the TMDB id in 'id' and the
# credits file stores it in 'movie_id' (the standard TMDB 5000 schema) - confirm.
movies_df = movies_df.merge(
    credits_df.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
    validate="one_to_one",  # fail loudly if either file repeats an id
)

# Select only the columns we need for our content-based recommender
movies_df = movies_df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Display the first 2 rows to see our combined data
movies_df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [3]:
# Columns such as 'genres', 'keywords', 'cast' and 'crew' hold JSON-encoded strings (lists of objects), so we need helper functions to turn them into plain Python lists.

# This function extracts the 'name' from the JSON-like string
def parse_json_features(features_str):
    """Return the 'name' of every entry in a JSON-encoded list of objects.

    Args:
        features_str: JSON text such as '[{"id": 28, "name": "Action"}]'.

    Returns:
        list: The 'name' values in their original order, or an empty list
        when the input is not JSON text (e.g. NaN), is malformed JSON, or
        does not decode to a sequence of objects that all carry a 'name' key.
    """
    try:
        return [feature['name'] for feature in json.loads(features_str)]
    except (TypeError, ValueError, KeyError):
        # TypeError: non-text input or an unexpected JSON shape;
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it);
        # KeyError: an entry without a 'name' key.
        # Anything else (KeyboardInterrupt, real bugs) is left to propagate
        # instead of being silently turned into an empty list.
        return []

# Swap the raw JSON text in 'genres' and 'keywords' for plain lists of names.
# Not idempotent: running this a second time feeds lists (not text) back into
# the parser, which answers non-text input with [].
for feature in ('genres', 'keywords'):
    movies_df[feature] = movies_df[feature].map(parse_json_features)

# This function gets the top 3 actors' names
def get_top_3_actors(cast_str):
    """Return the names of the first three entries of a JSON-encoded cast list.

    Args:
        cast_str: JSON text holding a list of cast objects, each with a
            'name' key.

    Returns:
        list: Up to three names in the order they appear in the JSON, or an
        empty list when the input is not JSON text (e.g. NaN), is malformed
        JSON, or one of the first three entries has no 'name' key.
    """
    try:
        leading_cast = json.loads(cast_str)[:3]
        return [actor['name'] for actor in leading_cast]
    except (TypeError, ValueError, KeyError):
        # TypeError: non-text input or a decoded value that cannot be sliced /
        # indexed by key; ValueError: malformed JSON; KeyError: missing 'name'.
        # Unrelated failures are intentionally not swallowed.
        return []

# Keep only the first three cast names per movie (rerunning on the already
# converted column would produce [] because the helper expects JSON text).
movies_df['cast'] = movies_df['cast'].map(get_top_3_actors)

# This function gets the director's name from the crew list
def get_director(crew_str):
    """Return the first crew member whose job is 'Director', as a one-item list.

    Args:
        crew_str: JSON text holding a list of crew objects, each with 'job'
            and 'name' keys.

    Returns:
        list: [name] of the first entry with job == 'Director'; an empty list
        when there is no such entry, when the input is not JSON text
        (e.g. NaN), when the JSON is malformed, or when an entry inspected
        before a match lacks the 'job' (or the match lacks the 'name') key.
    """
    try:
        for member in json.loads(crew_str):
            if member['job'] == 'Director':
                return [member['name']]
    except (TypeError, ValueError, KeyError):
        # TypeError: non-text input or an unexpected JSON shape;
        # ValueError: malformed JSON; KeyError: missing 'job' / 'name'.
        # Unrelated failures are intentionally not swallowed.
        return []
    # Well-formed crew list without any director.
    return []

# Collapse each crew entry to a one-item list holding the first listed
# director, or [] when the helper finds none.
movies_df['crew'] = movies_df['crew'].map(get_director)

print("Data cleaning complete. Here's a sample:")
movies_df.iloc[:1]

Data cleaning complete. Here's a sample:


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
