# Dataset Preview

In [1]:
import pandas as pd

# Load the two TMDB tables: movie metadata and the matching cast/crew credits
df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

# Join on the movie id instead of the title: titles are not guaranteed to be
# unique, and a title join pairs every same-named movie with every same-named
# credits row (duplicated rows carrying the wrong cast/crew).
# The credits' own `title` column is dropped first so the merged frame keeps the
# same column names a title join would produce (no title_x / title_y).
df = df.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


# Data Pre-Processing & Vectorization
Here, we build a single `tags` column that summarizes each movie as one text feature, combining the words of its overview with its genres, keywords, cast (top 3 actors), and director.

In [2]:
# List the merged frame's column names to see which fields are available
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [16]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Keep only the columns that feed the tags feature (nothing is loaded here).
# The explicit .copy() makes `df` an independent frame, so the column
# assignments later in this cell never act on a slice of the merged frame
# (the situation pandas reports as SettingWithCopyWarning).
df = df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

# Helper Functions 
def convert(obj):
    """Return the ``'name'`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Python-literal text for a list of dicts that each carry a ``'name'``
        key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    list
        The ``'name'`` values in their original order, or ``[]`` when ``obj``
        cannot be parsed or does not have the expected structure.
    """
    try:
        return [entry['name'] for entry in ast.literal_eval(obj)]
    except Exception:
        # Best effort: malformed or missing cells contribute no names.
        # Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate, so a long .apply() can be
        # interrupted from the notebook.
        return []

def get_top_cast(obj, top_n=3):
    """Return the names of cast entries whose ``'order'`` is below ``top_n``.

    Parameters
    ----------
    obj : str
        Python-literal text for a list of dicts carrying ``'name'`` and
        ``'order'`` keys.
    top_n : int, default 3
        Exclusive upper bound on ``'order'``; the default keeps orders 0, 1
        and 2 (presumably the three top-billed actors).

    Returns
    -------
    list
        Matching names in the order they appear in the parsed list, or ``[]``
        when ``obj`` cannot be parsed or does not have the expected structure.
    """
    try:
        return [
            member['name']
            for member in ast.literal_eval(obj)
            if member['order'] < top_n
        ]
    except Exception:
        # Best effort, but without swallowing KeyboardInterrupt / SystemExit
        # the way a bare except would.
        return []

def get_director(obj):
    """Return the name of the first crew entry whose ``'job'`` is ``'Director'``.

    Parameters
    ----------
    obj : str
        Python-literal text for a list of dicts carrying ``'job'`` and
        ``'name'`` keys.

    Returns
    -------
    str
        The first matching name, or ``''`` when there is no such entry or
        ``obj`` cannot be parsed or does not have the expected structure.
    """
    try:
        # The generator is consumed lazily, so scanning stops at the first match.
        return next(
            (member['name'] for member in ast.literal_eval(obj) if member['job'] == 'Director'),
            '',
        )
    except Exception:
        # Best effort, but without swallowing KeyboardInterrupt / SystemExit
        # the way a bare except would.
        return ''

# Apply the conversion functions
def _keep_parsed(parser):
    """Wrap ``parser`` so values that are already lists pass through unchanged.

    ast.literal_eval rejects non-string input and the parsers then fall back
    to an empty result, so without this guard re-running the cell on
    already-parsed columns would silently blank them out.
    """
    def _parse(value):
        return value if isinstance(value, list) else parser(value)
    return _parse


def _director_tokens(raw_crew):
    """Return the director as a one-item list, or ``[]`` when none was found."""
    director = get_director(raw_crew)
    return [director] if director else []


df['genres'] = df['genres'].apply(_keep_parsed(convert))
df['keywords'] = df['keywords'].apply(_keep_parsed(convert))
df['cast'] = df['cast'].apply(_keep_parsed(get_top_cast))
df['crew'] = df['crew'].apply(_keep_parsed(_director_tokens))

# Split the overview text into words. Non-string overviews (e.g. NaN for a
# missing value) become an empty list; str() on them would inject the literal
# word "nan" into the tags.
df['overview'] = df['overview'].apply(
    lambda text: text if isinstance(text, list) else (text.split() if isinstance(text, str) else [])
)

# Every feature column now holds a list, so `+` concatenates them row by row;
# the combined tokens are then flattened into one lower-cased string per movie.
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']
df['tags'] = df['tags'].apply(lambda tokens: ' '.join(tokens).lower())

Vectorization and similarity matrix done


Next, we move on to the **vectorization** step:
- We turned the "tags" text into a numerical vector using Bag-of-Words (BoW)
- Each entry of a movie's vector counts how often a given word appears in that movie's tags
- We keep only the top 5000 most frequent words

Then we compared vectors using **cosine similarity**, which tells us how close two movies are in terms of content. If the cosine score is close to 1, it means the movies are super similar.

In [17]:
# --- Vectorization ---
# Bag-of-words counts over the tags text, capped at 5000 vocabulary terms with
# English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(df['tags'])
vectors = vectors.toarray()  # convert the vectorizer's output to a dense array

# --- Similarity Matrix ---
# Pairwise cosine similarity between the rows of `vectors`.
similarity = cosine_similarity(vectors)

print("Vectorization and similarity matrix done")
df['tags'].head(5)

0    in the 22nd century, a paraplegic marine is di...
1    captain barbossa, long believed to be dead, ha...
2    a cryptic message from bond’s past sends him o...
3    following the death of district attorney harve...
4    john carter is a war-weary, former military ca...
Name: tags, dtype: object

# Recommendations
The **recommend()** function:
- Takes the title of your input movie (case-insensitive) and looks up its row index
- Finds all cosine similarity scores for it
- Sorts them from highest to lowest
- Returns the top 5 (excluding itself)

A few examples of recommendations are shown.

In [20]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Reads the notebook-level ``df`` (for titles) and ``similarity`` (square
    matrix of pairwise scores, indexed by row position in ``df``).

    Parameters
    ----------
    movie : str
        Title to look up; matching ignores case. When several rows share the
        title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        Results are printed. Prints "Movie not found!" when no title matches.
    """
    query = movie.lower()
    titles = df['title'].str.lower().tolist()
    if query not in titles:
        print("Movie not found!")
        return

    # Row *position* of the movie. `similarity` and `df.iloc` are positional,
    # so an index label would point at the wrong row whenever df's index is
    # not a plain 0..n-1 range.
    index = titles.index(query)

    # Rank every other movie by similarity. The query row is removed explicitly
    # rather than by dropping the first sorted item: another row can tie with
    # it, and a row with an all-zero vector does not score highest with itself.
    ranked = sorted(
        ((position, score) for position, score in enumerate(similarity[index]) if position != index),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    print(f"\nTop {top_n} movies similar to '{df.iloc[index]['title']}' are:\n")
    for position, _score in ranked:
        print(df.iloc[position]['title'])


In [28]:
# Example: print the five closest matches for "Interstellar"
recommend("Interstellar")


Top 5 movies similar to 'Interstellar' are:

Space Pirate Captain Harlock
The Green Inferno
Beyond the Valley of the Dolls
Stuart Little
There Goes My Baby


In [29]:
# Example: print the five closest matches for "Inception"
recommend("Inception")


Top 5 movies similar to 'Inception' are:

Up
Peaceful Warrior
Hotel Transylvania 2
Pitch Perfect 2
Open Road
