In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Read the TMDB movie metadata from the CSV in the working directory
df = pd.read_csv('tmdb_5000_movies.csv')

# Preview the first rows, restricted to the columns used below
df.head()[['title', 'genres', 'keywords']]

Unnamed: 0,title,genres,keywords
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [10]:
import ast

def extract_names(text):
    """Join the 'name' fields of a JSON-like list of dicts into one string.

    Args:
        text: String literal of a list of dicts, e.g.
            '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'.

    Returns:
        The names separated by single spaces ('Action Adventure'), or ''
        when ``text`` cannot be turned into a list of dicts that each carry
        a string 'name' (malformed text, NaN/None, missing key, ...).
    """
    try:
        entries = ast.literal_eval(text)
        return ' '.join(entry['name'] for entry in entries)
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError):
        # Best-effort parsing: an unusable cell contributes no text. Only the
        # errors that parsing/indexing can raise are swallowed, so
        # KeyboardInterrupt, SystemExit and unrelated bugs still surface.
        return ''

# Flatten the JSON-like 'genres' and 'keywords' columns to plain text.
# The untouched originals are stashed in '<column>_raw' the first time this
# cell runs, and every run parses from that copy. Parsing the already
# flattened text a second time fails inside extract_names, which would
# silently replace the whole column with empty strings on a re-run.
for column in ('genres', 'keywords'):
    raw_column = column + '_raw'
    if raw_column not in df.columns:
        df[raw_column] = df[column]
    df[column] = df[raw_column].apply(extract_names)

# Combine genres + keywords into one feature
df['combined'] = df['genres'] + ' ' + df['keywords']

# Drop rows with missing data. This is only a safeguard: 'combined' is NaN
# only if one of the two text columns holds NaN for that row.
df.dropna(subset=['combined'], inplace=True)

df[['title', 'combined']].head()

Unnamed: 0,title,combined
0,Avatar,Action Adventure Fantasy Science Fiction cultu...
1,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drug abuse exot...
2,Spectre,Action Adventure Crime spy based on novel secr...
3,The Dark Knight Rises,Action Crime Drama Thriller dc comics crime fi...
4,John Carter,Action Adventure Science Fiction based on nove...


In [12]:
# Vectorise the combined genre/keyword text with TF-IDF weighting;
# stop_words='english' uses scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the 'combined' column and transform it in one step.
tfidf_matrix = tfidf.fit_transform(df['combined'])

# Show the dimensions of the resulting matrix.
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (4803, 7069)


In [14]:
# Pairwise cosine similarity between all rows of tfidf_matrix; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [16]:
# Give the frame a clean 0..n-1 index so that index labels equal row
# positions, which is how cosine_sim is addressed in recommend().
# drop=True discards the old index instead of inserting it as an extra
# 'index' column; without it, re-running this cell keeps adding columns and
# eventually raises because 'level_0' already exists.
df = df.reset_index(drop=True)

# Lower-cased title -> row position. Titles are not guaranteed to be unique;
# keep only the first occurrence so a lookup always yields a single integer
# rather than a Series of positions.
indices = pd.Series(df.index, index=df['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

# Recommendation function
def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles most similar to ``title``.

    Args:
        title: Movie title to look up; matching is case-insensitive.
        cosine_sim: Square similarity matrix indexed by row position in
            ``df``. Defaults to the matrix that exists when this function is
            defined.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A pandas Series of titles ordered from most to least similar, or the
        string "Movie not found in database." when the title is unknown.
    """
    title = title.lower()
    if title not in indices:
        return "Movie not found in database."

    idx = indices[title]
    # A title shared by several rows makes the lookup return a Series of
    # positions; use the first match so the row lookup below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position rather than assuming it sorts
    # first: rows with identical features tie on score, and the stable sort
    # would otherwise drop the wrong row and could recommend the movie itself.
    candidates = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    top_n = max(int(top_n), 0)  # a negative count would slice from the end
    movie_indices = [position for position, _ in ranked[:top_n]]
    return df['title'].iloc[movie_indices]

In [18]:
# Example query: list the titles most similar to "Avatar"
recommend("Avatar")

278     Planet of the Apes
239                Gravity
2403                Aliens
838                 Alien³
541                Soldier
Name: title, dtype: object

In [20]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.
