In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


In [2]:
import os, ast, json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Directory holding the TMDB CSV files. The file names are joined onto it
# below, so pointing the notebook at another copy of the dataset only
# requires changing DATA_DIR.
# (Previously DATA_DIR was "data" and was joined with absolute paths;
# os.path.join discards earlier components when a later one is absolute,
# so DATA_DIR had no effect on the resulting paths.)
DATA_DIR = "/kaggle/input/tmdb-movie-metadata"
MOVIES_CSV = os.path.join(DATA_DIR, "tmdb_5000_movies.csv")
CREDITS_CSV = os.path.join(DATA_DIR, "tmdb_5000_credits.csv")

def safe_parse(x):
    """Best-effort conversion of a serialized list/dict cell into a Python object.

    Parameters
    ----------
    x : object
        A cell value. Lists and dicts are returned unchanged; strings (or
        bytes) are parsed first as a Python literal and then, if that fails,
        as JSON.

    Returns
    -------
    object
        The parsed value, or ``[]`` when ``x`` is missing (None/NaN), is not
        text, or cannot be parsed by either parser.
    """
    # Already-parsed values pass straight through. This check must come before
    # any missing-value test: pd.isna() on a list returns an array, whose
    # truth value is ambiguous and raises for multi-element lists.
    if isinstance(x, (list, dict)):
        return x
    # Anything that is not text (None, NaN, numbers, ...) has nothing to parse.
    if not isinstance(x, (str, bytes)):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. JSON true/false/null) -> try JSON.
        try:
            return json.loads(x)
        except (ValueError, TypeError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass.
            return []

# Load both CSVs. The credits file names its key `movie_id`; rename it to `id`
# so the two frames share a join column.
movies = pd.read_csv(MOVIES_CSV)
credits = pd.read_csv(CREDITS_CSV)
credits = credits.rename(columns={'movie_id': 'id'})

# Left join keeps every movie row and attaches cast/crew where available.
credit_cols = credits[['id', 'cast', 'crew']]
df = movies.merge(credit_cols, how='left', on='id')

# Decode the serialized list columns into Python objects.
for col in ('genres', 'keywords', 'cast', 'crew'):
    if col not in df.columns:
        continue
    df[col] = df[col].apply(safe_parse)

# Guarantee an `overview` column exists for the text features built later.
if 'overview' not in df.columns:
    df['overview'] = ''

df = df.reset_index(drop=True)
df.shape

(4803, 22)

In [4]:
def names_from_list(ld, key='name', top_n=None):
    """Extract normalized name tokens from a list of dicts and/or strings.

    Each name is lower-cased and has its spaces removed so that a multi-word
    name becomes a single token (e.g. ``'Science Fiction'`` -> ``'sciencefiction'``).

    Parameters
    ----------
    ld : object
        Expected to be a list; any other type yields ``[]``. Dict items
        contribute ``item[key]``, falling back to ``'title'`` then ``'name'``;
        string items are used directly; other item types are ignored.
    key : str
        Preferred dict key to read the name from.
    top_n : int or None
        Maximum number of tokens to return. ``None`` means no limit;
        ``0`` returns an empty list.

    Returns
    -------
    list of str
    """
    if not isinstance(ld, list):
        return []
    out = []
    for item in ld:
        if isinstance(item, dict):
            val = item.get(key) or item.get('title') or item.get('name')
        elif isinstance(item, str):
            val = item
        else:
            continue
        if not val:
            continue
        token = str(val).replace(' ', '').lower()
        # Skip items that were only spaces so no empty tokens are emitted.
        if token:
            out.append(token)
    # Compare against None explicitly: `if top_n` would treat 0 as "no limit".
    return out if top_n is None else out[:top_n]

def director_from_crew(crew):
    """Return the normalized name of the first named director in a crew list.

    Parameters
    ----------
    crew : object
        Expected to be a list of dicts with ``'job'`` and ``'name'`` keys;
        any other type yields ``''``. Non-dict items are ignored.

    Returns
    -------
    str
        The director's name lower-cased with spaces removed, or ``''`` when
        no crew member with job "director" (case-insensitive) has a name.
    """
    if not isinstance(crew, list):
        return ''
    for member in crew:
        if not isinstance(member, dict):
            continue
        if (member.get('job') or '').lower() != 'director':
            continue
        # `or ''` guards an explicit None name, which str() would otherwise
        # turn into the bogus token 'none'.
        name = str(member.get('name') or '').replace(' ', '').lower()
        if name:
            return name
    return ''

def build_soup_row(r, cast_top_n=3, weights=None):
    """Build the space-separated "soup" text for one movie row.

    The soup concatenates genre, keyword, cast and director tokens plus the
    lower-cased overview. Each feature is repeated according to its weight.

    Parameters
    ----------
    r : mapping-like
        A row supporting ``.get`` (e.g. a pandas Series or a dict) with the
        keys ``'genres'``, ``'keywords'``, ``'cast'``, ``'crew'`` and
        ``'overview'``; missing keys are treated as empty.
    cast_top_n : int or None
        How many leading cast members to include.
    weights : dict or None
        Repetition count per feature (``'genres'``, ``'keywords'``, ``'cast'``,
        ``'director'``, ``'overview'``). Features absent from the dict default
        to 1. ``None`` selects genres=3, keywords=2, cast=2, director=2,
        overview=1.

    Returns
    -------
    str
    """
    if weights is None:
        weights = {'genres': 3, 'keywords': 2, 'cast': 2, 'director': 2, 'overview': 1}

    genre_tokens = names_from_list(r.get('genres', []))
    keyword_tokens = names_from_list(r.get('keywords', []))
    cast_tokens = names_from_list(r.get('cast', []), top_n=cast_top_n)
    director = director_from_crew(r.get('crew', []))

    raw_overview = r.get('overview', '')
    # NaN is truthy, so `str(nan or '')` would add the literal token 'nan'.
    # `value != value` is true only for NaN.
    is_missing = raw_overview is None or (
        isinstance(raw_overview, float) and raw_overview != raw_overview
    )
    overview = '' if is_missing else str(raw_overview or '').lower()

    parts = []
    parts += genre_tokens * weights.get('genres', 1)
    parts += keyword_tokens * weights.get('keywords', 1)
    parts += cast_tokens * weights.get('cast', 1)
    if director:
        parts += [director] * weights.get('director', 1)
    if overview:
        # Apply the overview weight like every other feature (it was
        # previously appended exactly once regardless of `weights`).
        parts += [overview] * weights.get('overview', 1)
    return " ".join(parts)

# Build the per-movie text "soup"; DataFrame.apply forwards extra keyword
# arguments to the row function.
df['soup'] = df.apply(build_soup_row, axis=1, cast_top_n=3)

In [5]:
# Vectorize the soup text (English stop words removed, vocabulary capped at
# 5000 terms) and compute the pairwise cosine similarity between all movies.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['soup'].fillna(''))
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Lookup tables from title (exact and lower-cased) to row position in `df`.
# Titles that occur more than once collapse to a single entry.
title_to_index = pd.Series(df.index, index=df['title']).to_dict()
lower_to_index = dict(
    (movie_title.lower(), row_pos) for movie_title, row_pos in title_to_index.items()
)

In [6]:
def _shorten_overview(text, limit=180):
    """Flatten newlines and cap `text` at `limit` chars, adding '...' only if truncated."""
    # `text != text` is true only for NaN; a missing overview becomes ''
    # instead of the literal string 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    flat = str(text).replace('\n', ' ')
    return flat[:limit] + '...' if len(flat) > limit else flat


def _genres_to_str(g, limit=120):
    """Render a parsed genres value as a comma-separated string of at most `limit` chars."""
    if not isinstance(g, list):
        return str(g)[:limit]
    names = [item.get('name', '') if isinstance(item, dict) else str(item) for item in g]
    return ", ".join(x for x in names if x)[:limit]


def recommend(title, n=5):
    """Return the `n` movies most similar to `title`.

    Similarity is read from the module-level `cosine_sim` matrix; titles are
    resolved through `title_to_index` (exact match) and then `lower_to_index`
    (case-insensitive match).

    Parameters
    ----------
    title : str
        Movie title to look up.
    n : int
        Number of recommendations to return; values below 0 are treated as 0.

    Returns
    -------
    pandas.DataFrame
        * On a match: columns ``id``, ``title``, ``overview`` (newlines
          flattened, capped at 180 chars with '...' when truncated),
          ``genres`` (comma-separated names, capped at 120 chars) and
          ``score`` (cosine similarity rounded to 4 decimals), ordered by
          descending score and excluding the query movie itself.
        * On no match but with titles containing the query as a substring:
          a single ``suggestions`` column with up to 10 such titles.
        * Otherwise: a single ``error`` column with a "not found" message.
    """
    lowered = title.lower()
    idx = title_to_index.get(title)
    if idx is None:
        idx = lower_to_index.get(lowered)
    if idx is None:
        candidates = [t for t in title_to_index if lowered in t.lower()]
        if candidates:
            return pd.DataFrame({"suggestions": candidates[:10]})
        return pd.DataFrame({"error": [f"Title '{title}' not found"]})

    # A negative n would slice from the wrong end of the ranking.
    n = max(int(n), 0)
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    top = [pair for pair in ranked if pair[0] != idx][:n]
    indices = [i for i, _ in top]

    out = df.iloc[indices][['id', 'title', 'overview', 'genres']].copy().reset_index(drop=True)
    out['overview'] = out['overview'].apply(_shorten_overview)
    out['genres'] = out['genres'].apply(_genres_to_str)
    out['score'] = [float(f"{s:.4f}") for _, s in top]
    return out

In [7]:
# Show a few titles, then print the top-5 recommendations for sample queries.
print("Some movie titles:", df['title'].head(10).tolist())

queries = ["Avatar", "Spectre", "Pirates of the Caribbean: At World's End"]
for query in queries:
    print(f"\nQuery: {query}")
    results = recommend(query, n=5)
    print(results.to_string(index=False))

Some movie titles: ['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice']

Query: Avatar
    id                   title                                                                                                                                                                                overview                                       genres  score
   679                  Aliens When Ripley's lifepod is found by a salvage crew over 50 years later, she finds that terra-formers are on the very planet they found the alien species. When the company sends a fam...    Horror, Action, Thriller, Science Fiction 0.2842
 54138 Star Trek Into Darkness When the crew of the Enterprise is called back home, they find an unstoppable force of terror from within their own organization has detonated the fleet and ever