In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movies-name-and-ratings/title.basics.tsv
/kaggle/input/movies-name-and-ratings/title.ratings.tsv


In [3]:
import pandas as pd

# Load both datasets
basics = pd.read_csv('/kaggle/input/movies-name-and-ratings/title.basics.tsv', sep='\t', low_memory=False)
ratings = pd.read_csv('/kaggle/input/movies-name-and-ratings/title.ratings.tsv', sep='\t', low_memory=False)

# Merge first
movies = basics.merge(ratings, on='tconst')
print("After merge:", movies.shape)


After merge: (1587436, 11)


In [4]:
# Only movies
movies = movies[movies['titleType'] == 'movie']

# Not adult
movies = movies[movies['isAdult'] == 0]

# Drop rows where 'primaryTitle' or 'genres' are missing
movies = movies.dropna(subset=['primaryTitle', 'genres'])

# Drop duplicates based on movie name
movies = movies.drop_duplicates(subset='primaryTitle')

# Reset index
movies = movies.reset_index(drop=True)

print("After filtering:", movies.shape)


After filtering: (0, 11)


In [5]:
# Combine title and genres
movies['tags'] = movies['primaryTitle'] + ' ' + movies['genres']


In [6]:
print("🔹 basics shape:", basics.shape)
print("🔹 ratings shape:", ratings.shape)

# Merge again
movies = basics.merge(ratings, on='tconst')
print("🔹 after merge:", movies.shape)


🔹 basics shape: (11766461, 9)
🔹 ratings shape: (1587436, 3)
🔹 after merge: (1587436, 11)


In [7]:
print(movies['titleType'].value_counts())


titleType
tvEpisode       808523
movie           332208
short           172297
tvSeries        105050
video            57180
tvMovie          55716
tvMiniSeries     22040
videoGame        18464
tvSpecial        13480
tvShort           2478
Name: count, dtype: int64


In [8]:
movies = movies[movies['titleType'].str.lower() == 'movie']
print("✅ after filtering for movies:", movies.shape)


✅ after filtering for movies: (332208, 11)


In [9]:
# Filter out adult content
movies = movies[movies['isAdult'].astype(str) == '0']

# Drop rows with missing genres or titles
movies = movies.dropna(subset=['primaryTitle', 'genres'])

# Drop duplicates
movies = movies.drop_duplicates(subset='primaryTitle')

# Reset index
movies = movies.reset_index(drop=True)

print("✅ after all filtering:", movies.shape)


✅ after all filtering: (289777, 11)


In [10]:
movies['tags'] = movies['primaryTitle'] + ' ' + movies['genres']
print("Tags created!")
print(movies['tags'].sample(5))


Tags created!
68302     The Bishop's Bedroom Comedy,Crime,Drama
201778                             Snapshot Drama
106318                 Gangster Party Crime,Drama
257515                      Penny Pincher! Comedy
55783           Chinese Kamasutra Fantasy,Romance
Name: tags, dtype: object


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()


In [14]:
# 🔢 Sort by popularity (numVotes) and select top 10,000 movies
movies = movies.sort_values(by='numVotes', ascending=False).head(10000).reset_index(drop=True)

# 🏷️ Rebuild the tags column (title + genres)
movies['tags'] = movies['primaryTitle'] + ' ' + movies['genres']


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()

similarity = cosine_similarity(vectors)


In [16]:
def recommend(movie_name):
    movie_name = movie_name.lower()
    
    # Find the movie by title (case-insensitive)
    matches = movies[movies['primaryTitle'].str.lower() == movie_name]
    
    if matches.empty:
        print("❌ Movie not found in the dataset.")
        return
    
    index = matches.index[0]
    distances = similarity[index]
    
    # Sort distances and get top 5 similar movies (excluding itself)
    movie_list = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:6]
    
    print(f"\n🎬 Movies similar to '{movie_name.title()}':")
    for i in movie_list:
        print("➡️", movies.iloc[i[0]].primaryTitle)


In [17]:
recommend("Inception")
recommend("Titanic")
recommend("The Godfather")



🎬 Movies similar to 'Inception':
➡️ 2012
➡️ Jumper
➡️ Insurgent
➡️ Bumblebee
➡️ Jupiter Ascending
❌ Movie not found in the dataset.

🎬 Movies similar to 'The Godfather':
➡️ The Godfather Part II
➡️ The Godfather Part III
➡️ Dogville
➡️ Breathless
➡️ The Bikeriders


In [23]:
def recommend(movie):
    movie = movie.lower()
    matches = movies[movies['primaryTitle'].str.lower() == movie]

    if matches.empty:
        return []

    index = matches.index[0]
    distances = similarity[index]

    # Get top 20 most similar movies (excluding itself)
    movie_list = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:21]

    # Collect movie info with ratings
    recommendations = []
    for i in movie_list:
        title = movies.iloc[i[0]].primaryTitle
        rating = movies.iloc[i[0]].averageRating if 'averageRating' in movies.columns else 0
        recommendations.append((title, rating))

    # 🔽 Sort by rating (descending) and return top 5
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:5]
    
    return recommendations


In [24]:
recommend("Inception")
recommend("Titanic")
recommend("The Godfather")


[('The Godfather Part II', 9.0),
 ('Dogville', 8.0),
 ('Lilya 4-Ever', 7.8),
 ('Joji', 7.8),
 ('Breathless', 7.7)]

In [25]:
import pickle
import shutil

# Save the required files
pickle.dump(movies[['primaryTitle']], open('data.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

# Move to working dir to make them downloadable
shutil.move("data.pkl", "/kaggle/working/data.pkl")
shutil.move("similarity.pkl", "/kaggle/working/similarity.pkl")


'/kaggle/working/similarity.pkl'