In [2]:
# Step 1: Install necessary libraries
!pip install pandas scikit-learn

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 2: Download MovieLens dataset (100k version)
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

# Step 3: Load movies.csv
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Step 4: Preview data
print("Movies Data:\n", movies.head())
print("Ratings Data:\n", ratings.head())


--2025-07-08 08:55:17--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2025-07-08 08:55:17 (3.67 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
Movies Data:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Fat

In [3]:
# Step 2: Build TF-IDF matrix from movie genres
tfidf = TfidfVectorizer(stop_words='english')

# Some movies have (no genres listed) — replace with empty string
movies['genres'] = movies['genres'].fillna('')

# Compute the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Calculate cosine similarity between all movies
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse mapping of movie title to index
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# Recommendation function
def recommend_movies(title, num_recommendations=5):
    if title not in indices:
        return f"Movie '{title}' not found in database."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

# Try with a sample
recommend_movies("Toy Story (1995)")


Unnamed: 0,title
1706,Antz (1998)
2355,Toy Story 2 (1999)
2809,"Adventures of Rocky and Bullwinkle, The (2000)"
3000,"Emperor's New Groove, The (2000)"
3568,"Monsters, Inc. (2001)"


In [8]:
recommend_movies("Toy Story (1995)")


Unnamed: 0,title
1706,Antz (1998)
2355,Toy Story 2 (1999)
2809,"Adventures of Rocky and Bullwinkle, The (2000)"
3000,"Emperor's New Groove, The (2000)"
3568,"Monsters, Inc. (2001)"


In [9]:
recommend_movies("Heat (1995)")

Unnamed: 0,title
22,Assassins (1995)
138,Die Hard: With a Vengeance (1995)
156,"Net, The (1995)"
249,Natural Born Killers (1994)
417,Judgment Night (1993)
