In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


In [6]:
# Read the movie catalogue and the user ratings, in that order, from CSV files
# addressed relative to the current working directory.
movies, ratings = (
    pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv")
)

In [3]:
# Plain-text preview of the first five rows of each table, one after the other.
print(movies.head(), ratings.head(), sep="\n")


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [10]:
# Reshape the long ratings table into a wide grid: one row per userId, one
# column per movieId, cell = rating. Cells with no rating come out of the
# pivot as NaN and are stored as 0 here, so 0 means "no rating recorded".
ratings_matrix = (
    ratings
    .pivot_table(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
ratings_matrix.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Lookup tables in both directions between movie titles and movie ids.
movie_title_to_id = movies.set_index('title')['movieId'].to_dict()
movie_id_to_title = movies.set_index('movieId')['title'].to_dict()


In [12]:
def get_similar_movies(target_movie_title, min_ratings=50, top_n=10):
    """Rank movies by how closely their rating column tracks a target movie's.

    Relies on four module-level objects built in earlier cells:
    ``movie_title_to_id``, ``movie_id_to_title``, ``ratings_matrix`` and
    ``ratings``.

    The similarity score is the correlation (``DataFrame.corrwith`` default)
    between the target movie's column of ``ratings_matrix`` and every other
    column. The matrix is used exactly as it stands, so any placeholder it
    holds for "not rated" (0 after ``fillna(0)``) takes part in the
    correlation like a real rating.

    Parameters
    ----------
    target_movie_title : str
        Exact title to look up in ``movie_title_to_id``.
    min_ratings : int, default 50
        Only movies with at least this many rows in ``ratings`` are returned.
    top_n : int or None, default 10
        Maximum number of rows to return; ``None`` returns every match.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title``, ``Correlation`` and ``rating_count``, indexed by
        movie id and sorted by descending correlation, with the target movie
        itself removed. ``None`` (after printing a message) when the title is
        unknown or its id has no column in ``ratings_matrix``.
    """
    # Resolve the title to an id; bail out early on an unknown title.
    if target_movie_title not in movie_title_to_id:
        print(f"Movie '{target_movie_title}' not found.")
        return None

    target_id = movie_title_to_id[target_movie_title]

    # A movie can be listed in the catalogue yet have no column in the matrix.
    if target_id not in ratings_matrix.columns:
        print(f"Movie ID '{target_id}' not found in ratings matrix.")
        return None

    # Correlate every movie column with the target movie's column.
    target_vector = ratings_matrix[target_id]
    similarity = ratings_matrix.corrwith(target_vector)

    # Number of rating rows per movie, used as a popularity floor below.
    rating_counts = (
        ratings.groupby('movieId')['rating'].count().rename('rating_count')
    )

    # to_frame() names the column explicitly instead of depending on the
    # Series being unnamed; NaN correlations (e.g. constant columns) are dropped.
    candidates = similarity.to_frame('Correlation').dropna().join(rating_counts)

    # Same order of operations as before: popularity filter, sort, drop the
    # query movie, then truncate.
    ranked = candidates[candidates['rating_count'] >= min_ratings].sort_values(
        'Correlation', ascending=False
    )
    ranked = ranked[ranked.index != target_id]
    if top_n is not None:
        ranked = ranked.head(top_n)

    # assign() builds a new frame, so no column is written onto a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    ranked = ranked.assign(title=ranked.index.map(movie_id_to_title))

    return ranked[['title', 'Correlation', 'rating_count']]


In [13]:
# Example query: the movies whose rating pattern is most correlated with Jumanji.
similar_movies = get_similar_movies(target_movie_title="Jumanji (1995)")
print(similar_movies)


                                          title  Correlation  rating_count
movieId                                                                   
364                       Lion King, The (1994)     0.481138           172
500                       Mrs. Doubtfire (1993)     0.443177           144
367                            Mask, The (1994)     0.433618           157
551      Nightmare Before Christmas, The (1993)     0.430304            93
586                           Home Alone (1990)     0.427304           116
158                               Casper (1995)     0.422629            62
317                    Santa Clause, The (1994)     0.415130            81
19        Ace Ventura: When Nature Calls (1995)     0.412808            88
480                        Jurassic Park (1993)     0.397671           238
595                 Beauty and the Beast (1991)     0.388356           146
