In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load datasets
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv', low_memory=False)

# Check the first few rows
print(ratings.head())
print(movies[['id', 'title']].head())
# Convert 'id' in movies to numeric, errors='coerce' will turn bad rows to NaN
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')

# Drop rows with NaN movie IDs
movies = movies.dropna(subset=['id'])

# Convert to integer to match ratings
movies['id'] = movies['id'].astype(int)

# Merge both datasets
df = pd.merge(ratings, movies, left_on='movieId', right_on='id')
df = df[['userId', 'title', 'rating']]
df.head()
user_movie_matrix = df.pivot_table(index='userId', columns='title', values='rating')
user_movie_matrix.head()
# Choose a movie
movie_liked = 'The Dark Knight'

# Get ratings of that movie
movie_ratings = user_movie_matrix[movie_liked]

# Find similar movies
similar_movies = user_movie_matrix.corrwith(movie_ratings)

# Build DataFrame
corr_df = pd.DataFrame(similar_movies, columns=['Correlation'])
corr_df.dropna(inplace=True)

# Add # of ratings per movie
rating_counts = df.groupby('title')['rating'].count()
corr_df['NumRatings'] = rating_counts

# Filter: recommend only movies with enough ratings
recommendations = corr_df[corr_df['NumRatings'] > 50].sort_values('Correlation', ascending=False).head(10)
print(recommendations)


   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
      id                        title
0    862                    Toy Story
1   8844                      Jumanji
2  15602             Grumpier Old Men
3  31357            Waiting to Exhale
4  11862  Father of the Bride Part II


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


                           Correlation  NumRatings
title                                             
Imagine Me & You                   1.0          67
K-19: The Widowmaker               1.0          74
Beetlejuice                        1.0          95
Monsieur  Hulot's Holiday          1.0         124
The Bourne Supremacy               1.0         106
5 Card Stud                        1.0         200
The Last Laugh                     1.0          51
Sweet Sixteen                      1.0          65
Who Killed Bambi?                  1.0         113
When Saturday Comes                1.0          70
