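Build a recommendation system using collaborative filtering or matrix factorization. The cells shown below take the collaborative-filtering route: user-to-user cosine similarity computed over a user × movie rating matrix.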


In [2]:
import pandas as pd

# Fetch the movie-ratings CSV straight from GitHub (the URL path suggests the
# MovieLens "ml-latest-small" dump); one row per (userId, movieId) rating.
ratings_url = 'https://raw.githubusercontent.com/smanihwr/ml-latest-small/master/ratings.csv'
df = pd.read_csv(filepath_or_buffer=ratings_url)

# Quick sanity check: first five rows, then the (rows, columns) shape.
print(df.head(n=5))
print("\nShape:", df.shape)


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Shape: (100836, 4)


In [3]:
# Reshape the long ratings table into a wide user x movie matrix.
# pivot_table averages duplicate (userId, movieId) pairs by default, and any
# pair with no rating is filled with 0 so the matrix is fully numeric.
user_item_matrix = (
    df
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

print(user_item_matrix.head())


movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the rating matrix.
user_similarity = cosine_similarity(user_item_matrix)

# Wrap the raw array in a DataFrame labelled by userId on both axes so that
# similarities can be looked up by user id rather than by position.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

print(user_similarity_df.head())


userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.027283  0.059720  0.194395  0.129080  0.128152  0.158744   
2       0.027283  1.000000  0.000000  0.003726  0.016614  0.025333  0.027585   
3       0.059720  0.000000  1.000000  0.002251  0.005020  0.003936  0.000000   
4       0.194395  0.003726  0.002251  1.000000  0.128659  0.088491  0.115120   
5       0.129080  0.016614  0.005020  0.128659  1.000000  0.300349  0.108342   

userId       8         9         10   ...       601       602       603  \
userId                                ...                                 
1       0.136968  0.064263  0.016875  ...  0.080554  0.164455  0.221486   
2       0.027257  0.000000  0.067445  ...  0.202671  0.016866  0.011997   
3       0.004941  0.000000  0.000000  ...  0.005048  0.004892  0.024992   
4       0.062969  0.011361  0.031163  ...  0.085938  0.128273  0

In [5]:
def recommend_movies(user_id, num_recommendations=5, *, similarity=None, ratings=None):
    """Recommend unseen movies for ``user_id`` using user-based collaborative filtering.

    A candidate movie's score is the sum, over every *other* user who gave it a
    positive rating, of ``similarity(user_id, other) * other's rating``.  Only
    movies the target user has not rated (value ``0`` in the rating matrix) and
    that at least one other user rated positively are considered.

    Parameters
    ----------
    user_id
        Label of the target user; must be present in both frames.
    num_recommendations : int, default 5
        Maximum number of movies to return.
    similarity : pandas.DataFrame, optional
        Square user x user similarity frame.  Defaults to the notebook-level
        ``user_similarity_df``.
    ratings : pandas.DataFrame, optional
        User x movie rating frame in which ``0`` means "not rated".  Defaults
        to the notebook-level ``user_item_matrix``.

    Returns
    -------
    pandas.Series
        Scores indexed by movie id, sorted from highest to lowest, with at most
        ``num_recommendations`` entries (empty if there are no candidates).

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    if similarity is None:
        similarity = user_similarity_df
    if ratings is None:
        ratings = user_item_matrix

    if user_id not in similarity.columns or user_id not in ratings.index:
        raise KeyError(f"Unknown user_id: {user_id!r}")

    # Remove the target user by label.  Sorting and slicing off the first entry
    # is unsafe: another user with similarity 1.0 can sort ahead of the target,
    # in which case the wrong row would be discarded.
    neighbor_weights = similarity[user_id].drop(labels=user_id)

    # Only positive ratings count as votes; everything else contributes nothing.
    neighbor_ratings = ratings.loc[neighbor_weights.index]
    neighbor_ratings = neighbor_ratings.where(neighbor_ratings > 0, 0.0)

    # Candidates: not yet rated by the target AND rated by at least one neighbor.
    not_seen = ratings.loc[user_id] == 0
    has_votes = (neighbor_ratings > 0).any(axis=0)

    # Similarity-weighted sum of neighbor ratings, computed for all movies at once.
    scores = neighbor_ratings.mul(neighbor_weights, axis=0).sum(axis=0)
    scores = scores[not_seen & has_votes].astype('float64').rename_axis(None)

    return scores.sort_values(ascending=False).head(num_recommendations)

# Example: default-sized recommendation list for the user whose id is 1
recommendations = recommend_movies(user_id=1)
print(recommendations)


318     215.449703
589     169.401182
858     150.915327
2762    135.009864
4993    131.865541
dtype: float64
