In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies_df = pd.read_csv('../data/movies.csv', usecols=['movieId', 'title'])
users_rating_df = pd.read_csv('../data/user_rating_history.csv', usecols=['userId', 'movieId', 'rating'])

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
users_rating_df.head()

Unnamed: 0,userId,movieId,rating
0,42170,1,4.0
1,42170,7,4.0
2,42170,17,4.0
3,42170,24,2.0
4,42170,36,2.0


In [5]:
movies_df.isnull().mean()

movieId    0.0
title      0.0
dtype: float64

In [6]:
users_rating_df.isnull().mean()

userId     0.000000
movieId    0.000000
rating     0.017849
dtype: float64

In [7]:
print('Number of duplicated movies:', movies_df.duplicated().sum())
print('Number of duplicated ratings:', users_rating_df.duplicated().sum())

Number of duplicated movies: 0
Number of duplicated ratings: 94087


In [8]:
users_rating_df.drop_duplicates(keep='first', inplace=True)

In [9]:
users_rating_df.dropna(inplace=True)

In [10]:
print('Number of movies:', movies_df.shape[0])
print('Number of users:', users_rating_df['userId'].nunique())
print('Number of ratings:', users_rating_df.shape[0])

Number of movies: 105071
Number of users: 4418
Number of ratings: 1924470


In [11]:
movies_df['movieId'].agg(['min', 'max', 'count'])

min           1
max      300373
count    105071
Name: movieId, dtype: int64

In [12]:
users_rating_df['movieId'].agg(['min', 'max', 'count'])

min            1
max       300341
count    1924470
Name: movieId, dtype: int64

In [13]:
min_ratings = 20

items_rating_count = users_rating_df['movieId'].value_counts()
popular_items = items_rating_count[items_rating_count > min_ratings].index
users_rating_df_filtered = users_rating_df.query('movieId in @popular_items')

print('Number of popular items:', len(popular_items))

Number of popular items: 10502


In [14]:
user_movie_ratings_matrix = pd.pivot_table(data=users_rating_df_filtered, index='userId', columns='movieId', values='rating', fill_value=0)

In [15]:
user_movie_ratings_matrix.iloc[:10, :20]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
42170,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
43715,4.0,2.0,0.0,3.5,0.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0
44282,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0
50108,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50602,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,2.0,0.0
51273,5.0,2.25,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,3.25,0.0,3.0,0.0,0.0,2.25,3.0,0.0,1.75,2.0
51779,3.0,1.5,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,2.0,2.0
55083,3.5,2.0,1.75,0.0,1.0,3.5,5.0,0.0,0.0,0.0,3.75,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
55641,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.5,0.0,0.0,3.0,3.5,0.0,0.0,0.0
55904,5.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
users_similarity_matrix = cosine_similarity(user_movie_ratings_matrix)
users_similarity_df = pd.DataFrame(users_similarity_matrix, index=user_movie_ratings_matrix.index, columns=user_movie_ratings_matrix.index)

In [17]:
items_similarity_matrix = cosine_similarity(user_movie_ratings_matrix.T)
items_similarity_df = pd.DataFrame(items_similarity_matrix, index=user_movie_ratings_matrix.columns, columns=user_movie_ratings_matrix.columns)

In [18]:
def ubcf_recommendations(user_similarity, user_ratings, user_id, k=10):

    if user_id not in user_ratings.index:
        raise Exception('User ID does not exist')
    
    user_ratings_filled = user_ratings.replace(0, np.nan)

    target_profile = user_ratings_filled.loc[user_id]
    target_mean = target_profile.mean()

    target_not_rated_movies = target_profile[target_profile.isna()].index
    
    similarities = (
        user_similarity.loc[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(k)
    )

    neighbors = user_ratings_filled.loc[similarities.index]
    neighbors_mean = neighbors.mean(axis=1)

    ratings_predictions = []

    for movie in target_not_rated_movies:
        neighbors_ratings = neighbors[movie].dropna()

        if neighbors_ratings.empty:
            continue

        sim = similarities.loc[neighbors_ratings.index]
        mean_diff = neighbors_ratings - neighbors_mean.loc[neighbors_ratings.index]

        pred = target_mean + np.sum(sim * mean_diff) / np.sum(np.abs(sim))

        ratings_predictions.append((movie, pred))

    return pd.DataFrame(ratings_predictions, columns=['movieId', 'predicted_rating'])


def ibcf_recommendations(item_similarity, user_ratings, user_id, k=10):
    
    if user_id not in user_ratings.index:
        raise Exception('User ID does not exist')
    
    user_ratings_filled = user_ratings.replace(0, np.nan)
    
    target_profile = user_ratings_filled.loc[user_id]
    
    target_not_rated_movies = target_profile.loc[target_profile.isna()].index
    
    predictions = []
    
    for movie in target_not_rated_movies:
        neighbors_similarity = (item_similarity.loc[movie]
                          .drop(index=target_not_rated_movies)
                          .sort_values(ascending=False)
                                .head(k))
        target_movie_rates = target_profile.loc[neighbors_similarity.index]
        
        pred = np.sum(neighbors_similarity * target_movie_rates) / np.sum(np.abs(neighbors_similarity))
        
        predictions.append((movie, pred))
        
    return pd.DataFrame(predictions, columns=['movieId', 'predicted_rating'])


def hybrid_recommendations(user_similarity, items_similarity, user_ratings, user_id, 
                           k=30, sim_users_k=10, sim_items_k=10, alpha=0.5):
    ubcf_recommends = ubcf_recommendations(user_similarity, user_ratings, user_id, sim_users_k).set_index('movieId')
    ibcf_recommends = ibcf_recommendations(items_similarity, user_ratings, user_id, sim_items_k).set_index('movieId')
    
    ubcf_recommends.rename(columns={'predicted_rating': 'ubcf_rating'}, inplace=True)
    ibcf_recommends.rename(columns={'predicted_rating': 'ibcf_rating'}, inplace=True)
    
    final_recommends = ubcf_recommends.merge(ibcf_recommends, how='outer', left_index=True, right_index=True)
    
    final_recommends['hybrid_rating'] = ((final_recommends['ubcf_rating'].fillna(0) * alpha) + 
                                             (final_recommends['ibcf_rating'].fillna(0) * (1 - alpha)))
    
    final_recommends = final_recommends.sort_values(by='hybrid_rating', ascending=False)
    
    return final_recommends['hybrid_rating'].head(k)

In [19]:
def get_movies_name(movies_id, movies_name):
    movies = movies_name.loc[movies_name['movieId'].isin(movies_id)]
    return movies['title'].tolist()

In [21]:
try:
    recommendations = hybrid_recommendations(
        user_similarity=users_similarity_df,
        items_similarity=items_similarity_df,
        user_ratings=user_movie_ratings_matrix,
        k=20,
        user_id=42170
    )
    
    recommended_movies_name = get_movies_name(recommendations.index, movies_df)
    
    for i, movie in enumerate(recommended_movies_name, 1):
        print(f'{i}.{movie}')
        
except Exception as e:
    print(e)

1.Strictly Ballroom (1992)
2.Raising Arizona (1987)
3.Grosse Pointe Blank (1997)
4.Perfect Blue (1997)
5.Moonstruck (1987)
6.Fisher King, The (1991)
7.Monty Python Live at the Hollywood Bowl (1982)
8.Turning Red (2022)
9.Guillermo del Toro's Pinocchio (2022)
10.My Neighbors the Yamadas (HÃ´hokekyo tonari no Yamada-kun) (1999)
11.Pan's Labyrinth (Laberinto del fauno, El) (2006)
12.Muppet Family Christmas, A (1987)
13.John Mulaney: New In Town (2012)
14.Barbie (2023)
15.Hunt for the Wilderpeople (2016)
16.Piper (2016)
17.Shin Godzilla (2016)
18.The Big Sick (2017)
19.Three Billboards Outside Ebbing, Missouri (2017)
20.Booksmart (2019)
