In [1]:
from pathlib import Path

import pandas as pd

In [2]:
def load_data(data_dir="dataset"):
    """Read the ratings and movies tables from CSV files.

    Parameters
    ----------
    data_dir : str or os.PathLike, default "dataset"
        Directory that contains ``ratings.csv`` and ``movies.csv``. A relative
        value is resolved against the current working directory.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ratings_df, movies_df)``, in that order.

    Raises
    ------
    FileNotFoundError
        If either CSV file is missing from ``data_dir``.
    """
    # Join path components with pathlib rather than a hard-coded "\\": a
    # backslash is only a separator on Windows, so the literal
    # "dataset\\ratings.csv" names a non-existent file on Linux/macOS.
    data_path = Path(data_dir)
    ratings_df = pd.read_csv(data_path / "ratings.csv")
    movies_df = pd.read_csv(data_path / "movies.csv")
    return ratings_df, movies_df

# Load both tables once; the cells below reuse ratings_df and movies_df.
ratings_df, movies_df = load_data()

In [3]:
# Show the first five rows of each table, ratings first and movies second.
for frame in (ratings_df, movies_df):
    print(frame.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


The damped mean formula:

$$s(i) = \frac{\sum_{u \in U_i} r_{ui} + \alpha\mu}{|U_i| + \alpha}$$

Notation:
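- $s(i)$: Damped mean rating for item $i$.
- $U_i$: Set of users who rated item $i$; $r_{ui}$ is the rating user $u$ gave to item $i$.
- $\sum_{u \in U_i} r_{ui}$: Sum of all ratings for item $i$.
- $\alpha$: Damping factor, a non-negative value that determines the extent of smoothing. It acts like $\alpha$ extra ratings at the global mean, so larger values pull items with few ratings more strongly toward $\mu$.
- $\mu$: Global mean rating, i.e. the mean of all individual ratings in the dataset (not the mean of the per-item means).
- $|U_i|$: Total number of ratings for item $i$.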

In [4]:
def calculate_popularity(movies_df, ratings_df, damping_factor=10):
    """Attach rating counts, plain means and damped means to each movie.

    The damped mean of a movie is
    ``(sum of its ratings + damping_factor * global_mean) /
    (number of its ratings + damping_factor)``, where ``global_mean`` is the
    mean of every row in ``ratings_df["rating"]``.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Must contain a ``movieId`` column. It is not modified; the result is
        a new frame.
    ratings_df : pandas.DataFrame
        Must contain ``movieId`` and ``rating`` columns.
    damping_factor : int or float, default 10
        Weight of the global-mean prior (``alpha`` in the formula). Intended
        to be non-negative; ``0`` reduces the damped mean to the plain mean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``movies_df`` with three added (or overwritten) columns:

        - ``num_ratings``: integer rating count, ``0`` for unrated movies.
        - ``mean_ratings``: plain mean rating, ``NaN`` for unrated movies.
        - ``damped_mean_ratings``: damped mean. For an unrated movie this is
          the global mean when ``damping_factor > 0`` and ``NaN`` when
          ``damping_factor == 0`` (0 / 0).
    """
    # A single aggregation pass replaces three separate group-bys.
    per_movie = ratings_df.groupby("movieId")["rating"].agg(["count", "sum", "mean"])
    global_mean = ratings_df["rating"].mean()

    # Align the per-movie statistics to the rows of movies_df. Movies with no
    # ratings are absent from per_movie, so they get an explicit count and
    # sum of zero instead of NaN; this also keeps the count column integral.
    movie_ids = movies_df["movieId"]
    num_ratings = movie_ids.map(per_movie["count"]).fillna(0).astype("int64")
    sum_ratings = movie_ids.map(per_movie["sum"]).fillna(0.0)
    mean_ratings = movie_ids.map(per_movie["mean"])  # stays NaN when unrated

    damped_mean_ratings = (sum_ratings + damping_factor * global_mean) / (
        num_ratings + damping_factor
    )

    # assign() builds a new frame, so the caller's movies_df is left untouched.
    return movies_df.assign(
        num_ratings=num_ratings,
        mean_ratings=mean_ratings,
        damped_mean_ratings=damped_mean_ratings,
    )
    

In [5]:
# Score every movie with a damping factor of 10, then display the ten
# highest damped means.
popularity_df = calculate_popularity(
    movies_df=movies_df,
    ratings_df=ratings_df,
    damping_factor=10,
)
popularity_df.sort_values("damped_mean_ratings", ascending=False).iloc[:10]

Unnamed: 0,movieId,title,genres,num_ratings,mean_ratings,damped_mean_ratings
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022,4.400659
659,858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062,4.250077
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,218.0,4.272936,4.239103
922,1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969,4.205148
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204.0,4.237745,4.203344
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076,4.203125
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97.0,4.268041,4.196407
914,1213,Goodfellas (1990),Crime|Drama,126.0,4.25,4.194967
461,527,Schindler's List (1993),Drama|War,220.0,4.225,4.193546
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149.0,4.238255,4.191922
