In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')

In [3]:
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')

In [4]:
print(ratings.head())
print(movies.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [5]:
print('Liczba wszystkich ocen:', ratings.shape[0])
print('Liczba filmów w bazie:', movies.shape[0])
print('Liczba użytkowników:', ratings['userId'].nunique())
print('Średnia liczba ocen na użytkownika:', ratings.shape[0] / ratings['userId'].nunique())
print('Średnia liczba ocen na film:', ratings.shape[0] / movies.shape[0])
print('Rozkład ocen:')
print(ratings['rating'].value_counts())
print(ratings['rating'].describe())

Liczba wszystkich ocen: 100836
Liczba filmów w bazie: 9742
Liczba użytkowników: 610
Średnia liczba ocen na użytkownika: 165.30491803278687
Średnia liczba ocen na film: 10.350646684459043
Rozkład ocen:
rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [6]:
lista = ratings['movieId'].value_counts().sort_values(ascending=False)
print(lista.head(10))
print(movies[movies['movieId'] == 356].head(1))

movieId
356     329
318     317
296     307
593     279
2571    278
260     251
480     238
110     237
589     224
527     220
Name: count, dtype: int64
     movieId                title                    genres
314      356  Forrest Gump (1994)  Comedy|Drama|Romance|War


In [7]:
movie_titles = dict(zip(movies['movieId'],movies['title']))

In [8]:
#pogrupowanie ocen po filmach i obliczenie średniej
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()

def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return bayesian_avg

bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')

In [9]:
movie_stats = movie_stats.merge(movies[['movieId', 'title']])
movie_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
277,318,317,4.429022,4.39207,"Shawshank Redemption, The (1994)"
659,858,192,4.289062,4.236457,"Godfather, The (1972)"
2224,2959,218,4.272936,4.227052,Fight Club (1999)
224,260,251,4.231076,4.192646,Star Wars: Episode IV - A New Hope (1977)
46,50,204,4.237745,4.190567,"Usual Suspects, The (1995)"


In [10]:
movie_stats.sort_values('bayesian_avg', ascending=True).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
1172,1556,19,1.605263,2.190377,Speed 2: Cruise Control (1997)
2679,3593,19,1.657895,2.224426,Battlefield Earth (2000)
1372,1882,33,1.954545,2.267268,Godzilla (1998)
1144,1499,27,1.925926,2.2968,Anaconda (1997)
1988,2643,16,1.6875,2.306841,Superman IV: The Quest for Peace (1987)


In [11]:
from scipy.sparse import csr_matrix

def create_X(df):
    N = df['userId'].nunique()
    M = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))

    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [12]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)


In [13]:
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 1.7%


In [14]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """
    Finds k-nearest neighbours for a given movie id.

    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations

    Returns:
        list of k similar movie ID's
    """
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [15]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 1
similar_ids = find_similar_movies(movie_id, X, k=100, metric="euclidean")

movie_title = movie_titles[movie_id]
print(f"Because you watched {movie_title}:")
for i in similar_ids:
    print(movie_titles[i])

Because you watched Toy Story (1995):
Toy Story 2 (1999)
Mission: Impossible (1996)
Independence Day (a.k.a. ID4) (1996)
Bug's Life, A (1998)
Nutty Professor, The (1996)
Willy Wonka & the Chocolate Factory (1971)
Babe (1995)
Groundhog Day (1993)
Mask, The (1994)
Honey, I Shrunk the Kids (1989)
Monsters, Inc. (2001)
Men in Black (a.k.a. MIB) (1997)
Indiana Jones and the Temple of Doom (1984)
Twister (1996)
Austin Powers: International Man of Mystery (1997)
Mrs. Doubtfire (1993)
Beauty and the Beast (1991)
Beetlejuice (1988)
Lion King, The (1994)
Little Mermaid, The (1989)
Ghostbusters (a.k.a. Ghost Busters) (1984)
Happy Gilmore (1996)
Toy Story 3 (2010)
Shrek 2 (2004)
When Harry Met Sally... (1989)
Big (1988)
Pinocchio (1940)
Star Wars: Episode I - The Phantom Menace (1999)
Lost World: Jurassic Park, The (1997)
Home Alone (1990)
Ferris Bueller's Day Off (1986)
Mary Poppins (1964)
Shrek (2001)
Armageddon (1998)
Mars Attacks! (1996)
Batman Forever (1995)
Truman Show, The (1998)
Jerry Magu

In [17]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357243 sha256=78381d9f9aea2c078d128cbdfb

In [18]:
# importing relevant libraries
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import Reader

In [19]:
# Get minimum and maximum rating from the dataset
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[['userId','movieId', 'rating']], reader)

In [25]:
'''
Przegląd danych:
1.Podaj liczbę filmów dla dzieci
2.Pokaż rozkład ocen filmów z 1995
3.Podaj średnią ocen wszystkich filmów akcji oraz 3 filmy najwyżej
oceniane
'''
print(movies.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [26]:
# Filter and return all rows with children movies
children_movies = movies[movies['genres'].str.contains("Children", case=False, na=False)]
print(children_movies)

      movieId                                title  \
0           1                     Toy Story (1995)   
1           2                       Jumanji (1995)   
7           8                  Tom and Huck (1995)   
12         13                         Balto (1995)   
26         27                  Now and Then (1995)   
...       ...                                  ...   
9670   182731                 Pixel Perfect (2004)   
9679   183301  The Tale of the Bunny Picnic (1986)   
9697   184987             A Wrinkle in Time (2018)   
9708   187541                 Incredibles 2 (2018)   
9710   187595       Solo: A Star Wars Story (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
7                              Adventure|Children  
12                   Adventure|Animation|Children  
26                                 Children|Drama  
...                                    

In [58]:
# Function to extract last 5 characters and drop the last one
# Function to extract the year from the title
def extract_year(s):
    if '(' in s and ')' in s:
        return s[s.rfind('(')+1:s.rfind(')')]
    return None


# Apply the function to the 'title' column
movies['year'] = movies['title'].apply(extract_year)

movies_1995 = movies[movies['year'] == "1995"]
movies_1995['year'] = movies_1995['year'].astype(int)

# Filter ratings for movies from 1995
ratings_1995 = ratings[ratings['movieId'].isin(movies_1995['movieId'])]


# Display the distribution of ratings
print('Rozkład ocen:')
print(ratings_1995['rating'].value_counts())
print(ratings_1995['rating'].describe())

Rozkład ocen:
rating
3.0    1701
4.0    1625
5.0     898
2.0     522
3.5     467
4.5     308
1.0     254
2.5     244
1.5      75
0.5      50
Name: count, dtype: int64
count    6144.000000
mean        3.443848
std         1.059566
min         0.500000
25%         3.000000
50%         3.500000
75%         4.000000
max         5.000000
Name: rating, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_1995['year'] = movies_1995['year'].astype(int)


In [61]:
# Filtrujemy filmy akcji
action_movies = movies[movies['genres'].str.contains('Action')]

# Łączymy DataFrame 'movies' z 'ratings'
action_ratings = pd.merge(action_movies, ratings, on='movieId')

# Obliczamy średnią ocen dla filmów akcji
average_rating = action_ratings['rating'].mean()
print(f'Średnia ocen filmów akcji: {average_rating:.2f}')

# Znajdujemy trzy najwyżej oceniane filmy akcji
top_rated_action_movies = action_ratings.groupby('title')['rating'].mean().sort_values(ascending=False).head(3)
print('Trzy najwyżej oceniane filmy akcji:')
print(top_rated_action_movies)

Średnia ocen filmów akcji: 3.45
Trzy najwyżej oceniane filmy akcji:
title
Knock Off (1998)                                                      5.0
On the Other Side of the Tracks (De l'autre côté du périph) (2012)    5.0
Sonatine (Sonachine) (1993)                                           5.0
Name: rating, dtype: float64


In [21]:
algo = SVD()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8675  0.8808  0.8776  0.8759  0.8697  0.8743  0.0049  
MAE (testset)     0.6692  0.6755  0.6724  0.6740  0.6675  0.6717  0.0030  
Fit time          2.79    1.78    1.69    1.67    1.67    1.92    0.44    
Test time         0.36    0.14    0.29    0.17    0.14    0.22    0.09    


{'test_rmse': array([0.86753646, 0.8807968 , 0.87756638, 0.87592878, 0.86972176]),
 'test_mae': array([0.66922388, 0.67551877, 0.67237688, 0.67400656, 0.6674545 ]),
 'fit_time': (2.7938482761383057,
  1.7787654399871826,
  1.6914734840393066,
  1.672886610031128,
  1.670043706893921),
 'test_time': (0.36281371116638184,
  0.1393299102783203,
  0.29428839683532715,
  0.16559123992919922,
  0.13882136344909668)}

In [22]:
algo = KNNBasic()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9494  0.9494  0.9596  0.9480  0.9413  0.9495  0.0058  
MAE (testset)     0.7276  0.7270  0.7318  0.7288  0.7223  0.7275  0.0031  
Fit time          0.13    0.17    0.17    0.21    0.23    0.18    0.03    
Test time         1.46    1.70    1.68    2.31    1.87    1.80    0.29    


{'test_rmse': array([0.94936995, 0.94936806, 0.95957617, 0.94797381, 0.94134223]),
 'test_mae': array([0.72760013, 0.72703406, 0.73175929, 0.7287998 , 0.72228348]),
 'fit_time': (0.12828326225280762,
  0.17137432098388672,
  0.17299556732177734,
  0.21114158630371094,
  0.22558188438415527),
 'test_time': (1.4643447399139404,
  1.6961021423339844,
  1.6775038242340088,
  2.313500165939331,
  1.8667035102844238)}

In [59]:
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score["rmse"])
# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

0.8945117103423653
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [60]:
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score["rmse"])
# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [62]:
import os
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate
# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format="user item rating timestamp", sep="\t")

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8726  0.8702  0.8719  0.8776  0.8710  0.8727  0.0026  
MAE (testset)     0.6743  0.6723  0.6703  0.6754  0.6718  0.6728  0.0018  
Fit time          0.49    0.43    0.43    0.44    0.43    0.44    0.02    
Test time         0.15    0.26    0.08    0.33    0.14    0.19    0.09    


{'test_rmse': array([0.87257246, 0.87023099, 0.87185925, 0.87763482, 0.87103727]),
 'test_mae': array([0.67434882, 0.67231097, 0.67034289, 0.67537306, 0.67183208]),
 'fit_time': (0.4859163761138916,
  0.4305276870727539,
  0.42862987518310547,
  0.43869614601135254,
  0.4309701919555664),
 'test_time': (0.15207123756408691,
  0.26218461990356445,
  0.08045268058776855,
  0.32962679862976074,
  0.13805484771728516)}