# Laboratorium 1 - content-based recommender, collaborative filterng

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [19]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix

In [20]:
# definiujemy stala oznaczajaca liczbe cech kazdego z filmow
# te zmienna wykorzystamy w algorytmie collaborative filtering

K = 20

In [21]:
# tworzymy reprezentacje filmow jako wektorow cech - na podstawie gatunkow
# te macierz wykorzystamy w algorytmie content-based

genres = [
    '(no genres listed)', 
    'Action', 
    'Adventure', 
    'Animation', 
    'Children', 
    'Comedy', 
    'Crime', 
    'Documentary', 
    'Drama', 
    'Fantasy', 
    'Film-Noir', 
    'Horror', 
    'IMAX', 
    'Musical', 
    'Mystery', 
    'Romance', 
    'Sci-Fi', 
    'Thriller', 
    'War', 
    'Western'
]
genres_no = len(genres)

movies_in_genres = pandas.read_csv('ml-latest-small/movies.csv')
movies_in_genres_count = movies_in_genres.shape[0]

movies_in_genres['bias'] = 1.0
for genre in genres:
    movies_in_genres[genre] = np.where(movies_in_genres['genres'].str.contains(genre, regex=False), 1.0, 0.0)
    
movies_in_genres = movies_in_genres.drop(columns=['title', 'genres']).set_index('movieId')
movies_in_genres

Unnamed: 0_level_0,bias,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
15538,102,344,4.0
15123,97,1210,5.0
19546,125,155064,2.5
90015,585,2959,5.0
9598,64,1644,2.5
...,...,...,...
42544,288,2316,2.0
6656,45,1883,3.0
99618,610,1215,4.5
90602,590,784,1.5


In [23]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]
# te macierz wykorzystamy w algorytmie collaborative filtering

def initialize_movies(raw_ratings, k):
    movies_no = raw_ratings['movieId'].unique().size
    movies = pandas.DataFrame(np.random.uniform(size=(movies_no, k)), index=raw_ratings['movieId'].unique(), columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    return movies_no, movies

movies_random_count, movies_random = initialize_movies(train_ratings_set, genres_no+1)
movies_random

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20
1,0.981756,0.045932,0.369852,0.219193,0.814947,0.322866,0.430543,0.860843,0.797433,0.208493,...,0.782993,0.985786,0.881148,0.218235,0.966473,0.313861,0.059110,0.953487,0.426169,0.408217
2,0.265581,0.677980,0.829495,0.785469,0.319703,0.336417,0.639629,0.684850,0.778191,0.681397,...,0.977256,0.239239,0.441702,0.353714,0.886831,0.749483,0.949088,0.257823,0.164105,0.581644
3,0.457486,0.289085,0.321970,0.133794,0.896532,0.048765,0.942228,0.623183,0.776557,0.392791,...,0.940441,0.919180,0.065669,0.628505,0.259516,0.658207,0.662968,0.074608,0.384438,0.308357
4,0.264475,0.159015,0.307123,0.185412,0.063597,0.868098,0.358614,0.324399,0.467251,0.294444,...,0.253582,0.045046,0.406316,0.956715,0.322721,0.876292,0.764493,0.211171,0.817756,0.638245
5,0.759471,0.863375,0.502304,0.806767,0.965373,0.113936,0.530598,0.233490,0.408349,0.459172,...,0.249638,0.876810,0.511347,0.872368,0.535708,0.980793,0.407827,0.074906,0.375477,0.549313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.372974,0.399088,0.460030,0.983452,0.701371,0.263264,0.288611,0.794515,0.550356,0.226818,...,0.078089,0.734184,0.784216,0.590657,0.900228,0.342095,0.959572,0.328707,0.786898,0.514736
193583,0.380783,0.310624,0.860190,0.646511,0.313044,0.739399,0.239132,0.741289,0.344190,0.488950,...,0.659199,0.253433,0.255674,0.283735,0.346705,0.261226,0.151294,0.746619,0.895854,0.733946
193585,0.488331,0.127666,0.781139,0.269375,0.981084,0.202217,0.148978,0.628515,0.794193,0.051191,...,0.554993,0.326008,0.699775,0.572299,0.502618,0.047770,0.530132,0.499757,0.035729,0.403281
193587,0.134039,0.580296,0.711077,0.873218,0.552946,0.270962,0.981275,0.611565,0.753004,0.091227,...,0.115724,0.137808,0.397127,0.993882,0.653196,0.036350,0.012549,0.874984,0.676978,0.795418


In [24]:
# odkomentuj te linie, jesli implementujesz content-based recommender
movies = movies_in_genres
movies_no = movies_in_genres_count

# odkomentuj te linie, jesli implementujesz collaborative filtering
# movies = movies_random
# movies_no = movies_random_count

In [25]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

# w przypadku content-based wektory uzytkownikow powinny byc dlugosci (genres_no+1),
# a w przypadku collaborative filtering długości K
users_no, users = initialize_users(train_ratings_set, genres_no+1)
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20
1,1.532917,1.257879,0.421649,0.243115,4.893369,2.522131,0.446744,0.446971,0.522787,4.626178,...,0.771641,0.400627,1.986421,2.708896,0.753791,2.887923,1.232203,4.134511,0.963330,3.250929
2,0.432778,1.542815,3.700942,4.669502,2.169282,0.319556,0.984245,0.986020,0.088107,4.240699,...,3.358678,3.099199,0.639673,4.456207,4.851846,2.227405,2.948977,4.347362,0.949393,1.850998
3,1.494256,4.087420,4.734325,0.178597,1.601310,3.171864,3.559128,3.361441,0.908256,1.822037,...,1.618424,3.970212,1.555026,0.670050,0.616771,2.273223,2.301060,1.241242,2.347479,4.310909
4,3.966903,4.315838,1.496918,4.069292,2.939755,4.116018,0.217686,0.517449,1.316242,0.390596,...,3.991684,2.537173,3.230237,0.013269,3.035445,4.185632,3.577037,1.282318,3.787080,1.569167
5,2.104129,2.407457,4.912456,4.315071,0.915457,2.583743,0.393097,0.004285,1.849109,2.663611,...,4.394698,3.334313,4.761248,2.187076,0.957995,4.188244,3.963733,4.293611,2.594698,4.811541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.307468,0.148442,2.422173,4.673282,1.569623,0.570615,2.666433,2.645641,3.347671,2.046435,...,3.536769,2.823033,0.112505,0.999896,1.858063,4.135479,2.580416,2.756653,1.176514,1.862360
607,4.992329,4.299978,1.699682,1.201755,3.842974,4.376456,2.026469,2.824973,2.089581,3.300785,...,4.752442,1.038645,3.013312,2.148746,3.050835,3.066654,3.428135,3.486139,0.131138,3.976361
608,2.957105,3.320343,0.088723,0.233904,2.707201,2.256483,4.676099,3.183618,2.416409,0.547061,...,1.685195,2.093894,3.416487,4.114438,1.634215,2.530201,1.385592,2.614306,4.105313,1.253545
609,3.000744,2.430951,3.819476,0.514877,4.021162,0.095922,1.058634,4.479621,2.078854,4.922998,...,0.029908,2.630702,4.164556,2.957601,2.197353,1.131061,1.746106,4.660343,2.805147,0.733006


In [26]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow moze brakowac po podziale datasetu na dwie czesci - musimy uzueplnic brakujace kolumny

nan = 3.0
def get_ratings(raw_ratings, movies):
    ratings = raw_ratings.pivot(*raw_ratings.columns).fillna(nan)
    missing_movies = set(movies.index).difference(set(raw_ratings['movieId']))
    for movie in missing_movies:
        ratings[movie] = nan
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    
    # uzupelnij takze kolumny dla brakujacych uzytkownikow
    missing_users = set(movies.index).difference(set(raw_ratings['userId']))
    for user in missing_users:
        ratings[user] = nan
    ratings = ratings.reindex(sorted(ratings.columns),axis=1)
    
    
    return ratings

ratings = get_ratings(train_ratings_set, movies).copy()
ratings

  ratings[movie] = nan


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
5,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,3.0,3.0,3.0,3.0,3.0,2.5,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
607,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
608,2.5,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
609,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


## Część 2A. - trening modelu content-based

In [27]:
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20
1,1.532917,1.257879,0.421649,0.243115,4.893369,2.522131,0.446744,0.446971,0.522787,4.626178,...,0.771641,0.400627,1.986421,2.708896,0.753791,2.887923,1.232203,4.134511,0.963330,3.250929
2,0.432778,1.542815,3.700942,4.669502,2.169282,0.319556,0.984245,0.986020,0.088107,4.240699,...,3.358678,3.099199,0.639673,4.456207,4.851846,2.227405,2.948977,4.347362,0.949393,1.850998
3,1.494256,4.087420,4.734325,0.178597,1.601310,3.171864,3.559128,3.361441,0.908256,1.822037,...,1.618424,3.970212,1.555026,0.670050,0.616771,2.273223,2.301060,1.241242,2.347479,4.310909
4,3.966903,4.315838,1.496918,4.069292,2.939755,4.116018,0.217686,0.517449,1.316242,0.390596,...,3.991684,2.537173,3.230237,0.013269,3.035445,4.185632,3.577037,1.282318,3.787080,1.569167
5,2.104129,2.407457,4.912456,4.315071,0.915457,2.583743,0.393097,0.004285,1.849109,2.663611,...,4.394698,3.334313,4.761248,2.187076,0.957995,4.188244,3.963733,4.293611,2.594698,4.811541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.307468,0.148442,2.422173,4.673282,1.569623,0.570615,2.666433,2.645641,3.347671,2.046435,...,3.536769,2.823033,0.112505,0.999896,1.858063,4.135479,2.580416,2.756653,1.176514,1.862360
607,4.992329,4.299978,1.699682,1.201755,3.842974,4.376456,2.026469,2.824973,2.089581,3.300785,...,4.752442,1.038645,3.013312,2.148746,3.050835,3.066654,3.428135,3.486139,0.131138,3.976361
608,2.957105,3.320343,0.088723,0.233904,2.707201,2.256483,4.676099,3.183618,2.416409,0.547061,...,1.685195,2.093894,3.416487,4.114438,1.634215,2.530201,1.385592,2.614306,4.105313,1.253545
609,3.000744,2.430951,3.819476,0.514877,4.021162,0.095922,1.058634,4.479621,2.078854,4.922998,...,0.029908,2.630702,4.164556,2.957601,2.197353,1.131061,1.746106,4.660343,2.805147,0.733006


In [28]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent


alpha = 0.0001 # learning speed
delta = 10 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    model = users

    while(True):
        previous_total_error = total_error

        predicted_ratings = np.dot(model, movies.T) # mozemy to policzyc jako iloczyn skalarny preferencji uzytkownikow i cech filmow
        # tu stosujemy bardzo przydatna funkcje NumPy
        predicted_ratings[predicted_ratings > 5.0] = 5.0
        predicted_ratings[predicted_ratings < 0.0] = 0.0
        predicted_ratings[predicted_ratings < 1.0] = 1.0
        
        
        errors = np.where(ratings==nan, pandas.DataFrame(np.zeros((users_no, movies_no))), (predicted_ratings - ratings))
        gradient = np.dot(errors, movies) # znow iloczyn skalarny - tym razem bledow

        # tu stosujemy pewna sztuczke - rozbijamy sobie macierz z wyrazami regularyzujacymi na dwie
        # pierwsza to kolumna zlozona z zer
        regularization_k0 = pandas.DataFrame(np.zeros((users_no, 1)), index=raw_ratings['userId'].unique(), columns=['x0'])
        # druga to macierz preferencji uzytkownikow (czyli modelu) - bez pierwszej kolumny
        regularization_k = model.drop('x0', axis=1)

        # teraz sklejamy obie macierze
        regularization = pandas.concat([regularization_k0, regularization_k], axis=1)

        # najwazniejszy krok - aktualizacja modelu, czyli wszystkich wag
        model = model - alpha*(gradient + lambd*regularization)

        total_error = np.sum(errors)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return model

prediction_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

## Część 2B. - trening modelu collaborative filtering

In [29]:
# # trenujemy model iteracyjnie, wykorzystujac gradient descent

# alpha = 0.00003 # learning speed
# delta = 100 # minimal upgrade for each step
# lambd = 0.01 # regularization weight

# def calculate_user_preferences(users, movies, ratings, raw_ratings, users_no, movies_no, alpha, delta, lambd):
#     total_error = 0.0
#     users_model = users.copy()
#     movies_model = movies.copy()
    
#     while(True):
#         previous_total_error = total_error

#         predicted_ratings = # ...
#         errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
#         users_gradient = # ...
#         movies_gradient = # ...
        
#         # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
#         #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
#         # musimy zaktualizowac dwa modele
        
#         users_model = # ...
#         movies_model = # ...

#         total_error = np.sum(errors ** 2)
#         print(total_error)
#         progress = abs(previous_total_error - total_error)
#         if progress < delta:
#             break
            
#     return users_model, movies_model

# users_model, movies_model = calculate_user_preferences(users, movies, ratings, train_ratings_set, users_no, movies_no, alpha, delta, lambd)

## Część 3. - ocena jakości algorytmu

In [30]:
# na podstawie zbioru testowego i wytrenowanego modelu obliczamy metryki opisujace jakosc modelu

positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    # nastepnie wszystkie metryki
    # mozesz skorzystac z funkcji confusion_matrix, ale nie jest to konieczne
    
    tp = fp = tn = fn = 0
    for row in test_ratings_set.iterrows():
        row = row[1]
        if predicted_ratings[row['movieId']][row['userId']] <= negative_threshold:
            if row['rating'] <= negative_threshold:
                tn += 1
            elif row['rating'] >= positive_threshold:
                fn += 1
        elif predicted_ratings[row['movieId']][row['userId']] >= positive_threshold:
            if row['rating'] >= positive_threshold:
                tp += 1
            elif row['rating'] <= negative_threshold:
                fp += 1
    
    total = tp+tn+fp+fn
    
    
    accuracy = (tp+tn)/total
    precision =  tp/(tp+fp)
    recall =  tp/(tp+fn)
    f1 =  2*(1/(1/precision + 1/recall))
            
    return {
        'true_positives': tp,
        'true_negatives': tn,
        'false_positives': fp,
        'false_negatives': fn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [31]:
prediction_model.columns = movies.columns
predicted_ratings = prediction_model.dot(movies.T)
predicted_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,12.239168,6.898061,4.868128,9.493297,1.980985,6.532871,4.868128,4.298146,1.953598,6.330084,...,8.527020,11.051364,6.606153,6.426196,2.055823,9.894620,9.474058,6.158205,6.846757,1.980985
2,11.797915,8.643878,3.645455,7.886146,1.418054,9.467601,3.645455,5.422343,4.134230,13.151075,...,10.237239,6.843264,5.658745,2.602573,0.521401,10.509802,6.808867,4.673986,6.303508,1.418054
3,10.712183,5.551756,7.325132,9.145702,5.051913,10.829782,7.325132,4.843247,6.227108,7.646944,...,13.688591,4.914669,6.872484,3.094099,2.401047,12.096045,7.361729,3.313363,7.828415,5.051913
4,19.847114,16.693919,8.354594,8.740390,4.171970,7.251001,8.354594,12.141416,5.454638,10.802437,...,12.184263,7.283668,4.557767,6.897872,5.274562,13.160336,11.664021,4.344120,8.394187,4.171970
5,12.485842,11.179891,6.676046,9.337836,2.490010,11.305585,6.676046,8.995514,7.009760,15.615425,...,12.278836,5.675959,5.151800,3.014169,3.948219,10.500088,5.589442,4.760904,7.924815,2.490010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,16.265331,12.036783,10.075067,12.109098,5.947396,11.102119,10.075067,8.527555,5.706036,13.129157,...,12.513595,6.890519,7.981427,4.856488,6.635332,13.443812,11.025444,5.321699,7.274856,5.947396
607,16.706034,10.838403,10.079642,13.378421,7.013394,12.993708,10.079642,10.564929,6.685808,11.370494,...,15.980968,12.130075,10.312173,8.831296,7.078107,12.826913,11.129634,8.287308,10.528575,7.013394
608,14.941266,7.587581,10.085542,10.622351,7.563046,8.772852,10.085542,5.380180,2.986640,5.817886,...,11.719980,6.152761,8.099855,5.615952,5.329060,12.547726,12.473743,3.449466,5.689935,7.563046
609,12.866819,7.787432,5.188435,10.110423,4.057576,15.957558,5.188435,3.609742,6.818613,11.992822,...,13.644102,11.942487,8.979565,7.020498,5.078194,16.075689,12.256421,7.921333,10.839767,4.057576


In [32]:
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 2283,
 'true_negatives': 13,
 'false_positives': 573,
 'false_negatives': 32,
 'accuracy': 0.79145122371596,
 'precision': 0.7993697478991597,
 'recall': 0.9861771058315335,
 'f1': 0.8830013537033456}

In [33]:
# dla porownania - obliczmy te same metryki dla modelu losowego
# zauwaz, w jaki sposob ponownie wykorzystujemy funkcje inicjalizujaca preferencje uzytkownikow

_, random_model = initialize_users(train_ratings_set, genres_no+1)
random_model.columns = movies.columns
random_prediction = random_model.dot(movies.T)
calculate_stats(test_ratings_set, random_prediction, positive_threshold, negative_threshold)

{'true_positives': 2250,
 'true_negatives': 5,
 'false_positives': 581,
 'false_negatives': 50,
 'accuracy': 0.7813582813582813,
 'precision': 0.7947721653126104,
 'recall': 0.9782608695652174,
 'f1': 0.8770220229974666}

## Część 4. - istotność statystyczna

In [34]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
results = []
random_results = []

for train, test in KFold(n_splits=n_tests, shuffle=True).split(all_ratings):
    # wygeneruj macierz użytkowników i ocen
    train_set = all_ratings.iloc[train]
    test_set = all_ratings.iloc[test]
    ratings = get_ratings(train_set, movies)
    users_no, users = initialize_users(train_set, genres_no+1)
    # wytrenuj model
    prediction_model = calculate_user_preferences(users, movies, ratings, train_set, users_no, movies_no, alpha, delta, lambd)
    prediction_model.columns = movies.columns
    predicted_ratings = prediction_model.dot(movies.T)
    # oblicz metryki dla wytrenowanego modelu
    my_model_stats = calculate_stats(test_set, predicted_ratings, positive_threshold, negative_threshold)
    results.append(my_model_stats)
    # oblicz metryki dla modelu losowego
    _, random_model = initialize_users(train_set, genres_no+1)
    random_model.columns = movies.columns
    random_prediction = random_model.dot(movies.T)
    random_stats = calculate_stats(test_set, random_prediction, positive_threshold, negative_threshold)
    random_results.append(random_stats)

  ratings[movie] = nan


In [35]:
# obliczamy, w ilu probach wytrenowany model okazal sie lepszy od losowego
# przeprowadzamy test statystyczny - jak prawdopodobne jest to, by k pozytywnych prob bylo dzielem przypadku

def possibility_of_at_least_k_successes_in_n(k, n):
    p = 0.0
    # obliczamy kolejno prawdopodobienstwo k sukcesow, k+1 sukcesow, ...
    # przydadza Ci sie funkcje math.comb() i math.pow()
    for i in range(k, n):
        p += math.comb(n, i) * math.pow(p, i) * math.pow(1-p, n - i) 
    
    return p

p = 0.05
metric = 'f1'

positive_tests_count =  0 # w ilu przypadkach okazalismy sie lepsi niz random?
for i, stats in enumerate(results):
    if stats[metric] > random_results[i][metric]:
        positive_tests_count += 1

if possibility_of_at_least_k_successes_in_n(positive_tests_count, n_tests) <= p:
    print('We are better than random!')
else:
    print('There is no evidence we are better')

We are better than random!


In [36]:
positive_tests_count

4