# Laboratorium 2 - collaborative filtering

## Przygotowanie

 * dataset i potrzebne biblioteki są dokładnie takie same jak na poprzednim laboratorium
 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas sklearn`

## Część 1. - przygotowanie danych

In [125]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from sklearn.model_selection import train_test_split, KFold

In [126]:
# liczba parametrow opisujacych filmy i uzytkownikow zalezy tylko od nas
K = 20

In [127]:
# wczytujemy oceny uytkownikow i od razu dzielimy je na dwa zbiory - treningowy i testowy

all_ratings = pandas.read_csv('data/ml-latest-small/ratings.csv').drop(columns=['timestamp'])
train_ratings_set, test_ratings_set = train_test_split(all_ratings, test_size=0.05)
train_ratings_set

Unnamed: 0,userId,movieId,rating
64227,414,6974,4.0
56103,371,2571,5.0
29728,202,3271,4.0
48063,312,1175,5.0
80203,504,4014,3.5
...,...,...,...
59572,387,2136,2.5
66953,432,6874,2.5
6675,45,2081,5.0
54467,357,52973,4.0


In [128]:
# inicjalizujemy macierz preferencji uzytkownikow liczbami losowymi z przedzialu [0.0, 5.0]

def initialize_users(raw_ratings, k):
    users_no = raw_ratings['userId'].unique().size
    users = pandas.DataFrame(5.0 * np.random.uniform(size=(users_no, k)), index=raw_ratings['userId'].unique(), columns=['x%s' % i for i in range(k)])
    users.sort_index(inplace=True) 
    return users_no, users

users_no, users = initialize_users(train_ratings_set, K)
users

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,0.501506,0.749764,3.213952,1.200474,3.449154,4.313152,4.418306,4.523797,1.225074,0.104940,4.142189,1.538740,1.654568,3.303699,3.250518,0.009768,0.155791,3.292117,0.097054,1.500145
2,1.766784,4.325358,2.570891,3.045434,0.590929,0.249643,3.661844,3.101015,2.942316,1.985532,4.298025,4.271599,3.264561,1.524858,2.417432,2.827411,0.751503,0.258333,2.531891,4.650552
3,0.233443,1.083091,2.517809,3.902760,2.188591,4.839112,1.025561,2.753003,2.299852,2.595687,2.954830,4.702195,0.863690,1.384211,0.360106,3.031820,3.010067,1.618232,2.679171,4.241693
4,3.623176,0.973212,0.561934,0.610359,4.628393,3.571036,0.486021,4.291149,2.757818,2.947774,3.231587,1.876463,0.442913,2.537778,0.261712,2.969887,0.003614,3.408584,1.660908,0.377168
5,4.194766,1.549190,3.068092,4.673160,4.448301,2.499338,4.440243,0.798020,0.611801,0.871517,0.419353,2.604538,3.243655,0.563045,0.379393,4.391979,0.658592,3.071035,2.542469,2.563682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.781623,3.263052,1.855180,4.773897,2.197988,3.537420,1.584961,2.141117,4.103740,1.919335,0.492502,0.675038,1.305407,4.230345,2.864900,3.727144,3.243270,1.803868,3.985531,2.784826
607,3.943191,3.311585,0.864350,2.583083,4.354539,3.635779,4.668856,4.483701,1.851623,4.035368,1.254920,1.086006,1.085710,4.913379,1.498295,0.966898,4.725071,1.896465,4.675951,2.835589
608,4.881209,2.836211,4.133366,3.142467,4.510011,0.820999,4.935880,3.607559,2.837736,4.333434,0.715833,4.859654,1.482456,0.979785,4.233621,1.187384,1.399189,2.865141,2.475510,0.151455
609,3.946927,3.712318,0.615529,3.024360,1.450411,4.086718,1.367061,2.764001,3.629190,4.962255,1.421508,0.720917,0.346534,0.987606,0.906806,2.308684,2.768433,0.156244,4.260258,3.189787


In [129]:
# inicjalizujemy macierz cech filmow liczbami losowymi z przedzialu [0.0, 1.0]

def initialize_movies(raw_ratings, k):
    movies_no = raw_ratings['movieId'].unique().size
    movies = pandas.DataFrame((1/K)* np.random.uniform(size=(movies_no, k)), index=raw_ratings['movieId'].unique(), columns=['x%s' % i for i in range(k)])
    movies.sort_index(inplace=True) 
    return movies_no, movies

movies_no, movies = initialize_movies(train_ratings_set, K)
movies

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19
1,0.009723,0.003066,0.020133,0.019989,0.012048,0.016698,0.017732,0.007071,0.049400,0.034540,0.003119,0.016759,0.046109,0.044259,0.024146,0.004538,0.045859,0.038082,0.001905,0.017321
2,0.014377,0.019346,0.016112,0.046436,0.023997,0.002068,0.019167,0.036346,0.036583,0.006313,0.020900,0.026316,0.034131,0.007500,0.036000,0.031295,0.002740,0.048613,0.026404,0.032628
3,0.049320,0.047454,0.036082,0.036472,0.008377,0.028520,0.004532,0.003867,0.030237,0.016777,0.015139,0.006091,0.039255,0.004503,0.031120,0.018192,0.047500,0.038067,0.031032,0.041292
4,0.036965,0.018053,0.009713,0.033518,0.046687,0.008485,0.018125,0.005860,0.019223,0.030953,0.024105,0.030539,0.045257,0.020699,0.028247,0.048137,0.011169,0.035326,0.026028,0.041584
5,0.048789,0.011583,0.026871,0.010731,0.013171,0.049061,0.030958,0.000351,0.018455,0.032587,0.023988,0.037176,0.045293,0.018953,0.037705,0.019922,0.048935,0.025487,0.030988,0.005994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.031438,0.038431,0.048030,0.003778,0.034601,0.043147,0.033823,0.013465,0.013330,0.015649,0.005421,0.007162,0.005776,0.038595,0.031997,0.025145,0.000905,0.005030,0.021139,0.030970
193583,0.023914,0.000774,0.012042,0.036601,0.040156,0.018296,0.005822,0.012921,0.023160,0.033484,0.041168,0.032819,0.030955,0.012308,0.024128,0.023578,0.032852,0.018978,0.021143,0.028486
193585,0.010586,0.016613,0.023244,0.042477,0.007209,0.042792,0.000879,0.036096,0.016201,0.008233,0.026434,0.043954,0.022809,0.031790,0.025751,0.038718,0.012128,0.005106,0.011911,0.034961
193587,0.043347,0.045524,0.006887,0.006938,0.004362,0.022048,0.034152,0.029048,0.010893,0.000210,0.045469,0.037756,0.038319,0.000616,0.039826,0.040593,0.013524,0.007204,0.046635,0.015837


In [130]:
# za pomoca sprytnej sztuczki przeksztalcamy oceny z formatu dostarczonego przez MovieLens do uzytecznej macierzy
# zwroc uwage na to, ze czesci filmow i uzytkownikow moze brakowac po podziale datasetu na dwie czesci
#   - byc moze warto uzupelnic brakujace kolumny i wiersze

def get_ratings(raw_ratings, movies, nan=False):
    ratings = raw_ratings.pivot(*raw_ratings.columns)
    if not nan:
        ratings = ratings.fillna(0.0)
    else:
        ratings = np.ma.masked_array(ratings,mask=np.isnan(ratings))
    missing_movies = list(set(movies.index).difference(set(raw_ratings['movieId'])))
    for movie in missing_movies:
        ratings[movie] = 0.0
    ratings = ratings.reindex(sorted(ratings.columns), axis=1)
    return ratings

ratings = get_ratings(train_ratings_set, movies, False)
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Część 2. - trening modelu

In [131]:
# trenujemy model iteracyjnie, wykorzystujac gradient descent

alpha = 0.00003 # learning speed
delta = 100 # minimal upgrade for each step
lambd = 0.01 # regularization weight

def calculate_user_preferences(users, movies, ratings, users_no, movies_no, alpha, delta, lambd):
    total_error = 0.0
    users_model = users.copy()
    movies_model = movies.copy()
    
    while(True):
        previous_total_error = total_error

        predicted_ratings = users_model.dot(movies_model.T)
        errors = np.where(ratings==0.0, pandas.DataFrame(np.zeros((users_no, movies_no))), predicted_ratings - ratings)
        users_gradient = errors.dot(movies_model)
        movies_gradient = errors.T.dot(users_model)
        
        # zauwaz, ze nie uzywamy biasow i nie potrzebujemy dodatkowej macierzy do regularyzacji
        #  - wystarczy, ze uzyjemy odpowiednio macierzy users_model i movies_model
        
        # musimy zaktualizowac dwa modele
        
        users_model = users_model - alpha * (users_gradient + lambd * users_model)
        movies_model = movies_model - alpha * (movies_gradient + lambd * movies_model)

        total_error = np.sum(errors ** 2)
        print(total_error)
        progress = abs(previous_total_error - total_error)
        if progress < delta:
            break
            
    return users_model, movies_model

users_model, movies_model = calculate_user_preferences(users, movies, ratings, users_no, movies_no, alpha, delta, lambd)

600451.9006556909
410206.00718187116
334639.26354699425
290679.9851285137
261263.99404860853
239969.64901120594
223730.64497212367
210873.61551686138
200399.54074170053
191671.7306269469
184263.83335997976
177879.0808446055
172304.3521740587
167382.5803437759
162995.4328109767
159052.04265562294
155481.453847421
152227.4263620553
149244.78488831004
146496.8026021348
143953.29414650236
141589.20387248133
139383.54586654837
137318.59772655775
135379.27996230253
133552.672957194
131827.6371054309
130194.51120617591
128644.8708397941
127171.33317290392
125767.39803477476
124427.31757695999
123145.98864255256
121918.86331792359
120741.87414832733
119611.37126086389
118524.06921903999
117477.00187929669
116467.48386519126
115493.07754416618
114551.56460323978
113640.92148702148
112759.298094346
111904.99923613996
111076.46844271198
110272.2737778909
109491.09537376389
108731.71444580918
107993.00358603803
107273.9181629665
106573.48868310112
105890.81399014342
105225.05519609444
104575.43025

## Część 3. - podobieństwo elementów

In [132]:
# przygotujmy funkcje obliczajaca odleglosc cosinusowa miedzy kazda para elementow (filmow lub uzytkownikow)

def cosine_similarity(vectors):
    # przydadza nam sie dlugosci wektorow
    # poniewaz w kolejnej czesci bedziemy korzystac z masked arrays, nie mozemy uzyc najprostszej metody
    # lengths = np.linalg.norm(vectors, axis=1)
    # musimy zaimplementowac to sami
    lengths = np.sqrt(np.sum(vectors ** 2, axis=1))
    # podobienstwo liczymy w dwoch krokach - najpierw liczymy iloczyn skalarny kazdej pary wektorow
    dot_products = vectors.dot(vectors.T)
    # nastepnie dzielimy zarowno wiersze jak i kolumny przez dlugosci wektorow - przyda sie zmienna lengths oraz funkcja divide()
    similarity = dot_products.divide(lengths)
    similarity = similarity.T.divide(lengths)
    return similarity

cosine_similarity(movies_model)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.791501,0.684953,0.808576,0.808367,0.840439,0.732227,0.835389,0.919572,0.818909,...,0.858524,0.771521,0.881247,0.821131,0.815816,0.813749,0.839559,0.843640,0.817957,0.831155
2,0.791501,1.000000,0.588012,0.714536,0.679541,0.701944,0.564734,0.708471,0.781974,0.667123,...,0.919629,0.811597,0.884782,0.849798,0.835163,0.832253,0.882508,0.856445,0.758324,0.784255
3,0.684953,0.588012,1.000000,0.608358,0.704350,0.754711,0.625329,0.687938,0.640077,0.598873,...,0.677784,0.610569,0.706612,0.647795,0.650174,0.665696,0.580176,0.609971,0.592047,0.618992
4,0.808576,0.714536,0.608358,1.000000,0.852575,0.697180,0.797340,0.850807,0.748259,0.825898,...,0.802486,0.866427,0.859442,0.856649,0.927419,0.825989,0.882826,0.817239,0.816641,0.787342
5,0.808367,0.679541,0.704350,0.852575,1.000000,0.789526,0.752211,0.790094,0.834483,0.717524,...,0.788914,0.768564,0.829730,0.816628,0.845360,0.808318,0.820507,0.816202,0.807195,0.776815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.813749,0.832253,0.665696,0.825989,0.808318,0.831742,0.676442,0.820150,0.789761,0.798839,...,0.930877,0.926580,0.933702,0.929310,0.936766,1.000000,0.916269,0.937041,0.923234,0.893360
193583,0.839559,0.882508,0.580176,0.882826,0.820507,0.778682,0.765618,0.837454,0.809205,0.835689,...,0.949483,0.948798,0.953198,0.945499,0.946833,0.916269,1.000000,0.950804,0.920434,0.862510
193585,0.843640,0.856445,0.609971,0.817239,0.816202,0.831018,0.769412,0.836315,0.820692,0.795169,...,0.940775,0.928379,0.950421,0.925853,0.922646,0.937041,0.950804,1.000000,0.923937,0.867762
193587,0.817957,0.758324,0.592047,0.816641,0.807195,0.839399,0.736979,0.855286,0.745568,0.804950,...,0.920485,0.942543,0.919462,0.945682,0.883031,0.923234,0.920434,0.923937,1.000000,0.834688


In [133]:
# teraz mozemy znalexc k elementow najbardziej podobnych do danego

def k_most_similar(vectors, i, k):
    sim_matrix = cosine_similarity(vectors)
    # przyda sie funkcja np.argsort()
    sim_sorted = np.argsort(sim_matrix[i],axis=1)
    return sim_sorted[:k]
        

k_most_similar(movies, 193587, 8)

1    4570
2    6679
3     971
4    3387
5    6680
6    2428
7    5348
8       0
Name: 193587, dtype: int64

## Część 4. - Item2Item collaborative filtering

In [134]:
# sprobujmy innego podejscia - Item2Item CF przewiduje rating tylko na podstawie macierzy ratingow, bez koniecznosci trenowania
#   dodatkowych macierzy

# zauwaz, ze nie chcemy przeprowadzac obliczen tam, gdzie brakuje nam elementow
#   - oblicz macierz ratings z parametrem nan=True oraz wykorzystaj tzw. masked arrays: np.ma.array(x, mask=np.isnan(x))
#   w ten sposob unikniesz przeprowadzania niepotrzebnych obliczen

def item_to_item(ratings):
    similarity = cosine_similarity(ratings.T) # prawdopodobnie bedziesz musial zmodyfikowac te funkcje, by obslugiwala NaN
    sums = similarity.sum(axis=1)
    model = ratings.dot(similarity).divide(sums)
    return model

item_to_item(ratings)

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.296791,0.267826,0.316050,0.286236,0.250315,0.303061,0.223390,0.245064,0.257142,0.318927,...,0.006700,0.006700,0.006700,0.006700,0.006700,0.006700,0.006700,0.006700,0.006700,0.148853
2,0.025208,0.026342,0.014069,0.006537,0.022433,0.022582,0.011410,0.017532,0.010343,0.022311,...,0.053981,0.053981,0.053981,0.053981,0.053981,0.053981,0.053981,0.053981,0.053981,0.239428
3,0.009739,0.010027,0.011323,0.007701,0.008464,0.011978,0.008646,0.009084,0.013885,0.011324,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.178326,0.140490,0.146139,0.253208,0.152224,0.174875,0.162193,0.113174,0.115758,0.172111,...,0.009039,0.009039,0.009039,0.009039,0.009039,0.009039,0.009039,0.009039,0.009039,0.204147
5,0.056538,0.055071,0.047979,0.148922,0.066909,0.051761,0.058177,0.083917,0.047326,0.066744,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.691933,0.562594,0.532275,0.776843,0.570426,0.629085,0.837882,0.413940,0.390678,0.625715,...,0.047485,0.047485,0.047485,0.047485,0.047485,0.047485,0.047485,0.047485,0.047485,0.952335
607,0.212655,0.192282,0.193826,0.265532,0.183554,0.203889,0.166372,0.202154,0.203365,0.231188,...,0.004020,0.004020,0.004020,0.004020,0.004020,0.004020,0.004020,0.004020,0.004020,0.094730
608,0.651820,0.675413,0.655072,0.562990,0.587711,0.645558,0.522054,0.668648,0.560639,0.785341,...,0.042153,0.042153,0.042153,0.042153,0.042153,0.042153,0.042153,0.042153,0.042153,1.429762
609,0.038250,0.035928,0.035556,0.071400,0.044612,0.035294,0.038149,0.054096,0.053559,0.054391,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.036657


## Część 5. - porównanie algorytmów

In [135]:
# korzystając z funkcji z poprzedniego laboratorium, porownaj dwa zaimplementowane algorytmy Collaborative Filtering

import sklearn
positive_threshold = 4.0
negative_threshold = 2.0

def calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold):
    # obliczamy true_positives itp.
    test_ratings_set = test_ratings_set.set_index(['userId', 'movieId'])
    predicted_ratings.index.name = 'userId'
    predicted_ratings.columns.name = 'movieId'
    predicted_ratings = predicted_ratings.unstack().reset_index(name='rating').set_index(['userId', 'movieId'])
    
    test_ratings = test_ratings_set.values.reshape(-1)
    predicted_ratings = predicted_ratings.values.reshape(-1)

    def thresholds(x):
        return True if x <= negative_threshold or x >= positive_threshold else False

    def normalize(x):
        if x <= negative_threshold:
            return 0
        if x >= positive_threshold:
            return 1

    ratings = [(t,p) for (t,p) in  zip(test_ratings,predicted_ratings) if thresholds(t) and thresholds(p)]        

    test_ratings = [normalize(x) for (x,_) in ratings]
    predicted_ratings = [normalize(x) for (_,x) in ratings]

    true_negatives, false_positives, false_negatives, true_positives = sklearn.metrics.confusion_matrix(test_ratings, predicted_ratings, labels=[0, 1]).ravel()
    # nastepnie wszystkie metryki
    accuracy = (true_negatives + true_positives) / (true_negatives + false_positives + false_negatives + true_positives)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2*precision*recall / (precision + recall)
        
    return {
        'true_positives': true_positives,
        'true_negatives': true_negatives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [136]:
predicted_ratings = users_model.dot(movies_model.T)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

{'true_positives': 323,
 'true_negatives': 6,
 'false_positives': 92,
 'false_negatives': 51,
 'accuracy': 0.6970338983050848,
 'precision': 0.7783132530120482,
 'recall': 0.8636363636363636,
 'f1': 0.8187579214195185}

In [137]:
predicted_ratings = item_to_item(ratings)
calculate_stats(test_ratings_set, predicted_ratings, positive_threshold, negative_threshold)

  precision = true_positives / (true_positives + false_positives)


{'true_positives': 0,
 'true_negatives': 669,
 'false_positives': 0,
 'false_negatives': 2341,
 'accuracy': 0.22225913621262458,
 'precision': nan,
 'recall': 0.0,
 'f1': nan}

In [138]:
# wielokrotnie uruchamiamy trening modelu
# za każdym razem dzielimy dataset na zbior treningowy i testowy w inny sposob - klasa KFold robi to za nas
# zwroc uwage na bardzo istotny szczegol - oba modele, wytrenowany i losowy, musza byc porownywane na tym samym zbiorze testowym

n_tests = 5
results = []
i2i_results = []
random_results = []

for train, test in KFold(n_splits=n_tests, shuffle=True).split(all_ratings):# wygeneruj macierz użytkowników i ocen
    users_no, users = initialize_users(all_ratings,K)
    train_ratings = all_ratings.iloc[train]
    ratings = get_ratings(train_ratings, movies, False)
    # wytrenuj model
    print(users_no)
    print(movies_no)
    print(ratings.shape)
    users_model, movies_model = calculate_user_preferences(users, movies, ratings, users_no, movies_no, alpha, delta, lambd)
    # oblicz metryki dla wytrenowanego modelu
    test_ratings = all_ratings.iloc[test]
    predicted_ratings = users_model.dot(movies_model.T)
    results.append(calculate_stats(test_ratings, predicted_ratings, positive_threshold, negative_threshold))
    # oblicz metryki dla item to item
    i2i_prediction = item_to_item(train)
    i2i_results.append(calculate_stats(test_ratings, i2i_prediction, positive_threshold, negative_threshold))
    # oblicz metryki dla modelu losowego
    _, random_model = initialize_users(train_ratings)
    random_prediction = random_model.dot(movies.T)
    random_results.append(calculate_stats(test_ratings, random_prediction, positive_threshold, negative_threshold))

  ratings[movie] = 0.0


610
9533
(610, 9692)


ValueError: operands could not be broadcast together with shapes (610,9692) (610,9533) (610,9692) 