# Sistemi preporuka zasnovani na uzajamnom filtriranju koristeci matricu kovarijansi

In [7]:
import pandas as pd
import numpy as np
from sklearn import model_selection

In [8]:
header = ['user', 'item', 'rating', 'timestemp']

In [9]:
data = pd.read_table('data/ratings.dat', sep = '::', names = header, engine = 'python')

In [10]:
data.head()

Unnamed: 0,user,item,rating,timestemp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user       1000209 non-null  int64
 1   item       1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestemp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [22]:
data['item'].max()

3952

In [12]:
train_data, test_data = model_selection.train_test_split(data, test_size = 0.2, random_state = 7, stratify = data['user'])

In [13]:
train_data

Unnamed: 0,user,item,rating,timestemp
943475,5689,1203,4,961829920
69354,464,1224,5,976227084
914137,5526,1892,2,963099584
51981,342,700,2,976339072
388280,2271,1080,4,974576205
...,...,...,...,...
52162,343,2643,2,988597096
42967,294,141,4,976542811
350745,2059,3148,5,974755816
700348,4190,435,1,965326908


In [14]:
test_data

Unnamed: 0,user,item,rating,timestemp
277825,1676,110,5,974801037
176345,1114,2049,4,974915238
991738,5990,1010,3,956869994
518162,3197,357,3,968636483
827051,4964,1094,4,962659085
...,...,...,...,...
618050,3745,1377,3,966141593
573830,3513,2616,2,966975373
3520,26,2759,2,978139536
477810,2934,3020,3,971488893


In [15]:
number_of_users = train_data['user'].unique().shape[0]

In [16]:
number_of_users

6040

In [17]:
number_of_items = train_data['item'].unique().shape[0]

In [18]:
number_of_items

3673

In [19]:
max_item = train_data['item'].max()

In [20]:
max_item

3952

In [23]:
train_data['item'].min()

1

In [24]:
train_data['user'].min()

1

In [25]:
train_data['user'].max()

6040

In [135]:
user_item_matrix = np.zeros((number_of_users, max_item))
for row in data.itertuples(): # da li ovde treba train_data??
    user_item_matrix[row[1]-1, row[2]-1] = row[3]

In [27]:
user_item_matrix

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [28]:
zero = np.sum(user_item_matrix == 0)

In [29]:
zero

22869871

In [30]:
user_item_matrix.shape

(6040, 3952)

In [31]:
fields = user_item_matrix.shape[0]*user_item_matrix.shape[1]

In [32]:
fields

23870080

In [33]:
fields - zero

1000209

In [34]:
common_items = (user_item_matrix[0,:] != 0) & (user_item_matrix[1,:] !=0)

In [35]:
np.sum(common_items==True)

7

In [36]:
user_item_matrix[0, common_items]

array([5., 4., 4., 4., 5., 3., 5.])

In [136]:
def calculate_covariance_between_users(ui_matrix, i, j):
    if (i != j):
        ratings_i = ui_matrix[i,:]
        ratings_j = ui_matrix[j,:]
    
        common_items = (ratings_i != 0) & (ratings_j != 0)
    
        part1 = np.sum(ratings_i[common_items]*ratings_j[common_items])*(1/number_of_items)
        part2 = np.sum(ratings_i)*np.sum(ratings_j)*(1/(number_of_items**2)) # da li je number of items ili max items
    
        return part1 - part2
    else:
        return float('inf')
    

In [38]:
calculate_covariance_between_users(user_item_matrix, 1, 2)

0.04466326966808587

In [39]:
calculate_covariance_between_users(user_item_matrix, 2, 1)

0.04466326966808587

In [40]:
calculate_covariance_between_users(user_item_matrix, 344, 2413)

0.012823282962944955

In [41]:
calculate_covariance_between_users(user_item_matrix, 2413, 344)

0.012823282962944955

In [42]:
calculate_covariance_between_users(user_item_matrix, 1, 1)

inf

In [43]:
covariance_matrix = np.zeros((number_of_users, number_of_users))

In [44]:
for i in range(number_of_users):
    for j in range(i, number_of_users):
        covariance_matrix[i, j] = calculate_covariance_between_users(user_item_matrix, i, j)
        covariance_matrix[j, i] = covariance_matrix[i, j]

In [45]:
covariance_matrix

array([[        inf,  0.02751123,  0.02585686, ..., -0.00125062,
         0.05640338,  0.05806205],
       [ 0.02751123,         inf,  0.04466327, ...,  0.0111867 ,
         0.01764059,  0.13718989],
       [ 0.02585686,  0.04466327,         inf, ...,  0.01548663,
         0.02536252,  0.05442457],
       ...,
       [-0.00125062,  0.0111867 ,  0.01548663, ...,         inf,
         0.03134499,  0.02634259],
       [ 0.05640338,  0.01764059,  0.02536252, ...,  0.03134499,
                inf,  0.14608297],
       [ 0.05806205,  0.13718989,  0.05442457, ...,  0.02634259,
         0.14608297,         inf]])

In [46]:
covariance_matrix[1,2]

0.04466326966808587

In [47]:
covariance_matrix[2, 1]

0.04466326966808587

In [48]:
covariance_matrix[0,0]

inf

In [49]:
covariance_matrix[6039, 0]

0.058062050434036075

In [50]:
covariance_matrix[0, 6039]

0.058062050434036075

In [51]:
np.sum(covariance_matrix == 0)

0

In [52]:
def GetKSimilarUser(user_i, K):
    user_values = covariance_matrix[user_i,:]
    values = user_values.argsort()[-K-1:]
    
    return values[0:K]

In [53]:
print(GetKSimilarUser(0, 10))
for i in GetKSimilarUser(0,10):
    print(covariance_matrix[0, i])

[1119 4578 1265 3589 5099 1050 1696 5761 1111 2072]
0.17063969427161021
0.17095842695488206
0.1710781370208086
0.17117108836611622
0.1790000525538308
0.1795268509677873
0.18254473061121293
0.188287700572733
0.20221869079586735
0.20607365141422063


In [54]:
def GetKDSimilarUser(user_i, K):
    user_values = covariance_matrix[user_i,:]
    
    return user_values.argsort()[:K]

In [137]:
np.flipud(covariance_matrix[0,:].argsort()[-10-1:][0:10]) # funkcija za obrtanje niza

array([2072, 1111, 5761, 1696, 1050, 5099, 3589, 1265, 4578, 1119])

In [68]:
a = covariance_matrix[0,:].argsort()[:10]
print(a)
for i in a:
    print(covariance_matrix[0, i])

[ 277 1068 3376  209 5928 2154  666 2351 6004 1409]
-0.008561456368201183
-0.007816881995302179
-0.007741646257274056
-0.007289787085826336
-0.006088535489290619
-0.005479681940361558
-0.005437876072137065
-0.0053151269271374865
-0.0053151269271374865
-0.005216393919203044


In [74]:
ru_average = user_item_matrix[0, :].mean()
ru_average

s = np.zeros(10)
s

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [138]:
def predict_rating(user_u, item_i, Nus, Nud, COV, UI, K, alpha): # ovo nam ne treba vise ceo kod je u algoritmu
    ru_average = UI[user_u-1, UI[user_u-1, :] != 0].mean()
    similar = np.zeros(K)
    divide_s = np.zeros(K)
    disimilar = np.zeros(K)
    divide_ds = np.zeros(K)
    
    for i in range(K):
        similar[i] = COV[user_u-1, Nus[i]]*(UI[Nus[i],item_i-1] - UI[Nus[i], :].mean())
        divide_s[i] = np.abs(COV[user_u-1, Nus[i]])
        disimilar[i] = np.abs(COV[user_u-1, Nud[i]])*(UI[Nud[i],item_i-1] - UI[Nud[i], :].mean())
        divide_ds = np.abs(COV[user_u-1, Nud[i]])
        
    rui = ru_average + alpha * (np.sum(similar)/np.sum(divide_s)) + (1-alpha) * (np.sum(disimilar)/np.sum(divide_ds))
    
    return rui

In [105]:
Nus = GetKSimilarUser(200, 10)
Nud = GetKDSimilarUser(200, 10)

In [106]:
predict_rating(200, 100, Nus, Nud, covariance_matrix, user_item_matrix, 10, 0.8)

2.5696688900933564

In [103]:
user_item_matrix[200, user_item_matrix[200,:] != 0].mean()

3.954022988505747

In [98]:
similar

array([0., 2., 0., 0., 0., 0., 0., 0., 0., 0.])

In [140]:
def CFCM(UI, user_u, alpha, K, N):
    Nus = GetKSimilarUser(user_u, K)
    Nud = GetKDSimilarUser(user_u, K)
    
    # prosecna ocena koju je dao user, izracunata samo nad itemima koji nisu 0
    ru_average = UI[user_u-1, UI[user_u-1, :] != 0].mean()
    # niz predvidjenih ocena za sve iteme
    all_predicted_ratings = np.zeros(max_item)
    # drugo, krace ime za matricu kovarijansi
    COV = covariance_matrix
    
    
    for item_i in range(max_item):
        
        # za racunanje dela iznad razlomacke crte u prvom delu gde se posmatraju slicni
        similar = np.zeros(K)
        # za racunanje dela ispod razlomacke crte u prvom delu gde se posmatraju slicni
        divide_s = np.zeros(K)
        # za racunanje dela iznad razlomacke crte u drugom delu gde se posmatraju razliciti
        disimilar = np.zeros(K)
        # za racunanje dela ispod razlomacke crte u drugom delu gde se posmatraju razliciti
        divide_ds = np.zeros(K)
    
        for i in range(K):
            similar[i] = COV[user_u-1, Nus[i]]*(UI[Nus[i],item_i-1] - UI[Nus[i], :].mean())
            divide_s[i] = np.abs(COV[user_u-1, Nus[i]])
            disimilar[i] = np.abs(COV[user_u-1, Nud[i]])*(UI[Nud[i],item_i-1] - UI[Nud[i], :].mean())
            divide_ds = np.abs(COV[user_u-1, Nud[i]])
        
        # formula iz algoritma
        rui = ru_average + alpha * (np.sum(similar)/np.sum(divide_s)) + (1-alpha) * (np.sum(disimilar)/np.sum(divide_ds))
        # za svaki element stavljamo u niz
        all_predicted_ratings[item_i] = rui
        
    # uzimamo prvih N najboljih predloga za usera u     
    Result = np.flipud(all_predicted_ratings.argsort())[:N]
    return Result

In [141]:
CFCM(user_item_matrix, 200, 0.8, 10, 5)

array([ 260, 1196, 1097, 3471, 2657])

In [None]:
# U narednom delu sledi proba da li matrica kovarijansi dobro radi
# proba se sastoji u tome da se primeni na podatke iz naucnog rada i da se utvrdi da li se dobija ista matrica
# Dobija se ista matrica jeeeeeeej

In [142]:
# Aaaaaaliiiii postoji caka, ovde je koriscen broj itema koji je ocigledno 6, a u nasem radu je koriscen broj itema 
# number of items koji je 3673 dok je max item 3952.... to se mora razmisliti 

In [143]:
ui_proba = np.array([0, 5, 2, 0, 3, 0, 4, 0, 0, 3, 0, 4, 0, 0, 2, 0, 0, 2, 5, 0, 0, 3, 0, 0, 0, 5, 5, 0, 0, 3])

In [144]:
ui_proba

array([0, 5, 2, 0, 3, 0, 4, 0, 0, 3, 0, 4, 0, 0, 2, 0, 0, 2, 5, 0, 0, 3,
       0, 0, 0, 5, 5, 0, 0, 3])

In [145]:
ui_proba = ui_proba.reshape(5, 6)

In [146]:
def calculate_covariance_proba(ui_matrix, i, j):
    if (i != j):
        ratings_i = ui_matrix[i,:]
        ratings_j = ui_matrix[j,:]
    
        common_items = (ratings_i != 0) & (ratings_j != 0)
    
        part1 = np.sum(ratings_i[common_items]*ratings_j[common_items])*(1/6)
        part2 = np.sum(ratings_i)*np.sum(ratings_j)*(1/(6**2))
    
        return part1 - part2
    else:
        return float('inf')
    

In [147]:
cov_proba = np.zeros((5, 5))
for i in range(5):
    for j in range(i, 5):
        cov_proba[i, j] = calculate_covariance_proba(ui_proba, i, j)
        cov_proba[j, i] = cov_proba[i, j]

In [148]:
cov_proba

array([[        inf, -3.05555556, -0.44444444, -2.22222222,  2.22222222],
       [-3.05555556,         inf,  0.11111111,  2.38888889, -1.97222222],
       [-0.44444444,  0.11111111,         inf, -0.88888889,  1.22222222],
       [-2.22222222,  2.38888889, -0.88888889,         inf, -2.88888889],
       [ 2.22222222, -1.97222222,  1.22222222, -2.88888889,         inf]])