## Imports

In [1]:
import io
import os
import gzip
import json
import scipy
import random
import warnings
import numpy as np
import numba as nb
import pandas as pd
import datetime as dt
import seaborn as sns
from tqdm import tqdm
import scipy.stats as st
from collections import OrderedDict
import scipy.sparse as sparse
from scipy.linalg import orth
import matplotlib.pyplot as plt
from numpy import linalg as lin
warnings.filterwarnings('ignore')
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from datetime import datetime as dt_dt
from scipy.sparse.linalg import spsolve
from numpy.linalg import qr as QR_decomp
from scipy.sparse import csr_matrix, find
from pandas.api.types import CategoricalDtype

## Functions

In [2]:
import sys

# Put the Colab working directory on the import path; the project helper modules
# imported in the following cells are presumably stored there — confirm.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '/content/' not in sys.path:
    sys.path.append('/content/')

In [3]:
import a_DataHelpers
import b_AlgoFunctions
import c_PredEval
import d_CorrScores

In [5]:
from a_DataHelpers import getPivotMonths,Time_DataSplit,TestTrain_DataSplit,SingleRatingMatrix
from a_DataHelpers import AllRatingMatrices,All_SingleStepRatMat,psiStep_RatMat,psiAllStep_RatMat
from a_DataHelpers import Find_NewUsersItems,adjustedPSI_DF,adjustedAllDF,get_NEWHoldout,ADJUST_mainDF

from b_AlgoFunctions import integrator,getStartingValues,integratorOnMat,last_psiTrainMat
from b_AlgoFunctions import Updt_RowMatrix,getRow_Mat,row_update, Updt_ColMatrix,getCol_Mat,colunm_update
from b_AlgoFunctions import UsersItems_RatPair,getRowCol_psiupdt,ITEMS_defferredStatus,USERS_defferredStatus
from b_AlgoFunctions import getV_listUpdate, SingleStep_UPDATE,ALLSTEPs_UPDATE

from c_PredEval import TopNPred,TQDMgetALLTopNPred, Hitrate_Eval,getAll_HitRate
from c_PredEval import TQDMgetALLTopNPred_ALLUSERS,getALLTopNPred_ALLUSERS
from c_PredEval import get_ALLRandPred,getAll_RandomHitRate,getMOSTPOP_Pred,getAll_MostPOPHitRate,getAll_MOSTPOP_Pred
from c_PredEval import topN_Index,get_ALLRandPred_2,getAll_RandomHitRate_2,getAll_MostPOPHitRate2

from d_CorrScores import no_copy_csr_matrix,build_rank_weights_matrix,rank_weighted_jaccard_index
from d_CorrScores import Updt_getAll_AvgCorr, updtCorr_4AllRanks

## Apply Functions on *AMZGames*

### Import Cleaned Data

In [7]:
# Load the cleaned Amazon Games interactions and convert the timestamp column to datetimes.
AMZG_DF = (
    pd.read_csv('/content/usedAMZGames.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
# Shape, dtypes and a blank separator line, then the rich preview of the frame.
print(AMZG_DF.shape, AMZG_DF.dtypes, '', sep='\n')
AMZG_DF

(372453, 6)
userId_int               object
productId_int            object
rating                    int64
timestamp        datetime64[ns]
userId                    int64
productId                 int64
dtype: object



Unnamed: 0,userId_int,productId_int,rating,timestamp,userId,productId
0,A3BEJOL2X8PS8I,B00004TC6E,1,2001-01-01,0,0
1,A1SQUI6BNEOB6C,B00004UE0G,1,2001-01-01,1,1
2,A2ZESFCRJL7YA0,B00002CF9M,1,2001-01-01,2,2
3,A3URVWPFJOHT72,B00000K1AF,1,2001-01-01,3,3
4,A1EE8EZLR8LJ9M,B00004WLMA,1,2001-01-02,4,4
...,...,...,...,...,...,...
372448,A2UVBD3N7449IC,B01ABTJ2BM,1,2018-08-31,28129,16648
372449,A27F1N27M1X0NM,B01D8H09LQ,1,2018-08-31,3183,16542
372450,A1J5HIF41ENSMZ,B01D8H09LQ,1,2018-08-31,16433,16542
372451,A18C19UTTAQEOD,B01GW3P9PE,1,2018-08-31,16400,17342


In [8]:
# Keep only the integer-coded ids, the rating and the timestamp.
AMZG_DF = AMZG_DF[['userId','productId','rating','timestamp']]
print(AMZG_DF.shape)

# For each id column report the number of distinct ids and the largest id.
for heading, id_column in (("For users: ", 'userId'), ("For items: ", 'productId')):
    print(heading)
    print(AMZG_DF[id_column].nunique())
    print(AMZG_DF[id_column].max())
    print()

# Time span covered by the data set.
print("Timestamp")
print(AMZG_DF['timestamp'].min())
print(AMZG_DF['timestamp'].max())
AMZG_DF

(372453, 4)
For users: 
53242
53241

For items: 
17351
17350

Timestamp
2001-01-01 00:00:00
2018-08-31 00:00:00


Unnamed: 0,userId,productId,rating,timestamp
0,0,0,1,2001-01-01
1,1,1,1,2001-01-01
2,2,2,1,2001-01-01
3,3,3,1,2001-01-01
4,4,4,1,2001-01-02
...,...,...,...,...
372448,28129,16648,1,2018-08-31
372449,3183,16542,1,2018-08-31
372450,16433,16542,1,2018-08-31
372451,16400,17342,1,2018-08-31


### Step 1: Data Splitting

In [9]:
# Build the list of pivot timestamps used to split the data by time.
# With N_TMonths=18 the recorded output holds 18 month-end dates, newest first.
pivotMonths_list  = getPivotMonths(AMZG_DF,'timestamp',N_TMonths=18)  
pivotMonths_list

[Timestamp('2018-07-31 00:00:00'),
 Timestamp('2018-06-30 00:00:00'),
 Timestamp('2018-05-31 00:00:00'),
 Timestamp('2018-04-30 00:00:00'),
 Timestamp('2018-03-31 00:00:00'),
 Timestamp('2018-02-28 00:00:00'),
 Timestamp('2018-01-31 00:00:00'),
 Timestamp('2017-12-31 00:00:00'),
 Timestamp('2017-11-30 00:00:00'),
 Timestamp('2017-10-31 00:00:00'),
 Timestamp('2017-09-30 00:00:00'),
 Timestamp('2017-08-31 00:00:00'),
 Timestamp('2017-07-31 00:00:00'),
 Timestamp('2017-06-30 00:00:00'),
 Timestamp('2017-05-31 00:00:00'),
 Timestamp('2017-04-30 00:00:00'),
 Timestamp('2017-03-31 00:00:00'),
 Timestamp('2017-02-28 00:00:00')]

In [10]:
# Time-based split into the base frame (A0_df) and lists of per-step train/test increments.
# n_train=10 yields 10 training increments (see the printed length); presumably the
# remaining pivot months go to ΔA_test — confirm in a_DataHelpers.
A0_df,ΔA_train,ΔA_test = Time_DataSplit(AMZG_DF,'timestamp',pivotMonths_list,N_TMonths= 18,n_train=10)
print('Last date in A0_df :',A0_df['timestamp'].max())
print('Num ΔA_train :', len(ΔA_train))

Last date in A0_df : 2017-02-27 00:00:00
Num ΔA_train : 10


In [11]:
# Derive the per-test-step frames from the test increments: cumulative frames, test
# increments, holdout frames and user-item frames (one entry per test step).
# NOTE(review): exact contents of each list are defined in a_DataHelpers — confirm there.
AllDF_list, PSITest_list, HOLDOUT_list, UserItemDF_list = TestTrain_DataSplit(AMZG_DF,'userId','timestamp',pivotMonths_list,ΔA_test)
print('Num AllDF_list :', len(AllDF_list))

Num AllDF_list : 8


In [None]:
# Distinct users / items in every increment: training steps first, then test steps.
for step_df in [*ΔA_train, *ΔA_test]:
    print(step_df['userId'].nunique(), step_df['productId'].nunique())

### Step 2: Find new users and items

In [12]:
# The starting snapshot is every interaction up to (and including) the last
# timestamp found in the final training increment.
tr_ts = ΔA_train[-1]['timestamp'].max()
Tstps = AMZG_DF['timestamp']
print("ΔA_train_last max date: ",tr_ts)
AllDF_start = AMZG_DF.loc[Tstps <= tr_ts]

# Date range of the starting snapshot, a blank line, then the range of the first
# cumulative test-step frame for comparison.
start_times = AllDF_start['timestamp']
print(start_times.min(), start_times.max(), sep='\n')
print()
first_step_times = AllDF_list[0]['timestamp']
print(first_step_times.min(), first_step_times.max(), sep='\n')

ΔA_train_last max date:  2017-12-30 00:00:00
2001-01-01 00:00:00
2017-12-30 00:00:00

2001-01-01 00:00:00
2018-01-30 00:00:00


In [13]:
# For each of the 8 test steps, collect the user ids and item ids that are new at that step.
# NOTE(review): "new" is presumably relative to the previous step / AllDF_start — confirm in a_DataHelpers.
New_usersList,New_itemsList = Find_NewUsersItems(AllDF_start,AllDF_list,'userId','productId',N_steps=8)
# Preview the new user ids of the first test step.
New_usersList[0]   #

array([52808, 52809, 52810, 52811, 52812, 52813, 52814, 52815, 52816,
       52817, 52818, 52819, 52820, 52821, 52822, 52823, 52824, 52825,
       52826, 52827, 52828, 52829, 52830, 52831, 52832, 52833, 52834,
       52835, 52836, 52837, 52838, 52839, 52840, 52841, 52842, 52843,
       52844, 52845, 52846, 52847, 52848, 52849, 52850, 52851, 52852,
       52853, 52854, 52855, 52856, 52857, 52858, 52859, 52860, 52861,
       52862, 52863, 52864, 52865, 52866, 52867, 52868, 52869, 52870,
       52871, 52872, 52873, 52874, 52875, 52876, 52877, 52878, 52879,
       52880, 52881, 52882, 52883, 52884, 52885, 52886, 52887, 52888,
       52889, 52890, 52891, 52892, 52893, 52894, 52895, 52896, 52897,
       52898])

In [14]:
# Preview the new item ids of the first test step.
New_itemsList[0]   

array([17329, 17330, 17331])

In [15]:
# Per test step: number of new users, number of new items, then a blank separator.
for step_users, step_items in zip(New_usersList, New_itemsList):
    print(step_users.size, step_items.size, '', sep='\n')

91
3

75
4

95
3

74
6

46
2

25
2

13
2

15
0



### Get Updt RatMat

In [16]:
# Number of distinct users and items in the starting snapshot (used as matrix dimensions below).
print(AllDF_start['userId'].nunique(),AllDF_start['productId'].nunique())   #

52808 17329


In [17]:
# Matrix dimensions = distinct users / items present in the starting snapshot.
rows_i, cols_i = (AllDF_start[id_col].nunique() for id_col in ('userId', 'productId'))
print(A0_df.shape)

# Sparse rating matrix of the base frame, allocated at the starting-snapshot size.
A0_RatMat_updt = SingleRatingMatrix(A0_df, 'userId', 'productId', rows_i, cols_i)
A0_RatMat_updt

(339913, 4)


<52808x17329 sparse matrix of type '<class 'numpy.float64'>'
	with 339644 stored elements in Compressed Sparse Row format>

In [18]:
# One sparse matrix per training increment, all with the same rows_i x cols_i shape as A0_RatMat_updt.
PSI_TrainMat_updt = AllRatingMatrices(ΔA_train,'userId','productId',rows_i ,cols_i)
PSI_TrainMat_updt

[<52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 4066 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 3053 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 2559 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 2208 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 2163 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 2166 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 1855 stored elements in Compressed Sparse Row format>,
 <52808x17329 sparse matrix of type '<class 'numpy.float64'>'
 	with 1644 stored elements in Compressed Sparse Row format>,
 <52808x

In [19]:
# Re-check the distinct user / item counts of the starting snapshot (same check as earlier in this section).
print(AllDF_start['userId'].nunique(),AllDF_start['productId'].nunique()) #

52808 17329


### Allsteps Update

In [None]:
# NOTE: disabled alternative initialisation, kept for reference only.
# It would compute the starting factors with a direct rank-50 truncated SVD (svds)
# of the starting-snapshot rating matrix instead of calling last_psiTrainMat.
# rows_i = AllDF_start['userId'].nunique()
# cols_i = AllDF_start['productId'].nunique()
# print(AllDF_start.shape)
# AllDF_startRatMat = SingleRatingMatrix(AllDF_start,'userId', 'productId',rows_i,cols_i)

# U_start,S_start,V_start = svds(AllDF_startRatMat,k=50)
# V_start = V_start.T
# print()
# print("U_start shape: ",U_start.shape)
# print("S_start shape: ",S_start.shape)
# print("V_start shape: ",V_start.shape)

In [20]:
# Rank-50 starting factors obtained from the base matrix and the training increments.
U_start, S_start, V_start = last_psiTrainMat(A0_RatMat_updt, PSI_TrainMat_updt, k=50)
print()

# Report the shape of each factor.
for label, factor in (
    ("U_start shape: ", U_start),
    ("S_start shape: ", S_start),
    ("V_start shape: ", V_start),
):
    print(label, factor.shape)

100%|██████████| 10/10 [00:02<00:00,  3.80it/s]


U_start shape:  (52808, 50)
S_start shape:  (50, 50)
V_start shape:  (17329, 50)





In [21]:
# Seed the per-step factor histories with the starting factors (index 0 = start).
U_list, S_list, V_list = [U_start], [S_start], [V_start]
print(len(U_list),len(V_list))

1 1


In [22]:
# Run the update over all test steps (rank k=50, Forced_Orth disabled).
# The call extends U_list / S_list / V_list (they are returned and re-bound here) and also
# returns the deferred items/users, the in-domain user/item id lists, the original->updated
# id dictionaries and the per-step lists of updated users/items.
# NOTE(review): precise semantics of each output live in b_AlgoFunctions — confirm there.
DItems_, DUsers_,In_DomainUSERS,In_DomainITEMS,userID_dict,itemID_dict,AllUpdtUSERS_,AllUpdtITEMS_,U_list,S_list,V_list = ALLSTEPs_UPDATE(AllDF_start,
                                            UserItemDF_list,New_itemsList,New_usersList,U_list,S_list,V_list,'userId','productId',Nsteps=8,k=50,Forced_Orth=False)

100%|██████████| 7/7 [00:22<00:00,  3.27s/it]


#### Output Check

In [23]:
# Largest user id in the starting snapshot; ids above it were added during the test steps.
AllDF_start['userId'].max() 

52807

In [24]:
# Entries beyond the starting-snapshot sizes (17329 items, 52808 users) are the
# ids that entered the domain during the test steps.
items_added = In_DomainITEMS[17329:]
users_added = In_DomainUSERS[52808:]
print(len(items_added), items_added, '', len(users_added), users_added, sep='\n')

22
[17329, 17330, 17331, 17332, 17333, 17334, 17335, 17339, 17340, 17342, 17336, 17337, 17338, 17341, 17344, 17343, 17345, 17347, 17348, 17346, 17349, 17350]

430
[52808, 52809, 52810, 52811, 52812, 52813, 52814, 52815, 52816, 52817, 52818, 52819, 52820, 52821, 52822, 52823, 52824, 52825, 52826, 52827, 52828, 52829, 52830, 52832, 52833, 52834, 52835, 52836, 52837, 52838, 52839, 52840, 52841, 52842, 52843, 52844, 52845, 52846, 52847, 52848, 52849, 52850, 52851, 52852, 52855, 52856, 52857, 52858, 52859, 52861, 52862, 52863, 52864, 52865, 52866, 52867, 52868, 52869, 52871, 52872, 52873, 52874, 52875, 52876, 52877, 52878, 52879, 52880, 52881, 52882, 52883, 52884, 52885, 52886, 52887, 52888, 52889, 52890, 52891, 52892, 52893, 52894, 52895, 52896, 52897, 52898, 52899, 52900, 52901, 52902, 52903, 52904, 52905, 52906, 52908, 52909, 52910, 52911, 52912, 52913, 52914, 52915, 52916, 52917, 52918, 52919, 52920, 52921, 52922, 52923, 52924, 52925, 52926, 52927, 52928, 52929, 52930, 52931, 52932, 529

In [25]:
#deferred items and users
print(len(DItems_))
print(DItems_)
print()
print(len(DUsers_))
print(DUsers_)      

0
[]

4
[53231, 53236, 53237, 53238]


In [26]:
# Show the original -> updated id mapping for items added after the starting snapshot
# (keys from position 17329 onwards). Key = original id, value = updated id.
for x in list(itemID_dict)[17329:]:
    print ("Id: {}, UpdtId: {} ".format(x,  itemID_dict[x]))   ##id ==key || updtedid == values  ,itemID_dict

Id: 17329, UpdtId: 17329 
Id: 17330, UpdtId: 17330 
Id: 17331, UpdtId: 17331 
Id: 17332, UpdtId: 17332 
Id: 17333, UpdtId: 17333 
Id: 17334, UpdtId: 17334 
Id: 17335, UpdtId: 17335 
Id: 17339, UpdtId: 17336 
Id: 17340, UpdtId: 17337 
Id: 17342, UpdtId: 17338 
Id: 17336, UpdtId: 17339 
Id: 17337, UpdtId: 17340 
Id: 17338, UpdtId: 17341 
Id: 17341, UpdtId: 17342 
Id: 17344, UpdtId: 17343 
Id: 17343, UpdtId: 17344 
Id: 17345, UpdtId: 17345 
Id: 17347, UpdtId: 17346 
Id: 17348, UpdtId: 17347 
Id: 17346, UpdtId: 17348 
Id: 17349, UpdtId: 17349 
Id: 17350, UpdtId: 17350 


In [27]:
# Show the original -> updated id mapping for users added after the starting snapshot
# (keys from position 52808 onwards). Key = original id, value = updated id.
for x in list(userID_dict)[52808:]:
    print ("Id: {}, UpdtId: {} ".format(x,  userID_dict[x]))   ##id ==key || updtedid == values 

Id: 52808, UpdtId: 52808 
Id: 52809, UpdtId: 52809 
Id: 52810, UpdtId: 52810 
Id: 52811, UpdtId: 52811 
Id: 52812, UpdtId: 52812 
Id: 52813, UpdtId: 52813 
Id: 52814, UpdtId: 52814 
Id: 52815, UpdtId: 52815 
Id: 52816, UpdtId: 52816 
Id: 52817, UpdtId: 52817 
Id: 52818, UpdtId: 52818 
Id: 52819, UpdtId: 52819 
Id: 52820, UpdtId: 52820 
Id: 52821, UpdtId: 52821 
Id: 52822, UpdtId: 52822 
Id: 52823, UpdtId: 52823 
Id: 52824, UpdtId: 52824 
Id: 52825, UpdtId: 52825 
Id: 52826, UpdtId: 52826 
Id: 52827, UpdtId: 52827 
Id: 52828, UpdtId: 52828 
Id: 52829, UpdtId: 52829 
Id: 52830, UpdtId: 52830 
Id: 52832, UpdtId: 52831 
Id: 52833, UpdtId: 52832 
Id: 52834, UpdtId: 52833 
Id: 52835, UpdtId: 52834 
Id: 52836, UpdtId: 52835 
Id: 52837, UpdtId: 52836 
Id: 52838, UpdtId: 52837 
Id: 52839, UpdtId: 52838 
Id: 52840, UpdtId: 52839 
Id: 52841, UpdtId: 52840 
Id: 52842, UpdtId: 52841 
Id: 52843, UpdtId: 52842 
Id: 52844, UpdtId: 52843 
Id: 52845, UpdtId: 52844 
Id: 52846, UpdtId: 52845 
Id: 52847, U

In [28]:
# Number of stored factor pairs, then the (U, V) shapes per entry; a blank line
# separates the starting factors (index 0) from the per-step updates.
print(len(U_list),len(V_list))
for position, (u_factor, v_factor) in enumerate(zip(U_list, V_list)):
    print(u_factor.shape, v_factor.shape)
    if position == 0:
        print()

9 9
(52808, 50) (17329, 50)

(52894, 50) (17330, 50)
(52968, 50) (17332, 50)
(53061, 50) (17336, 50)
(53139, 50) (17344, 50)
(53188, 50) (17346, 50)
(53212, 50) (17349, 50)
(53225, 50) (17349, 50)
(53238, 50) (17351, 50)


In [29]:
# Number of updated users / items recorded for each test step.
for i,j in zip(AllUpdtUSERS_,AllUpdtITEMS_):
    print(len(i),len(j))

52894 17330
52968 17332
53061 17336
53139 17344
53188 17346
53212 17349
53225 17349
53238 17351


In [30]:
# Final sizes of the in-domain user and item lists after all steps.
print(len(In_DomainUSERS),len(In_DomainITEMS))

53238 17351


Data Adjustments

#### Data Adjustment

In [31]:
# Re-express the holdout frames in the updated id space.
newHOLDOUT_LIST = get_NEWHoldout(
    HOLDOUT_list, userID_dict, itemID_dict, AllUpdtUSERS_, AllUpdtITEMS_,
    'userId', 'productId', n=8,
)
print(len(newHOLDOUT_LIST), '', sep='\n')

# Row counts before vs. after the adjustment, one line per test step.
for holdout_before, holdout_after in zip(HOLDOUT_list, newHOLDOUT_LIST):
    print(holdout_before.shape[0], holdout_after.shape[0])

100%|██████████| 8/8 [00:00<00:00, 86.91it/s]

8

865 857
687 676
816 800
718 713
560 556
465 463
405 398
243 239





In [32]:
# Re-express the per-step user-item frames in the updated id space; the result carries
# 'Updated_UserID' / 'Updated_ItemID' columns (read in the loop below).
newUserItem_list = adjustedAllDF(UserItemDF_list,userID_dict,itemID_dict,AllUpdtUSERS_,AllUpdtITEMS_,'userId','productId',n=8)
print()
# Distinct updated users / items per step.
for df in newUserItem_list:
    print(df['Updated_UserID'].nunique(),df['Updated_ItemID'].nunique())


52894 17330
52968 17332
53061 17336
53139 17344
53188 17346
53212 17349
53225 17349
53238 17351


In [33]:
# Re-express the per-step test increments in the updated id space.
new_PSIDFlist = adjustedPSI_DF(PSITest_list,userID_dict,itemID_dict,AllUpdtUSERS_,AllUpdtITEMS_,'userId','productId',n=8)
# Row counts before vs. after the adjustment, one line per test step.
for old,new in zip(PSITest_list,new_PSIDFlist):
    print(old.shape[0],new.shape[0])

805 805
742 742
1068 1068
837 837
593 593
336 336
178 178
90 90


#### Get UserItem MAT

In [34]:
# One sparse user-item matrix per test step, built on the updated ids.
# No shape is passed here; the recorded output shows a different (growing) shape per step.
UserItem_MatUpdt = All_SingleStepRatMat(newUserItem_list,'Updated_UserID','Updated_ItemID')
print(len(UserItem_MatUpdt))
UserItem_MatUpdt

8


[<52894x17330 sparse matrix of type '<class 'numpy.float64'>'
 	with 363560 stored elements in Compressed Sparse Row format>,
 <52968x17332 sparse matrix of type '<class 'numpy.float64'>'
 	with 365165 stored elements in Compressed Sparse Row format>,
 <53061x17336 sparse matrix of type '<class 'numpy.float64'>'
 	with 366920 stored elements in Compressed Sparse Row format>,
 <53139x17344 sparse matrix of type '<class 'numpy.float64'>'
 	with 368571 stored elements in Compressed Sparse Row format>,
 <53188x17346 sparse matrix of type '<class 'numpy.float64'>'
 	with 369882 stored elements in Compressed Sparse Row format>,
 <53212x17349 sparse matrix of type '<class 'numpy.float64'>'
 	with 370778 stored elements in Compressed Sparse Row format>,
 <53225x17349 sparse matrix of type '<class 'numpy.float64'>'
 	with 371421 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 371915 stored elements in Compressed Sparse Row f

In [35]:
# Row counts of every stored U / V factor (index 0 = starting factors), for comparison
# with the per-step matrix shapes above.
print(len(U_list),len(V_list))
for u,v in zip(U_list,V_list):
    print(u.shape[0],v.shape[0])   

9 9
52808 17329
52894 17330
52968 17332
53061 17336
53139 17344
53188 17346
53212 17349
53225 17349
53238 17351


In [36]:
# Starting-snapshot user / item counts, then the updated user / item counts per test step.
print(AllDF_start['userId'].nunique(),AllDF_start['productId'].nunique())
print()
for i,j in zip(AllUpdtUSERS_,AllUpdtITEMS_):
    print(len(i),len(j))

52808 17329

52894 17330
52968 17332
53061 17336
53139 17344
53188 17346
53212 17349
53225 17349
53238 17351


#### Prediction

In [None]:
# Sanity check: one user-item matrix per holdout frame is expected.
print(len(UserItem_MatUpdt),len(newHOLDOUT_LIST))

8 8


In [37]:
# Drop the starting factor (index 0) so that there is exactly one V factor per test step.
Vpsi_listUpdt = V_list[1:]
for v in Vpsi_listUpdt:
    print(v.shape)

(17330, 50)
(17332, 50)
(17336, 50)
(17344, 50)
(17346, 50)
(17349, 50)
(17349, 50)
(17351, 50)


In [38]:
# V factor of the first test step, used for the single-step prediction example below.
V_1 = Vpsi_listUpdt[0]
V_1.shape

(17330, 50)

In [39]:
# Top-10 item predictions for the first test step; the recorded result has one row per holdout user.
Top_NPred = TopNPred(UserItem_MatUpdt[0],newHOLDOUT_LIST[0],V_1,'Updated_UserID',10)
Top_NPred

array([[13664, 12255, 12012, ..., 12776,  5008, 13920],
       [12230,  4114,  7941, ...,  7353,  6190,  6665],
       [ 2819,  1287,  1565, ..., 10727,  2820,  1359],
       ...,
       [13413, 12012, 14484, ..., 14002, 14542, 14009],
       [ 5687,  1359,   829, ...,  3390,  3027, 12776],
       [ 5030,  1489,  5008, ...,  9805,  5687, 12231]])

In [40]:
# Hit rate of the first-step predictions against the first holdout frame
# (the helper prints hits / users / rate and returns the rate).
Hitrate_Eval(newHOLDOUT_LIST[0],Top_NPred,'Updated_UserID','Updated_ItemID')

Number of hits:  43
Total Num of users:  857
Recommendation HitRate:  0.050175029171528586


0.050175029171528586

In [41]:
# Top-10 predictions for every test step (with a progress bar), using the updated-model V factors.
AllUpdt_pred =  TQDMgetALLTopNPred(UserItem_MatUpdt,newHOLDOUT_LIST,Vpsi_listUpdt,'Updated_UserID',N = 10)
print(len(AllUpdt_pred))
# Preview only the first two steps.
AllUpdt_pred[:2]

8it [00:01,  4.66it/s]

8





[array([[13664, 12255, 12012, ..., 12776,  5008, 13920],
        [12230,  4114,  7941, ...,  7353,  6190,  6665],
        [ 2819,  1287,  1565, ..., 10727,  2820,  1359],
        ...,
        [13413, 12012, 14484, ..., 14002, 14542, 14009],
        [ 5687,  1359,   829, ...,  3390,  3027, 12776],
        [ 5030,  1489,  5008, ...,  9805,  5687, 12231]]),
 array([[11083,  9421,  4238, ..., 11588, 11929,  3601],
        [ 3819,  3455,  2848, ...,  4807,  1760,  8121],
        [10727, 13923, 15655, ..., 16535,  6903,  5030],
        ...,
        [10129, 12255,  8072, ..., 13648, 13218, 13672],
        [  928,    59,   172, ...,  1879,  3754,  3115],
        [  237,    59,   172, ...,   270,  7812,   154]])]

In [42]:
# Hit rate per test step for the updated model, plus the average over all steps.
# NOTE(review): LowerBand_updt / UpperBand_updt are presumably confidence bounds — confirm in c_PredEval.
AllSteps_HitR_updt, LowerBand_updt, Avg_HitR_updt, UpperBand_updt   = getAll_HitRate(newHOLDOUT_LIST,AllUpdt_pred,'Updated_UserID','Updated_ItemID')

Number of hits:  43
Total Num of users:  857
Recommendation HitRate:  0.050175029171528586
Number of hits:  26
Total Num of users:  676
Recommendation HitRate:  0.038461538461538464
Number of hits:  20
Total Num of users:  800
Recommendation HitRate:  0.025
Number of hits:  15
Total Num of users:  713
Recommendation HitRate:  0.021037868162692847
Number of hits:  17
Total Num of users:  556
Recommendation HitRate:  0.030575539568345324
Number of hits:  23
Total Num of users:  463
Recommendation HitRate:  0.04967602591792657
Number of hits:  10
Total Num of users:  398
Recommendation HitRate:  0.02512562814070352
Number of hits:  2
Total Num of users:  239
Recommendation HitRate:  0.008368200836820083
Average HitRate for All Recommendations:  0.031052478782444423


### Regular PSI 

#### Get Rating Matrices

In [43]:
# Add the updated user / item ids to the full data set.
newAMZg_DF = ADJUST_mainDF(
    AMZG_DF, userID_dict, itemID_dict, AllUpdtUSERS_, AllUpdtITEMS_,
    'userId', 'productId',
)
print(newAMZg_DF.shape)

# Largest id and number of distinct ids: first for the original id columns,
# then (after a blank line) for the updated id columns.
id_column_pairs = (('userId', 'productId'), ('Updated_UserID', 'Updated_ItemID'))
for position, (user_col, item_col) in enumerate(id_column_pairs):
    if position:
        print()
    print(newAMZg_DF[user_col].max(), newAMZg_DF[item_col].max())
    print(newAMZg_DF[user_col].nunique(), newAMZg_DF[item_col].nunique())
newAMZg_DF.head()

(372449, 6)
53241 17350
53238 17351

53237 17350
53238 17351


Unnamed: 0,userId,productId,rating,timestamp,Updated_UserID,Updated_ItemID
0,0,0,1,2001-01-01,0,0
1,1,1,1,2001-01-01,1,1
2,2,2,1,2001-01-01,2,2
3,3,3,1,2001-01-01,3,3
4,4,4,1,2001-01-02,4,4


In [44]:
# rows_i / cols_i are re-bound here to the FINAL updated user / item counts
# (they previously held the starting-snapshot sizes).
rows_i, cols_i = (newAMZg_DF[id_col].nunique() for id_col in ('Updated_UserID', 'Updated_ItemID'))
print(A0_df.shape)

# Base rating matrix allocated at the final size for the regular PSI run.
A0_Rating_matrix = SingleRatingMatrix(A0_df, 'userId', 'productId', rows_i, cols_i)  ##changes start within steps...
A0_Rating_matrix

(339913, 4)


<53238x17351 sparse matrix of type '<class 'numpy.float64'>'
	with 339644 stored elements in Compressed Sparse Row format>

In [45]:
# Largest updated user / item id in each test increment (must fit inside rows_i x cols_i).
for df in new_PSIDFlist:
    print(df['Updated_UserID'].max(),df['Updated_ItemID'].max())

52893 17329
52967 17327
53060 17335
53138 17343
53187 17345
53210 17348
53222 17346
53237 17265


In [46]:
# Number of training increments vs. number of (id-adjusted) test increments.
print("Length of PSI_Train :",len(ΔA_train))
print("Length of PSI_Test  :",len(new_PSIDFlist))

Length of PSI_Train : 10
Length of PSI_Test  : 8


In [47]:
# Training-increment matrices, all allocated at the final rows_i x cols_i size.
PSI_train_matrix = AllRatingMatrices(ΔA_train,'userId','productId',rows_i ,cols_i)
PSI_train_matrix   

[<53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 4066 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 3053 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 2559 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 2208 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 2163 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 2166 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 1855 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 1644 stored elements in Compressed Sparse Row format>,
 <53238x

In [48]:
# Test-increment matrices built on the updated ids, all at the final rows_i x cols_i size.
PSI_test_matrix = AllRatingMatrices(new_PSIDFlist,'Updated_UserID','Updated_ItemID',rows_i ,cols_i)                                                                      
print(len(PSI_test_matrix))
PSI_test_matrix


8


[<53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 805 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 742 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 1068 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 837 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 593 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 336 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 178 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 90 stored elements in Compressed Sparse Row format>]

In [49]:
# Per-step user-item matrices for the regular PSI run; unlike UserItem_MatUpdt these are
# all allocated at the same final rows_i x cols_i size.
UserItemMAT_reglist = AllRatingMatrices(newUserItem_list,'Updated_UserID','Updated_ItemID',rows_i ,cols_i) 
UserItemMAT_reglist  

[<53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 363560 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 365165 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 366920 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 368571 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 369882 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 370778 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 371421 stored elements in Compressed Sparse Row format>,
 <53238x17351 sparse matrix of type '<class 'numpy.float64'>'
 	with 371915 stored elements in Compressed Sparse Row f

In [50]:
# Distinct users / items in the last cumulative frame vs. in the full data set.
print(AllDF_list[-1]['userId'].nunique(),AllDF_list[-1]['productId'].nunique())
print(AMZG_DF['userId'].nunique(),AMZG_DF['productId'].nunique())

53242 17351
53242 17351


#### PSI Prediction

In [51]:
                                                                                   #integratorOnMat(A0,ΔA_train_matrix,ΔA_test_matrix,k):
# Run the regular (fixed-size) integrator at rank 50; the result holds one entry per test step
# (length 8 in the recorded output) and is used below as the list of V factors.
Vpsi_listReg =  integratorOnMat(A0_Rating_matrix,PSI_train_matrix,PSI_test_matrix,k=50)  #PSI == projector splitting integrator
print(len(Vpsi_listReg))  

8


In [52]:
# Top-10 predictions of the regular PSI model for the first test step.
Top10Pred_psi = TopNPred(UserItemMAT_reglist[0], newHOLDOUT_LIST[0],Vpsi_listReg[0],'Updated_UserID',N=10)
print(Top10Pred_psi.shape)
Top10Pred_psi

(857, 10)


array([[13664, 12255, 12012, ..., 12776,  5008, 13920],
       [12230,  4114,  7941, ...,  7353,  6190,  6665],
       [ 2819,  1287,  1565, ..., 10727,  2820,  1359],
       ...,
       [13413, 12012, 14484, ..., 14002, 14542, 14009],
       [ 5687,  1359,   829, ...,  3390,  3027, 12776],
       [ 5030,  1489,  5008, ...,  9805,  5687, 12231]])

In [53]:
# Hit rate of the regular PSI first-step predictions against the first holdout frame.
Hitrate_Eval(newHOLDOUT_LIST[0],Top10Pred_psi,'Updated_UserID','Updated_ItemID')

Number of hits:  43
Total Num of users:  857
Recommendation HitRate:  0.050175029171528586


0.050175029171528586

In [54]:
# Top-10 predictions of the regular PSI model for every test step (with a progress bar).
All_PSI_PRED =  TQDMgetALLTopNPred(UserItemMAT_reglist,newHOLDOUT_LIST,Vpsi_listReg,'Updated_UserID',N = 10)
print(len(All_PSI_PRED))
# Preview only the first two steps.
All_PSI_PRED[:2]

8it [00:01,  4.89it/s]

8





[array([[13664, 12255, 12012, ..., 12776,  5008, 13920],
        [12230,  4114,  7941, ...,  7353,  6190,  6665],
        [ 2819,  1287,  1565, ..., 10727,  2820,  1359],
        ...,
        [13413, 12012, 14484, ..., 14002, 14542, 14009],
        [ 5687,  1359,   829, ...,  3390,  3027, 12776],
        [ 5030,  1489,  5008, ...,  9805,  5687, 12231]]),
 array([[11083,  9421,  4238, ..., 11588, 11929,  3601],
        [ 3819,  3455,  2848, ...,  4807,  1760,  8121],
        [10727, 13923, 15655, ..., 16535,  6903,  5030],
        ...,
        [10129, 12255,  8072, ..., 13648, 13218, 13672],
        [  928,    59,   172, ...,  1879,  3754,  3115],
        [  237,    59,   172, ...,   270,  7812,   154]])]

In [55]:
# Hit rate per test step for the regular PSI model, plus the average over all steps.
# NOTE(review): LowerBand / Avg_HitRate / UpperBand carry no model suffix and are re-assigned
# by the baseline evaluation cells further down.
AllSteps_Hitrate_psi, LowerBand, Avg_HitRate, UpperBand   = getAll_HitRate(newHOLDOUT_LIST,All_PSI_PRED,'Updated_UserID','Updated_ItemID')

Number of hits:  43
Total Num of users:  857
Recommendation HitRate:  0.050175029171528586
Number of hits:  26
Total Num of users:  676
Recommendation HitRate:  0.038461538461538464
Number of hits:  21
Total Num of users:  800
Recommendation HitRate:  0.02625
Number of hits:  15
Total Num of users:  713
Recommendation HitRate:  0.021037868162692847
Number of hits:  17
Total Num of users:  556
Recommendation HitRate:  0.030575539568345324
Number of hits:  22
Total Num of users:  463
Recommendation HitRate:  0.047516198704103674
Number of hits:  10
Total Num of users:  398
Recommendation HitRate:  0.02512562814070352
Number of hits:  2
Total Num of users:  239
Recommendation HitRate:  0.008368200836820083
Average HitRate for All Recommendations:  0.03093875038071656


In [56]:
# Distinct updated users / items in each per-step user-item frame.
for df in newUserItem_list:
    print(df['Updated_UserID'].nunique(),df['Updated_ItemID'].nunique())

52894 17330
52968 17332
53061 17336
53139 17344
53188 17346
53212 17349
53225 17349
53238 17351


### RandRec: Test

In [None]:
# Sanity check before the random baseline: one user-item matrix per holdout frame.
print(len(UserItem_MatUpdt),len(newHOLDOUT_LIST))

8 8


In [None]:
# Random-recommendation baseline: 10 items per holdout user for every test step.
# NOTE(review): no random seed is set in the visible notebook, so these predictions
# presumably differ between runs — confirm how get_ALLRandPred samples.
All_RandPred = get_ALLRandPred(UserItem_MatUpdt,newHOLDOUT_LIST,'Updated_UserID',N=10)
# Preview only the first two steps.
All_RandPred[:2]  

8it [00:02,  3.81it/s]


[array([[ 4997,  1277,  3397, ..., 13074, 11424,  8355],
        [13658,  9719, 11786, ...,  9565,  6288,  2143],
        [ 2222,  4426, 11526, ...,  3724, 12245,  9353],
        ...,
        [11445,  3541, 12022, ..., 16711,  7446, 15335],
        [11184, 10147,  7645, ...,    46,  1534,  4738],
        [10238, 16054,  2390, ..., 16763,  3514, 12209]]),
 array([[ 1982,  3235, 11099, ...,  2938,   765,  9281],
        [14573,  9665,  5641, ...,  7224,  3170, 10120],
        [10139, 13444, 11819, ..., 10274,  5166,  6747],
        ...,
        [ 2915,  7827, 12274, ..., 17293, 12459,  4018],
        [ 2825,  1436, 13636, ..., 11140, 12051,  5315],
        [12441, 13650,  9944, ..., 14026,  2711,  4632]])]

In [None]:
# Hit rate per test step for the random baseline, plus the average over all steps.
# NOTE(review): LowerBand, Avg_HitRate and UpperBand are the same names assigned by the
# regular-PSI getAll_HitRate cell, so this assignment overwrites those PSI results.
AllSteps_Hitrate, LowerBand, Avg_HitRate, UpperBand = getAll_RandomHitRate(newHOLDOUT_LIST,All_RandPred,'Updated_UserID','Updated_ItemID')

Number of hits:  0
Total Num of users:  857
Recommendation HitRate:  0.0
Number of hits:  0
Total Num of users:  676
Recommendation HitRate:  0.0
Number of hits:  0
Total Num of users:  800
Recommendation HitRate:  0.0
Number of hits:  0
Total Num of users:  713
Recommendation HitRate:  0.0
Number of hits:  1
Total Num of users:  556
Recommendation HitRate:  0.0017985611510791368
Number of hits:  0
Total Num of users:  463
Recommendation HitRate:  0.0
Number of hits:  0
Total Num of users:  398
Recommendation HitRate:  0.0
Number of hits:  0
Total Num of users:  239
Recommendation HitRate:  0.0
Average HitRate for All Recommendations:  0.0002248201438848921


### Most Pop Rec

In [None]:
# Re-express the per-step test increments in the updated id space (input of the
# most-popular baseline below).
# The user / item column names are passed exactly as in the earlier adjustedPSI_DF call in
# the "Data Adjustment" section; this cell originally omitted them, which was inconsistent
# with that call and presumably raised a TypeError (the cell has no recorded output).
new_PSIDFlist = adjustedPSI_DF(PSITest_list,userID_dict,itemID_dict,AllUpdtUSERS_,AllUpdtITEMS_,'userId','productId',n=8)
# Row counts before vs. after the adjustment, one line per test step.
for old,new in zip(PSITest_list,new_PSIDFlist):
    print(old.shape[0],new.shape[0])

In [None]:
# Most-popular baseline: top-10 items per holdout user for every test step.
# In the recorded output every user of a step receives the same 10 items.
All_MostPOPRED_List =  getAll_MOSTPOP_Pred(new_PSIDFlist,newHOLDOUT_LIST,'Updated_UserID','Updated_ItemID',N=10)
# Preview only the first two steps.
All_MostPOPRED_List[:2]

[array([[ 8818, 12079, 17129, ...,  7965, 13218, 16535],
        [ 8818, 12079, 17129, ...,  7965, 13218, 16535],
        [ 8818, 12079, 17129, ...,  7965, 13218, 16535],
        ...,
        [ 8818, 12079, 17129, ...,  7965, 13218, 16535],
        [ 8818, 12079, 17129, ...,  7965, 13218, 16535],
        [ 8818, 12079, 17129, ...,  7965, 13218, 16535]]),
 array([[17226,  8818, 12079, ..., 17129, 13981, 17008],
        [17226,  8818, 12079, ..., 17129, 13981, 17008],
        [17226,  8818, 12079, ..., 17129, 13981, 17008],
        ...,
        [17226,  8818, 12079, ..., 17129, 13981, 17008],
        [17226,  8818, 12079, ..., 17129, 13981, 17008],
        [17226,  8818, 12079, ..., 17129, 13981, 17008]])]

In [None]:
# Hit rate per test step for the most-popular baseline, plus the average over all steps.
# NOTE(review): AllSteps_Hitrate, LowerBand, Avg_HitRate and UpperBand are also assigned by the
# random-baseline cell (and the last three by the regular-PSI cell), so earlier results
# are overwritten here.
AllSteps_Hitrate, LowerBand, Avg_HitRate, UpperBand  = getAll_MostPOPHitRate(newHOLDOUT_LIST,All_MostPOPRED_List,'Updated_UserID','Updated_ItemID')

Number of hits:  42
Total Num of users:  857
Recommendation HitRate:  0.049008168028004666
Number of hits:  26
Total Num of users:  676
Recommendation HitRate:  0.038461538461538464
Number of hits:  19
Total Num of users:  800
Recommendation HitRate:  0.02375
Number of hits:  21
Total Num of users:  713
Recommendation HitRate:  0.029453015427769985
Number of hits:  12
Total Num of users:  556
Recommendation HitRate:  0.02158273381294964
Number of hits:  38
Total Num of users:  463
Recommendation HitRate:  0.08207343412526998
Number of hits:  28
Total Num of users:  398
Recommendation HitRate:  0.07035175879396985
Number of hits:  5
Total Num of users:  239
Recommendation HitRate:  0.02092050209205021
Average HitRate for All Recommendations:  0.0419501438426941


In [None]:
# Sanity check: one user-item matrix per holdout frame.
print(len(UserItem_MatUpdt),len(newHOLDOUT_LIST))

8 8


### CORRELATION Report


In [57]:
# First-step matrix shape for the updated model (grows per step) vs. the regular PSI
# model (fixed at the final size).
print(UserItem_MatUpdt[0].shape)
print(UserItemMAT_reglist[0].shape)

(52894, 17330)
(53238, 17351)


#### getAll User Pred

In [58]:
# Top-10 predictions for ALL users (not only holdout users) at every test step, updated model.
# This is slow: the recorded run took about two minutes.
AllUsers_updtPRED =  TQDMgetALLTopNPred_ALLUSERS(UserItem_MatUpdt,Vpsi_listUpdt, N=10) 
print(len(AllUsers_updtPRED))
# Preview only the first two steps.
AllUsers_updtPRED[:2]

8it [02:10, 16.27s/it]

8





[array([[ 1359,   237,    59, ...,   873,   727,   154],
        [10129,  8072, 10567, ...,  1359,  5687, 12099],
        [  791, 10129,  8072, ...,  4344, 15655,   928],
        ...,
        [13413, 12012, 14484, ..., 14002, 14542, 14009],
        [ 5687,  1359,   829, ...,  3390,  3027, 12776],
        [ 5030,  1489,  5008, ...,  9805,  5687, 12231]]),
 array([[ 1359,   237,    59, ...,   873,   727,   154],
        [10129,  8072, 10567, ...,  1359,  5687, 12099],
        [  791, 10129,  8072, ...,  4344, 15655,   928],
        ...,
        [16429, 12579, 17226, ..., 16184, 14505, 15652],
        [ 4148, 15202,  1489, ...,  7326, 15454, 13664],
        [11588, 10567, 12202, ...,  5030, 12099,  4832]])]

In [59]:
# Shape of the third step's prediction array: (users at that step, 10).
AllUsers_updtPRED[2].shape

(53061, 10)

####  SingleRank Corr 

In [60]:
# Step indices 1..7 passed to the score computation below; there are 8 prediction
# arrays, i.e. one fewer index than arrays.
# NOTE(review): presumably each index compares a step with its predecessor — confirm in d_CorrScores.
AllSteps = list(range(1,8))
print(len(AllSteps))
print(len(AllUsers_updtPRED))
AllSteps

7
8


[1, 2, 3, 4, 5, 6, 7]

In [61]:
# Per-user scores between the updated model's top-10 lists at the steps in AllSteps;
# the recorded result is an object array holding one score array per step index.
UpdtPSICorr_ = Updt_getAll_AvgCorr(AllUsers_updtPRED,newUserItem_list,AllSteps,'Updated_UserID')  #,
UpdtPSICorr_

array([array([1.        , 1.        , 1.        , ..., 0.99056095, 0.45352501,
       0.8372122 ]),
       array([1.        , 1.        , 1.        , ..., 0.96643133, 0.85080241,
       0.4196961 ]),
       array([1.        , 1.        , 1.        , ..., 0.48421476, 1.        ,
       0.36773835]),
       array([1.        , 1.        , 1.        , ..., 0.92690249, 1.        ,
       0.8923215 ]),
       array([1.        , 1.        , 1.        , ..., 0.59107566, 0.63731145,
       0.01933435]),
       array([1.        , 1.        , 1.        , ..., 0.85662181, 1.        ,
       0.49898457]),
       array([1.        , 1.        , 1.        , ..., 0.38701494, 1.        ,
       0.70757663])], dtype=object)

In [62]:
# Show how many per-step score arrays were computed, then the mean score of each.
# A bare `.shape` expression is only displayed when it is the last line of a cell,
# so it is printed explicitly here (it was previously a silent no-op).
print(UpdtPSICorr_.shape)
for step_scores in UpdtPSICorr_:
    print(step_scores.mean())

0.9899609562814747
0.9905641149516281
0.9904058503197232
0.9933173287467842
0.9955900374158145
0.9974810324489751
0.9984659483219057


#### RegPSI Corr_

In [63]:
# Top-10 predictions for ALL users at every test step, regular PSI model.
# This is slow: the recorded run took about two minutes.
AllUsersRegPIS_PRED =  TQDMgetALLTopNPred_ALLUSERS(UserItemMAT_reglist,Vpsi_listReg, N=10) 
print(len(AllUsersRegPIS_PRED))
# Preview only the first two steps.
AllUsersRegPIS_PRED[:2]

8it [02:03, 15.48s/it]

8





[array([[ 1359,   237,    59, ...,   873,   727,   154],
        [10129,  8072, 10567, ...,  1359,  5687, 12099],
        [  791, 10129,  8072, ...,  4344, 15655,   928],
        ...,
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350],
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350],
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350]]),
 array([[ 1359,   237,    59, ...,   873,   727,   154],
        [10129,  8072, 10567, ...,  1359,  5687, 12099],
        [  791, 10129,  8072, ...,  4344, 15655,   928],
        ...,
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350],
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350],
        [ 5786,  5788,  5793, ...,  5782,  5781, 17350]])]

In [64]:
# Shape of the third step's prediction array for the regular PSI model: (all users at final size, 10).
AllUsersRegPIS_PRED[2].shape

(53238, 10)

####  RegPSI SingleRankCorr 

In [65]:
# Step indices 1..7 for the regular PSI score computation (same value as the AllSteps
# defined for the updated model above; re-defined here).
AllSteps = list(range(1,8))
print(len(AllSteps))
print(len(AllUsersRegPIS_PRED))
AllSteps  

7
8


[1, 2, 3, 4, 5, 6, 7]

In [66]:
# Per-user scores between the regular PSI model's top-10 lists at the steps in AllSteps;
# the recorded result is an object array holding one score array per step index.
RegPSICorr_ = Updt_getAll_AvgCorr(AllUsersRegPIS_PRED,newUserItem_list,AllSteps,'Updated_UserID')  #,
RegPSICorr_

array([array([1.        , 1.        , 1.        , ..., 0.99056095, 0.45352501,
       0.8372122 ]),
       array([1.        , 1.        , 1.        , ..., 0.96643133, 0.85080241,
       0.4196961 ]),
       array([1.        , 1.        , 1.        , ..., 0.51950592, 1.        ,
       0.54851568]),
       array([1.        , 1.        , 1.        , ..., 0.92690249, 1.        ,
       0.8923215 ]),
       array([1.        , 1.        , 1.        , ..., 0.5850961 , 0.63731145,
       0.01933435]),
       array([1.        , 1.        , 1.        , ..., 0.90698876, 1.        ,
       0.50325866]),
       array([1.        , 1.        , 1.        , ..., 0.38701494, 1.        ,
       0.72796442])], dtype=object)

In [67]:
# Show how many per-step score arrays were computed, then the mean score of each.
# A bare `.shape` expression is only displayed when it is the last line of a cell,
# so it is printed explicitly here (it was previously a silent no-op).
print(RegPSICorr_.shape)
for step_scores in RegPSICorr_:
    print(step_scores.mean())

0.9868358626012891
0.9871147545573127
0.987093266074119
0.9910057051073075
0.9940084226728061
0.9961010544835398
0.9980445634054407
