In [1]:
import polars as pl
import numpy as np
import pandas as pd
from preprocessing_dataset import dataset_split, OrdinalEncoder, CSRConverter, binarize_rating
from models import ALS, EASE, SLIM
from metrics import recall_k, ndcg_k

In [3]:
ml_dataset = pl.from_pandas(pd.read_table('datasets/ml-1m/ratings.dat', sep='::', encoding = "latin1", engine = "python", 
              names = ['user', 'item', 'rating', 'timestamp']))

In [12]:
beauty = pl.read_csv('datasets/Amazon_beauty/ratings_Beauty.csv', has_header=False).rename({'column_1': 'user',
                                                                                           'column_2': 'item',
                                                                                           'column_3': 'rating',
                                                                                           'column_4': 'timestamp'})
beauty.group_by('user').agg(['item', 'rating', 'timestamp']).with_columns(len=pl.col('item').list.len()).filter(pl.col('len') > 2)

user,item,rating,timestamp,len
str,list[str],list[f64],list[i64],u32
"""A1YMM5JZYWTI82…","[""B00008KA7P"", ""B00016XJ4M"", … ""B00FEMQUKS""]","[2.0, 5.0, … 1.0]","[1334707200, 1392163200, … 1398643200]",13
"""A3231JADAQOIFM…","[""B000RO0I9M"", ""B000VDAFHE"", … ""B00BMO02PG""]","[1.0, 1.0, … 2.0]","[1386460800, 1392854400, … 1386460800]",4
"""AYU3RUEUKJK07""","[""B000XVEZFC"", ""B002LFNQT4"", ""B00853EQUY""]","[5.0, 3.0, 5.0]","[1356912000, 1356912000, 1356912000]",3
"""A1U2K1NR0WSYQ2…","[""B00014DMQE"", ""B000B641EE"", … ""B00414X6F2""]","[5.0, 3.0, … 4.0]","[1393718400, 1291334400, … 1291334400]",4
"""A1TMI8ATYGJ4UI…","[""B0000636YL"", ""B0012XO2W6"", ""B004A84HHA""]","[5.0, 5.0, 5.0]","[1320364800, 1306022400, 1320364800]",3
…,…,…,…,…
"""A2GR8DKPGFS0FX…","[""B00016XJ4M"", ""B00025WZOW"", … ""B00ACB6CY0""]","[5.0, 5.0, … 5.0]","[1400544000, 1360627200, … 1381190400]",9
"""A1RV29ZI6Z1WV2…","[""B000FQN1SC"", ""B000KFX10G"", … ""B007XA49O8""]","[3.0, 5.0, … 5.0]","[1321574400, 1396137600, … 1396137600]",12
"""A3SB9RQQQBDL37…","[""B00117CH5M"", ""B002XITLL6"", ""B004ETD92O""]","[5.0, 2.0, 5.0]","[1362528000, 1394582400, 1399939200]",3
"""A3QJQQZTKFV7BJ…","[""B00008MOQE"", ""B001CT09W2"", … ""B00DX0EGZE""]","[4.0, 5.0, … 5.0]","[1361577600, 1395014400, … 1380499200]",7


In [4]:
ml_dataset

user,item,rating,timestamp
i64,i64,i64,i64
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291
…,…,…,…
6040,1091,1,956716541
6040,1094,5,956704887
6040,562,5,956704746
6040,1096,4,956715648


In [8]:
dataset = ml_dataset.filter(pl.col('rating') > 3.5).group_by('user').agg(['item', 'rating', 'timestamp'])
dataset = dataset.with_columns(len=pl.col('item').list.len()).filter(pl.col('len') > 1).drop('len')
dataset = dataset.explode(['item', 'rating', 'timestamp'])
dataset

user,item,rating,timestamp
i64,i64,i64,i64
5520,588,4,960124272
5520,590,5,960598162
5520,593,5,959738212
5520,2070,5,959791624
5520,2083,4,960124402
…,…,…,…
1045,342,5,974966194
1045,34,4,974966139
1045,357,4,974966085
1045,39,4,974966108


In [2]:
def load_ml_dataset_1m(min_value: float=3.5) -> pl.DataFrame:
    dataset = pl.from_pandas(pd.read_table('datasets/ml-1m/ratings.dat', sep='::', encoding = "latin1", engine = "python", 
              names = ['user', 'item', 'rating', 'timestamp']))
    dataset = dataset.filter(pl.col('rating') > min_value).group_by('user').agg(['item', 'rating', 'timestamp'])
    dataset = dataset.with_columns(len=pl.col('item').list.len()).filter(pl.col('len') > 1).drop('len')
    dataset = dataset.explode(['item', 'rating', 'timestamp'])
    return dataset

In [10]:
def load_ml_dataset_20m(min_value: float=3.5) -> pl.DataFrame:
    dataset = pl.read_csv('datasets/ml-20m/ratings.csv')
    dataset = dataset.rename({'userId': 'user', 'movieId': 'item'})
    dataset = dataset.filter(pl.col('rating') > min_value).group_by('user').agg(['item', 'rating', 'timestamp'])
    dataset = dataset.with_columns(len=pl.col('item').list.len()).filter(pl.col('len') > 1).drop('len')
    dataset = dataset.explode(['item', 'rating', 'timestamp'])
    return dataset

In [13]:
def load_beauty_dataset(min_value: float=3.5) -> pl.DataFrame:
    dataset = pl.read_csv('datasets/Amazon_beauty/ratings_Beauty.csv', has_header=False).rename({'column_1': 'user',
                                                                                           'column_2': 'item',
                                                                                           'column_3': 'rating',
                                                                                           'column_4': 'timestamp'})
    dataset = dataset.filter(pl.col('rating') > min_value).group_by('user').agg(['item', 'rating', 'timestamp'])
    dataset = dataset.with_columns(len=pl.col('item').list.len()).filter(pl.col('len') > 1).drop('len')
    dataset = dataset.explode(['item', 'rating', 'timestamp'])
    return dataset

In [16]:
beauty = load_beauty_dataset()

In [19]:
beauty.select(pl.col('item').unique())

item
str
"""B006CVAG6Y"""
"""B007480TQ6"""
"""B003Q8T0UE"""
"""B000CMFU1A"""
"""B004QI3BBC"""
…
"""B001A4H756"""
"""B00A9Z4V1A"""
"""B004JKUDM2"""
"""B000JVY52Y"""


In [3]:
ml_dataset = load_ml_dataset_1m()
ml_dataset

user,item,rating,timestamp
i64,i64,i64,i64
3969,2997,4,965660249
3969,3007,4,965660249
3969,2401,4,965660167
3969,2858,5,965660249
3969,480,4,965660127
…,…,…,…
5106,1020,5,962335480
5106,2770,4,962335232
5106,531,5,962334855
5106,539,4,962335497


In [2]:
ml_dataset_20m = pl.read_csv('datasets/ml-20m/ratings.csv')
ml_dataset_20m = ml_dataset_20m.rename({'userId': 'user', 'movieId': 'item'})

In [11]:
ml_dataset_20m = load_ml_dataset_20m()

In [16]:
ml_dataset_20m.select(pl.col('item').unique())

item
i64
1
2
3
4
5
…
131250
131252
131254
131256


In [20]:
train, test = dataset_split(beauty, test_size=0.2)

In [21]:
from tabulate import tabulate

als = ALS(user='user', item='item', rating='rating',
         n_factors=100,
         n_iterations=10,
         reg=0.001,
         top_k=50,
         binarize=False)
als_recommendations = als.fit_predict(train)

ease = EASE(user='user', item='item', rating='rating',
              l2_reg=1000.0, top_k=50, binarize=False, n_bathes=20)
slim = SLIM(user='user', item='item', rating='rating', 
           l1_reg=2., l2_reg=10., top_k=50, binarize=False, n_bathes=20)

ease_recommendations = ease.fit_predict(train)
#slim_recommendations = slim.fit_predict(train)

  check_blas_config()
  check_blas_config()


  0%|          | 0/10 [00:00<?, ?it/s]



MemoryError: Unable to allocate 59.8 GiB for an array with shape (126723, 126723) and data type float32

In [5]:
tab = {'model': ['ALS', 'EASE']}

for k in [1, 5, 10, 50]:
    tab['recall@' + str(k)] = [
        recall_k(als_recommendations, test, user='user', item='item', rating='scores', k=k),
        recall_k(ease_recommendations, test, user='user', item='item', rating='scores', k=k)
    ]

for k in [1, 5, 10, 50]:
    tab['NDCG@' + str(k)] = [
        ndcg_k(als_recommendations, test, user='user', item='item', rating='scores', k=k),
        ndcg_k(ease_recommendations, test, user='user', item='item', rating='scores', k=k),
    ]

In [6]:
print(tabulate(tab, headers="keys", tablefmt='github', numalign='center'))

| model   |  recall@1  |  recall@5  |  recall@10  |  recall@50  |   NDCG@1   |  NDCG@5   |  NDCG@10  |  NDCG@50  |
|---------|------------|------------|-------------|-------------|------------|-----------|-----------|-----------|
| ALS     | 0.0044457  | 0.0248324  |  0.0521179  |  0.227142   | 0.00652019 | 0.0410698 | 0.0631442 | 0.0955105 |
| EASE    | 0.00259041 | 0.00560405 | 0.00925808  |  0.0384197  | 0.00276548 | 0.0113797 | 0.0172511 | 0.0252204 |


In [6]:
tab = {'model': ['ALS', 'EASE', 'SLIM']}

for k in [1, 5, 10, 50]:
    tab['recall@' + str(k)] = [
        recall_k(als_recommendations, test, user='user', item='item', rating='scores', k=k),
        recall_k(ease_recommendations, test, user='user', item='item', rating='scores', k=k),
        recall_k(slim_recommendations, test, user='user', item='item', rating='scores', k=k)
    ]

for k in [1, 5, 10, 50]:
    tab['NDCG@' + str(k)] = [
        ndcg_k(als_recommendations, test, user='user', item='item', rating='scores', k=k),
        ndcg_k(ease_recommendations, test, user='user', item='item', rating='scores', k=k),
        ndcg_k(slim_recommendations, test, user='user', item='item', rating='scores', k=k)
    ]
    

user,item
i64,list[i64]
1998,"[908, 2005, … 948]"
5660,"[1210, 485, … 2985]"
2120,"[1198, 2959, … 300]"
2233,"[765, 1197, … 529]"
3957,"[1276, 1961, … 2100]"
…,…
4832,"[1197, 1093, … 2081]"
5987,"[593, 3481, … 296]"
494,"[1270, 1272, … 589]"
5195,"[955, 1210, … 898]"


In [5]:
ease = EASE(user='user', item='item', rating='rating',
              l2_reg=500.0, top_k=50, binarize=False, n_bathes=20)
ease_recommendations = ease.fit_predict(train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


AttributeError: module 'numpy' has no attribute 'nun_to_num'

In [7]:
als_recommendations

scores,user,item
f32,i64,i64
1.180452,1,1748
1.161999,1,111
1.150979,1,2951
1.145254,1,741
1.144478,1,1288
…,…,…
1.182347,138493,3258
1.181499,138493,293
1.179177,138493,1665
1.178197,138493,1298


In [24]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 1, np.nan, 1, 1])
c = a / b

In [25]:
c

array([ 1.,  2., nan,  4.,  5.])

In [23]:
c = np.nan_to_num(c)