In [28]:
from pathlib import Path
from dataloader import DataLoader

In [2]:
# Locate the project's config files relative to the notebook's working
# directory: the "configs" folder is expected one level above it.
# Path.cwd() is used instead of the IPython-only `%pwd` magic so the cell is
# plain Python; `pwd` is kept as a string, which is what `%pwd` returned.
pwd = str(Path.cwd())
CONFIGS_PATH = Path(pwd).parent / "configs"
DATASET_CONFIG = CONFIGS_PATH / "datasets.json"

In [3]:
# Build a loader from the datasets config file, then load the dataset
# requested under the name "yelpnyc".
# NOTE(review): presumably "yelpnyc" is a key defined in datasets.json -- confirm.
loader = DataLoader(DATASET_CONFIG)
dataset = loader.load_dataset("yelpnyc")

In [4]:
# Show the metadata table. Per the output below its columns are
# Reviewer_id, Product_id, Rating, Label and Date.
dataset.metadata_df

Unnamed: 0,Reviewer_id,Product_id,Rating,Label,Date
0,923,0,3,-1,2014-12-08
1,924,0,3,-1,2013-05-16
2,925,0,4,-1,2013-07-01
3,926,0,4,-1,2011-07-28
4,927,0,4,-1,2010-11-01
...,...,...,...,...,...
359047,161146,349,5,1,2014-02-06
359048,116424,349,5,1,2014-01-31
359049,161147,349,5,1,2014-01-30
359050,97930,349,5,1,2014-01-25


In [5]:
# Summarize the dataset size: one line each for users, items and ratings.
print(
    f"{dataset.n_users} users",
    f"{dataset.m_items} items",
    f"{dataset.n_interactions} ratings",
    sep="\n",
)

160225 users
923 items
359052 ratings


In [6]:
# Access the review dates. The column is already parsed into pandas datetimes
# (dtype datetime64[ns], as the output below shows) rather than raw strings.
dataset.metadata_df[dataset.METADATA_DATE]

0        2014-12-08
1        2013-05-16
2        2013-07-01
3        2011-07-28
4        2010-11-01
            ...    
359047   2014-02-06
359048   2014-01-31
359049   2014-01-30
359050   2014-01-25
359051   2014-01-25
Name: Date, Length: 359052, dtype: datetime64[ns]

In [44]:
# Group the review dates by reviewer once and reuse that grouping for both
# aggregates, instead of building the same groupby twice.
dates_by_user = dataset.metadata_df.groupby(dataset.METADATA_USER_ID)[dataset.METADATA_DATE]
first_date = dates_by_user.min()  # earliest review date of each reviewer
last_date = dates_by_user.max()   # latest review date of each reviewer

print(last_date.values)
print(first_date.values)
# Span between each reviewer's first and last review, in whole days. Both
# Series come from the same grouping, so their value arrays line up
# element by element.
print((last_date.values - first_date.values).astype('timedelta64[D]'))

['2014-12-08T00:00:00.000000000' '2013-05-16T00:00:00.000000000'
 '2013-07-01T00:00:00.000000000' ... '2014-02-09T00:00:00.000000000'
 '2014-02-06T00:00:00.000000000' '2014-01-30T00:00:00.000000000']
['2013-11-04T00:00:00.000000000' '2013-05-16T00:00:00.000000000'
 '2013-07-01T00:00:00.000000000' ... '2014-02-09T00:00:00.000000000'
 '2014-02-06T00:00:00.000000000' '2014-01-30T00:00:00.000000000']
[399   0   0 ...   0   0   0]


In [22]:
# Mean star rating of every item, computed from the metadata table.
avg_ratings = (
    dataset.metadata_df
    .groupby(dataset.METADATA_ITEM_ID)[dataset.METADATA_STAR_RATING]
    .mean()
)

print(avg_ratings.values)


[4.00952381 4.32806324 3.84337349 4.16949153 4.03290676 4.37209302
 4.2007874  3.75184638 3.87654321 3.84269663 4.47389558 4.
 3.72781065 4.02247191 3.90825688 3.93902439 3.97309417 4.13768116
 3.91987179 4.0371517  4.3220339  4.36666667 4.23404255 4.19594595
 4.         4.07692308 4.43693694 3.81920904 3.5785124  3.9537037
 4.4375     4.43548387 3.90086207 4.17640449 4.10909091 4.20289855
 3.99361022 4.02380952 3.80697051 3.70758929 4.31286895 4.14643545
 3.70588235 3.80036134 3.9318542  3.79487179 4.04968944 4.23684211
 3.985      4.73529412 4.35171386 3.80042017 3.9679098  4.31333333
 3.69760479 4.4251497  4.02531646 3.91726619 4.64102564 4.22891566
 4.38383838 4.43209877 4.26351931 3.9371808  4.16296296 4.42248062
 4.54678899 4.03278689 4.         3.94029851 3.91563275 3.92213115
 3.96598866 4.25185185 4.06349206 4.11428571 4.11428571 3.92026578
 3.96369048 4.28       4.         4.29957806 3.57571324 4.19939577
 4.2037037  4.04285714 4.575      3.83505155 4.03676471 2.9841629
 3.88

In [27]:
# Sparse user-to-item (u2i) matrix holding the star ratings, shown densely.
# NOTE(review): .toarray() materializes the whole matrix as a dense array. If
# it has the same 160225x923 shape as graph_u2i, that is roughly 148M entries;
# fine for a quick look, but prefer indexing or the sparse repr otherwise.
dataset.rated_graph_u2i.toarray()

array([[3, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# Look up the rating of reviewer id 97930 for item 349. The matrix is indexed
# by position, so the raw ids are first translated through
# reviewer_to_index / item_to_index into a row index and a column index.
dataset.rated_graph_u2i[dataset.reviewer_to_index[97930], dataset.item_to_index[349]]

5

In [9]:
# Sparse user-to-item (u2i) matrix with binary values
# (1 if user u rated item i, 0 otherwise).
# Evaluating the object without densifying it displays only its summary repr
# (shape, dtype, number of stored elements, storage format).
dataset.graph_u2i

<160225x923 sparse matrix of type '<class 'numpy.float64'>'
	with 359052 stored elements in Compressed Sparse Row format>

In [15]:
# Collect, for each reviewer id, the list of star ratings that reviewer gave.
(
    dataset.metadata_df
    .groupby(dataset.METADATA_USER_ID)[dataset.METADATA_STAR_RATING]
    .apply(list)
)

Reviewer_id
923       [3, 5, 4, 4, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 4, ...
924                                                     [3]
925                                                  [4, 4]
926                                                     [4]
927                                         [4, 2, 3, 4, 5]
                                ...                        
161143                                                  [5]
161144                                                  [5]
161145                                                  [5]
161146                                                  [5]
161147                                                  [5]
Name: Rating, Length: 160225, dtype: object