In [75]:
import ast
import glob
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [76]:
# Collect the dataset files. glob returns matches in a file-system dependent
# order, so sort them: the positional lookups that follow rely on
# 'Digital_Music.jsonl' coming before 'meta_Digital_Music.jsonl'.
meal_dfs = sorted(glob.glob("../../datasets/music/*"))
meal_dfs

['../../datasets/music\\Digital_Music.jsonl',
 '../../datasets/music\\meta_Digital_Music.jsonl']

In [77]:
# Index 0 is the reviews file and index 1 the item metadata file
# (per the listing displayed for meal_dfs).
file_path, meta_path = meal_dfs[0], meal_dfs[1]

In [78]:
# Peek at the reviews file: parse only its first records instead of the whole file.
# JSON text is UTF-8, so the encoding is passed explicitly rather than relying
# on the platform's default codec.
all_data = []
with open(file_path, 'r', encoding='utf-8') as fp:
    # enumerate replaces the hand-maintained counter; the loop still stops
    # after the 101st record has been appended, as before.
    for line_no, line in enumerate(fp, start=1):
        all_data.append(json.loads(line.strip()))
        if line_no > 100:
            break

In [79]:
# Load every review record (one JSON object per line) into a DataFrame.
with open(file_path, 'r', encoding='utf-8') as fp:
    # Iterate the file object instead of read().splitlines(): splitlines()
    # also breaks on characters such as U+2028 that may legally appear
    # unescaped inside a JSON string, which would cut a record in two.
    lines = [raw.rstrip('\n') for raw in fp]

# Skip blank lines rather than letting json.loads fail on them.
line_dicts = [json.loads(line) for line in lines if line.strip()]
df_final = pd.DataFrame(line_dicts)
df_final.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Nice,If i had a dollar for how many times I have pl...,[],B004RQ2IRG,B004RQ2IRG,AFUOYIZBU3MTBOLYKOJE5Z35MBDA,1618972613292,0,True
1,5.0,Excellent,awesome sound - cant wait to see them in perso...,[],B0026UZEI0,B0026UZEI0,AHGAOIZVODNHYMNCBV4DECZH42UQ,1308167525000,0,True


- asin : product_id 
- user_id : user_id 

## For sequential recommendation: 

The dataset should be in the format of:

$ [user, item, timestamp ] $ 

In [80]:
# Distinct rating values alongside how often each one occurs.
df_final['rating'].unique(), df_final['rating'].value_counts()

(array([5., 1., 3., 4., 2.]),
 rating
 5.0    100618
 4.0     14129
 3.0      6392
 1.0      6136
 2.0      3159
 Name: count, dtype: int64)

### Hence, let us order each user's items by rating, highest first

In [81]:
# Number of distinct asin values among the reviews.
df_final['asin'].nunique()

70519

In [82]:
# info() prints its report and returns None, so call it as a statement and
# leave only the distinct-user count as the cell's displayed value
# (a tuple would render as "(count, None)").
df_final.info()
df_final['user_id'].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130434 entries, 0 to 130433
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             130434 non-null  float64
 1   title              130434 non-null  object 
 2   text               130434 non-null  object 
 3   images             130434 non-null  object 
 4   asin               130434 non-null  object 
 5   parent_asin        130434 non-null  object 
 6   user_id            130434 non-null  object 
 7   timestamp          130434 non-null  int64  
 8   helpful_vote       130434 non-null  int64  
 9   verified_purchase  130434 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 9.1+ MB


(100952, None)

There are about 130k reviews in total from about 100k distinct users. Hence, let us keep only the handful of users who have reviewed and rated more than one product.

In [83]:
# Keep just the interaction columns: user, item (asin and parent_asin), rating and timestamp.
df = df_final.loc[:, ['user_id', 'asin', 'parent_asin', 'rating', 'timestamp']]
df.head(2)

Unnamed: 0,user_id,asin,parent_asin,rating,timestamp
0,AFUOYIZBU3MTBOLYKOJE5Z35MBDA,B004RQ2IRG,B004RQ2IRG,5.0,1618972613292
1,AHGAOIZVODNHYMNCBV4DECZH42UQ,B0026UZEI0,B0026UZEI0,5.0,1308167525000


In [84]:
# Number of reviews per user, most frequent users first.
df.user_id.value_counts()

user_id
AGAFM74L2RIJ5O36NNYH4Z5ISQNQ    341
AEDFM4VDH2MKYVBKGYVTU6R5L5FQ    182
AH3FC6V3IUJIN2Y7BCZ7DN3IMMJQ    175
AEFLICXXHRBMNT4HAZH2NUU6TCOA    136
AFMUUMXTKB6C52CCENIFBK4QLT4Q    130
                               ... 
AHJNH24BZHQB7PWPGPYRMYXOQ2SQ      1
AFZGUZQ3X3PIQGIC7MAVMSL7E4XA      1
AHH2FTUUYYGKMZEY5OH3IDPD23ZQ      1
AH5MDHJK7OCKI6TZIN5N5K7TCXQQ      1
AEFCHGMHFSZA4IWC5FWTBRPR25GQ      1
Name: count, Length: 100952, dtype: int64

In [85]:
# Group each user's rows together and, within a user, put the highest ratings first.
df_sorted = df.sort_values(['user_id', 'rating'], ascending=[True, False])
df_sorted.head(5)

Unnamed: 0,user_id,asin,parent_asin,rating,timestamp
72082,AE2225OKZMXZ4TTULR33R66IAU4A,B0000057L2,B0000057L2,5.0,1471009104000
42640,AE225NQR6TN6SUNMTF6O4T37TXPQ,B000HVV2N6,B000HVV2N6,5.0,1650665983971
1167,AE225Z2VRWT6GPTOMA4H4O3H2KVQ,B00PUTIJQI,B00PUTIJQI,5.0,1520039515541
87743,AE226RM2T7PZIFMLS4AYBU2DM7LA,B00FJNE6WG,B00FJNE6WG,5.0,1410617507000
80074,AE22CWLV6HOE3P57RFCL2BTD5NUQ,B00KNSEZ1E,B00KNSEZ1E,5.0,1579498312610


In [86]:
# One row per user: that user's asins (in df_sorted order) joined into a
# single comma-separated string.
result = df_sorted.groupby('user_id')['asin'].agg(','.join).reset_index()

In [87]:
# Inspect the joined asin string of user AGAFM74L2RIJ5O36NNYH4Z5ISQNQ.
result.loc[result['user_id'] == 'AGAFM74L2RIJ5O36NNYH4Z5ISQNQ', 'asin'].tolist()

['B000027NXM,B00004SNJB,B00004USJX,B01MCZ9Z6S,B0000257XU,B0000254LR,B000024OHW,B01L9AGU1C,B000M74XKO,B000027NGI,B00002MYLY,B000BK3YHU,B00004S3DC,B00MJ6ET0A,B0000260P0,B000M2BABO,B004W0H5S4,B00000ASA5,B000025YVO,B0002YW098,B000K6LOSQ,B003H1BNYQ,B0000264IM,B002Z5VD9U,B005SDWL1K,B00CTLHL9G,B000063XPF,B000025TP3,B000025U5A,B0000UM06W,B00R63OA9Q,B00007JR63,B000027Q53,B00IQUJMG4,B0000264I0,B000CEETHY,B000050RI7,B00007JQVK,B0000254I0,B000H72FD6,B00004R7V8,B0000254BV,B0000254AY,B000026367,B0000254L3,B0000254KP,B00004V516,B000026BEO,B00005G7S9,B005HT0KAY,B005LG2PZG,B0000245GN,B000MU2KNI,B000WTYQUO,B007S0GVBC,B000XPTBW0,B000FPHFJE,B00EBVFNCY,B002HPUTT8,B000001YNH,B000001YN8,B0009N5L5W,B000003574,B000024OKO,B007XWGPL6,B00002513J,B0015RAJ3U,B0017VTWSC,B002B3333A,B0002N6UHM,B0040ZYPKM,B004D647GK,B007MWFWP2,B007S9IWYW,B000TQB5HC,B006X6T03S,B0011ZLF1Q,B000II3376,B000M6UMVO,B000NYZTDQ,B000VLYLJ4,B004MA22NC,B0025WUKMY,B001P4KG04,B0002KBKLQ,B000FS4PP8,B0002IN1I8,B000PHCY20,B002RPJ7S2,B000TAZBEG,B000VLWJ

In [88]:
# Preview the first five per-user item strings.
result.head(5)

Unnamed: 0,user_id,asin
0,AE2225OKZMXZ4TTULR33R66IAU4A,B0000057L2
1,AE225NQR6TN6SUNMTF6O4T37TXPQ,B000HVV2N6
2,AE225Z2VRWT6GPTOMA4H4O3H2KVQ,B00PUTIJQI
3,AE226RM2T7PZIFMLS4AYBU2DM7LA,B00FJNE6WG
4,AE22CWLV6HOE3P57RFCL2BTD5NUQ,B00KNSEZ1E


In [89]:
# First ten interaction rows of user AGAFM74L2RIJ5O36NNYH4Z5ISQNQ, in df's row order.
df.loc[df['user_id'] == 'AGAFM74L2RIJ5O36NNYH4Z5ISQNQ'].head(10)

Unnamed: 0,user_id,asin,parent_asin,rating,timestamp
37951,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B000027B8A,B000027B8A,3.0,1641310072862
37952,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B0000262NE,B0000262NE,3.0,1553179841287
37953,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B0007MR0SG,B0007MR0SG,4.0,1549022911946
37954,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B000027NXM,B000027NXM,5.0,1547202919524
37955,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B00005EOZV,B00005EOZV,4.0,1518365906454
37956,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B000027B8D,B000027B8D,4.0,1518339037343
37957,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B00004SNJB,B00004SNJB,5.0,1517616749839
37958,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B000OH9BXQ,B000OH9BXQ,4.0,1511782444447
37959,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B000004CIE,B000004CIE,4.0,1508949174273
37960,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,B001MMPBS6,B001MMPBS6,3.0,1504515152896


In [90]:
# Count the ratings of every user and keep only users with more than 2 of them.
user_ratings_count = df.groupby('user_id').size().reset_index(name='rating_count')
users_with_more_than_2_ratings = user_ratings_count.loc[
    user_ratings_count['rating_count'] > 2, 'user_id'
]
print(f'Users with more than 2 ratings: \n {users_with_more_than_2_ratings.head(2)}\n')

filtered_df = df[df['user_id'].isin(users_with_more_than_2_ratings)]
print(f'Filtered df with users having more than 2 ratings: \n{filtered_df.head(2)}')

# Rows per (user, asin) pair; summing them per user gives that user's total
# number of rows in filtered_df.
asin_count_per_user = filtered_df.groupby(['user_id', 'asin']).size().reset_index(name='asin_count')
user_similar_asin_count = asin_count_per_user.groupby('user_id')['asin_count'].sum().reset_index(name='total_asin_count')

# Most active users first. A stable sort keeps users with equal counts in
# their incoming (user_id) order, so the ranking is reproducible between runs.
sorted_users = user_similar_asin_count.sort_values(by='total_asin_count', ascending=False, kind='stable')
sorted_users

Users with more than 2 ratigns: 
 14    AE22MFO6GCUJLXCWVQDLZDMWXQDA
58    AE24H5QMMVF5762UWO7BUT2NEWJA
Name: user_id, dtype: object

Filtered df with users having more than 2 ratings: 
                        user_id        asin parent_asin  rating      timestamp
7  AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ  B08L7CJ6NM  B08L7CJ6NM     5.0  1641330592178
8  AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ  B092C24W4G  B092C24W4G     5.0  1641224888411


Unnamed: 0,user_id,total_asin_count
2406,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,341
327,AEDFM4VDH2MKYVBKGYVTU6R5L5FQ,182
3354,AH3FC6V3IUJIN2Y7BCZ7DN3IMMJQ,175
399,AEFLICXXHRBMNT4HAZH2NUU6TCOA,136
1736,AFMUUMXTKB6C52CCENIFBK4QLT4Q,130
...,...,...
2065,AFWISMRHTQJ35HX4SAVSGR7XTTWA,3
2066,AFWJ2MO33GQ6QDWST2SQH6SAYW5A,3
2067,AFWJFLGHIRDKCRPCETEIPPZVMPFA,3
2068,AFWMGQMC6W5ZINFZFMPRIYXJWLXQ,3


In [91]:
# Attach each retained user's joined asin string via a merge on user_id,
# then keep only the user and item-string columns.
DF = pd.merge(sorted_users, result, on="user_id").loc[:, ['user_id', 'asin']]
DF.head(5)

Unnamed: 0,user_id,asin
0,AGAFM74L2RIJ5O36NNYH4Z5ISQNQ,"B000027NXM,B00004SNJB,B00004USJX,B01MCZ9Z6S,B0..."
1,AEDFM4VDH2MKYVBKGYVTU6R5L5FQ,"B000008DB3,B000008DB3,B0010A24P8,B00NB367GE,B0..."
2,AH3FC6V3IUJIN2Y7BCZ7DN3IMMJQ,"B00AZI24YM,B00NF3CABQ,B01K8KLNFC,B00LMH72UG,B0..."
3,AEFLICXXHRBMNT4HAZH2NUU6TCOA,"B00147J31U,B000UFSVOM,B00CTTGPXG,B0000278F5,B0..."
4,AFMUUMXTKB6C52CCENIFBK4QLT4Q,"B006DI0M3I,B004E2DOUI,B00008PZV6,B011BT9996,B0..."


In [92]:
# Plain Python list of the retained user ids.
main_users = DF['user_id'].tolist()

In [98]:
# Interactions of the retained users only. Rows and columns are selected in a
# single .loc call and copied explicitly, so that adding columns to df_recbole
# afterwards does not operate on a slice of df (SettingWithCopyWarning).
df_recbole = df.loc[df['user_id'].isin(main_users), ['user_id', 'asin', 'timestamp', 'rating']].copy()
df_recbole.shape, df.shape

((25542, 4), (130434, 5))

In [99]:
# Show the filtered interaction table.
df_recbole

Unnamed: 0,user_id,asin,timestamp,rating
7,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B08L7CJ6NM,1641330592178,5.0
8,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B092C24W4G,1641224888411,5.0
9,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B01M32UHXG,1566575806068,5.0
10,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B002Q2KG1I,1493417280000,5.0
11,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B003NXEPY8,1461286805000,5.0
...,...,...,...,...
130333,AFMZVBFOI5STU3YCLDKLKDKLIIXQ,B01EKROZ54,1484319203000,5.0
130334,AFMZVBFOI5STU3YCLDKLKDKLIIXQ,B01D2CQIH6,1462407167000,5.0
130345,AEGGQYHJQPN3STMZMV7EINJYCDOA,B06WRQ1H9C,1566681798118,4.0
130346,AEGGQYHJQPN3STMZMV7EINJYCDOA,B074S6RVC7,1566362521851,5.0


In [100]:
# Replace the string ids with contiguous integer indices, numbered by order of
# first appearance in df_recbole.
user_ids = {user_id: idx for idx, user_id in enumerate(df_recbole['user_id'].unique())}
item_ids = {asin: idx for idx, asin in enumerate(df_recbole['asin'].unique())}

# assign() returns a new frame instead of writing columns into df_recbole in
# place, which avoids SettingWithCopyWarning when df_recbole is a slice of df
# and keeps the cell safe to re-run.
df_recbole = df_recbole.assign(
    user_id_token=df_recbole['user_id'].map(user_ids),
    item_id=df_recbole['asin'].map(item_ids),
)
df_recbole.head()

Unnamed: 0,user_id,asin,timestamp,rating,user_id_token,item_id
7,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B08L7CJ6NM,1641330592178,5.0,0,0
8,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B092C24W4G,1641224888411,5.0,0,1
9,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B01M32UHXG,1566575806068,5.0,0,2
10,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B002Q2KG1I,1493417280000,5.0,0,3
11,AHB5CGLYN3Y6NIPHNQLYFJT2W2PQ,B003NXEPY8,1461286805000,5.0,0,4


In [104]:
# Drop the raw string id columns, leaving the integer index columns.
# The drop returns a new frame and uses errors='ignore', so running the cell
# a second time (when the columns are already gone) does not raise.
df_recbole = df_recbole.drop(columns=['user_id', 'asin'], errors='ignore')
df_recbole.head(2)

Unnamed: 0,timestamp,rating,user_id_token,item_id
7,1641330592178,5.0,0,0
8,1641224888411,5.0,0,1


In [105]:
# Rename the columns to the "name:type" header form
# (presumably RecBole's atomic-file convention, given the frame's name).
df_recbole = df_recbole.rename(
    columns={
        'user_id_token': 'user_id:token',
        'item_id': 'item_id:token',
        'rating': 'rating:float',
        'timestamp': 'timestamp:float',
    }
)
df_recbole.head(5)

Unnamed: 0,timestamp:float,rating:float,user_id:token,item_id:token
7,1641330592178,5.0,0,0
8,1641224888411,5.0,0,1
9,1566575806068,5.0,0,2
10,1493417280000,5.0,0,3
11,1461286805000,5.0,0,4


* Rearranging the columns of the dataframe 

In [113]:
# Current column order, listed before rearranging.
cols = list(df_recbole.columns)
cols

['timestamp:float', 'rating:float', 'user_id:token', 'item_id:token']

In [114]:
# Put the columns in user, item, rating, timestamp order.
df_recbole = df_recbole.loc[:, ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float']]
df_recbole.head(5)

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
7,0,0,5.0,1641330592178
8,0,1,5.0,1641224888411
9,0,2,5.0,1566575806068
10,0,3,5.0,1493417280000
11,0,4,5.0,1461286805000


In [115]:
# Write the interactions as a tab-separated file without the index column.
# The target directory is created first so that to_csv does not fail when
# 'amazon/' is missing.
inter_path = Path('amazon/amazon.inter')
inter_path.parent.mkdir(parents=True, exist_ok=True)
df_recbole.to_csv(inter_path, sep='\t', index=False)