In [None]:

import json
import gzip
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset


In [None]:

# Read records from a gzip-compressed JSON-lines file
def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip-compressed file.

  Args:
    path: location of the gzip file; every non-blank line must hold one JSON document.

  Yields:
    The object returned by json.loads for each line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  # The context manager closes the file as soon as the generator is exhausted,
  # closed or garbage-collected, instead of leaking the handle.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads.
      if l.strip():
        yield json.loads(l)
# parse() is a generator function, so this only creates the generator;
# the file is not read until `a` is iterated.
a=parse('AMAZON_FASHION_5.json.gz')

In [None]:
# Materialise the records into a list so they can be indexed and inspected repeatedly.
a = list(a)
# Show only the first few records: echoing the whole list would dump every
# review into the notebook output.
a[:5]

In [None]:
# Load the dataset into a DataFrame
def getDF(path):
  """Build a DataFrame with one row per record yielded by parse(path).

  Rows are labelled 0, 1, 2, ... in the order the records are read; the columns
  come from the keys of the records.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read every record of the gzip file into a DataFrame (one row per record).
df = getDF('AMAZON_FASHION_5.json.gz')

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

In [None]:
# Add a time-of-day to reviewTime so that the values become unique (the original
# reviewTime presumably holds a date only, so many rows share the same value).
# Rows are grouped by their reviewTime value (across all reviewers); each row is moved to
# 09:00 plus 7 seconds times its 1-based position inside its group.
# NOTE: this cell overwrites df['reviewTime'] in place, so running it a second time shifts
# the timestamps again -- re-run the loading cell first if this cell has to be re-executed.
df['reviewTime'] = pd.to_datetime(df['reviewTime'])
hrs = pd.Timedelta(hours=9)
# Named `secs` rather than `min` so the built-in min() is not shadowed for the rest of the notebook.
secs = pd.to_timedelta(df.groupby('reviewTime').cumcount().add(1).mul(7), unit='s')
df['reviewTime'] = df['reviewTime'] + hrs + secs

In [None]:
# Inspect the first 20 rows after the reviewTime adjustment.
df.head(20)

In [None]:
# Rank every reviewer's reviews from newest to oldest: within each reviewerID the row with the
# latest reviewTime gets rank 1. method='first' breaks ties by row order, so ranks are unique
# per reviewer. The rank is used below to split the data: rank 1 -> test, everything else -> train.
df['rank_latest'] = (
    df.groupby(['reviewerID'])['reviewTime']
      .rank(ascending=False, method='first')
)

In [None]:
# Inspect the first 20 rows, now including the rank_latest column.
df.head(20)

In [None]:
# Leave-one-out split: each reviewer's most recent review (rank_latest == 1) goes to the
# test set, every other review goes to the training set.
# Only the columns that are still needed are kept. The explicit .copy() makes both frames
# independent of df, so later in-place assignments (e.g. train_data.loc[:, 'overall'] = 1)
# neither raise SettingWithCopyWarning nor depend on pandas' view-vs-copy behaviour.
train_data = df.loc[df['rank_latest'] != 1, ['reviewerID', 'asin', 'overall']].copy()
test_data = df.loc[df['rank_latest'] == 1, ['reviewerID', 'asin', 'overall']].copy()

In [None]:
# Preview the first 20 rows of the training split.
train_data.head(20)

In [None]:
# Convert the explicit ratings into implicit feedback: every row of the training data gets
# 'overall' = 1, where 1 means "the user has interacted with this item".
# The whole column is overwritten, so the original rating values are no longer available
# in train_data after this cell.
train_data.loc[:, 'overall'] = 1

In [None]:
# Preview the training rows after 'overall' was overwritten with 1.
train_data.head(20)

**We also require negative samples to train our models, i.e. items that the user has not interacted with. We assume that the user is not interested in such items. For now we randomly sample 5 negative items for each positive user-item pair.**

In [None]:
# Get a list of all product IDs
all_pids = df['asin'].unique()

# Lists that will hold the training samples
users, items, labels = [], [], []

# Distinct (user, item) pairs found in the training data.
# The list keeps the pairs in DataFrame order: iterating the set directly would visit them in
# an arbitrary order (which, for string keys, changes between kernel sessions), so the sampled
# data could not be reproduced. The set is kept for fast membership tests.
positive_pairs = list(dict.fromkeys(zip(train_data['reviewerID'], train_data['asin'])))
user_item_set = set(positive_pairs)

# 5:1 ratio of negative to positive samples
num_negatives = 5

# Seeded generator so that "Restart & Run All" reproduces exactly the same negative samples
rng = np.random.default_rng(42)

# The rejection loop below never terminates for a user who has interacted with every
# product, so fail fast instead of hanging the kernel.
items_per_user = train_data.groupby('reviewerID')['asin'].nunique()
assert (items_per_user < len(all_pids)).all(), \
    "a user has interacted with every product; no negative item can be sampled"

for (u, i) in positive_pairs:
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = rng.choice(all_pids)
        # re-draw until the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = rng.choice(all_pids)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

In [None]:
# Peek at the first 9 generated samples. Each positive pair (label 1) is followed by its
# num_negatives sampled negatives (label 0), so this shows one full group and the start of the next.
print(users[:9])
print(items[:9])
print(labels[:9])

**The class below simply encapsulates the code we have written above into a PyTorch Dataset class.**

In [None]:
class AmazonTrainDataset(Dataset):
    """PyTorch Dataset of (user, item, label) training samples with sampled negatives.

    Every distinct (reviewerID, asin) pair in ``ratings`` becomes one positive sample
    (label 1) and is immediately followed by ``num_negatives`` negative samples (label 0):
    items drawn at random from ``all_pids`` that do not occur together with that user in
    ``ratings``.

    Args:
        ratings: DataFrame with 'reviewerID' and 'asin' columns.
        all_pids: array-like with all candidate item IDs.
        num_negatives: number of negatives sampled per positive pair (default 5).
        seed: optional seed. None (default) draws from NumPy's global random state;
            an integer uses a private RandomState so the samples are reproducible.

    Attributes:
        users, items, labels: 1-D tensors of equal length.
        user_to_index, item_to_index: None when the IDs are numeric (the tensors then hold
            the IDs themselves). Otherwise dicts mapping each ID to the integer index
            stored in the tensors, because torch tensors cannot hold strings.

    Raises:
        ValueError: if a user has interacted with every item in ``all_pids`` (no negative
            item exists for that user).
    """

    def __init__(self, ratings, all_pids, num_negatives=5, seed=None):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_pids, num_negatives=num_negatives, seed=seed)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_pids, num_negatives=5, seed=None):
        """Build the (users, items, labels) tensors; see the class docstring for details."""
        all_pids = np.asarray(all_pids)
        # Global NumPy random state by default; a private seeded one on request.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Distinct positive pairs in DataFrame order (set iteration order is arbitrary),
        # plus a set for O(1) membership tests.
        positives = list(dict.fromkeys(zip(ratings['reviewerID'], ratings['asin'])))
        user_item_set = set(positives)

        if num_negatives > 0:
            self._check_negatives_exist(positives, all_pids)

        users, items, labels = [], [], []
        for u, i in positives:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = rng.choice(all_pids)
                # Re-draw until the item is one the user has not interacted with.
                # (The retry used to reference the undefined name `all_movieIds`.)
                while (u, negative_item) in user_item_set:
                    negative_item = rng.choice(all_pids)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        self.user_to_index = self._index_map(ratings['reviewerID'])
        # Built from all_pids first so an item's index is its position in all_pids;
        # items that only occur in `ratings` are appended after them.
        self.item_to_index = self._index_map(all_pids.tolist() + list(ratings['asin']))

        return (self._as_tensor(users, self.user_to_index),
                self._as_tensor(items, self.item_to_index),
                torch.tensor(labels))

    @staticmethod
    def _check_negatives_exist(positives, all_pids):
        """Raise ValueError if some user has interacted with every item in all_pids."""
        catalogue = set(all_pids.tolist())
        seen = {}
        for u, i in positives:
            if i in catalogue:
                seen.setdefault(u, set()).add(i)
        for u, seen_items in seen.items():
            if len(seen_items) >= len(catalogue):
                # Rejection sampling would loop forever for this user.
                raise ValueError(
                    "user %r has interacted with every item in all_pids; "
                    "cannot sample a negative item" % (u,))

    @staticmethod
    def _index_map(ids):
        """Return None for purely numeric IDs, else a dict ID -> index in first-seen order."""
        ids = list(ids)
        if all(isinstance(v, (int, float, np.number, np.bool_)) for v in ids):
            return None
        return {value: position for position, value in enumerate(dict.fromkeys(ids))}

    @staticmethod
    def _as_tensor(values, index_map):
        """Convert IDs to a tensor, encoding them through index_map when one is given."""
        if index_map is None:
            return torch.tensor(values)
        return torch.tensor([index_map[v] for v in values], dtype=torch.long)