In [None]:
# !pip install spacy
!pip install gym
!pip install copy
!pip install pylab
!pip install random
!pip install os
!pip install gzip

In [None]:
# Import statements
import json
import pandas as pd
import spacy
import numpy as np
import gym
import tensorflow as tf
import copy
import random
import pylab
import os
import gzip
from urllib.request import urlopen
from collections import deque
from keras import layers, models
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense

In [None]:
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/AMAZON_FASHION.json.gz
# Load JSON data
data = []
with gzip.open('AMAZON_FASHION.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
# Let's take a peek at the first row and the total number of rows
print(len(data))
print(data[0])

In [None]:
# Tabulate the raw records and keep only the columns used downstream
df = pd.DataFrame(data)[
    ['overall', 'verified', 'reviewerID', 'asin', 'style',
     'reviewerName', 'reviewText', 'summary', 'reviewTime']
]
# Retain verified reviews that carry an overall rating
is_verified = df['verified'] == True
has_rating = df['overall'].notna()
filtered_df = df[is_verified & has_rating]

In [None]:
# Plain attribute containers; their fields are attached dynamically by later cells.
class FashionProduct:
    """State object for a product as seen through its reviews."""


class Reviewer:
    """Container for a reviewer's id and the products associated with them."""
# Collect, per reviewer, the distinct products they reviewed and keep only
# reviewers with more than ten of them
reviewers = {}
grouped_df_reviwerId = filtered_df.groupby('reviewerID')
for reviewerId, group in grouped_df_reviwerId:
    products = group['asin'].dropna().unique()
    if len(products) <= 10:
        continue
    reviewer = Reviewer()
    reviewer.reviewerId, reviewer.products = reviewerId, products
    reviewers[reviewerId] = reviewer

In [None]:
# Filter dataset to include only reviewers with more than ten products.
# `reviewers` is keyed by reviewerID, so its keys are exactly the IDs to keep
# (the previously referenced `reviewer_values` was never defined).
# `.copy()` detaches the result from the parent frame so that later column
# assignments operate on an independent DataFrame.
filtered_df = filtered_df[filtered_df['reviewerID'].isin(list(reviewers))].copy()

In [None]:
# Parse review dates and order the reviews chronologically, then group by
# product ASIN, reviewerID and reviewTime.
# `sort_values` returns a new frame, so its result must be assigned; a stable
# sort keeps the existing relative order of reviews that share a date.
# `sort=False` makes the groupby yield groups in order of first appearance,
# i.e. in the chronological order established here.
filtered_df = (
    filtered_df
    .assign(reviewTime=pd.to_datetime(filtered_df['reviewTime']))
    .sort_values('reviewTime', kind='mergesort')
)
grouped_df = filtered_df.groupby(['asin', 'reviewerID', 'reviewTime'], sort=False)

In [None]:
# Load spaCy's `en_core_web_sm` pipeline; `nlp` is used below to parse review
# text into noun chunks whose tokens carry part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

# Keep only the noun-like tokens of a parsed span
def extract_nouns(doc):
    """Return the text of every NOUN / PROPN token in ``doc``, space-separated.

    ``doc`` is any iterable of tokens exposing ``pos_`` and ``text``.
    An input without matching tokens yields an empty string.
    """
    noun_tags = ("NOUN", "PROPN")
    kept = []
    for token in doc:
        if token.pos_ in noun_tags:
            kept.append(token.text)
    return " ".join(kept)

# Build one state per (product, reviewer) pair.
#   states   : (asin, reviewerID) -> FashionProduct state for that pair
#   products : asin -> the first FashionProduct created for that asin; it also
#              carries the running per-product accumulators
#              (reviewers, sizes, colors, reviews, rating)
states = {}
products = {}
# asin -> the (at most) two most recently processed reviewers, oldest first.
# Tracked separately because a set has no defined order, so "the last two
# elements of a set" would be an arbitrary pair.
recent_reviewers = {}

# Iterate over each (product, reviewer, time) group in `grouped_df` order
for (product_asin, reviewerId, reviewTime), group in grouped_df:
    # Only the first group seen for a (product, reviewer) pair defines its state
    if (product_asin, reviewerId) in states:
        continue

    product = FashionProduct()
    product.product_asin = product_asin
    product.reviewerId = reviewerId
    product.time = reviewTime

    if product_asin not in products:
        # The first state of a product doubles as that product's accumulator
        product.reviewers = set()
        product.sizes = set()
        product.colors = set()
        product.reviews = set()
        product.rating = []
        products[product_asin] = product
    accumulator = products[product_asin]

    # Every state of a product shares the same reviewer set object
    product.reviewers = accumulator.reviewers

    # Extract size and color metadata from the `style` dict column
    styles = group.loc[group['style'].notna(), 'style']
    sizes = styles.apply(
        lambda x: x.get("Size:", "") if "Size:" in x else x.get("Size Name:", "")
    ).unique().tolist()
    colors = styles.apply(lambda x: x.get("Color:", "")).unique().tolist()
    accumulator.sizes.update(sizes)
    accumulator.colors.update(colors)

    # Extract noun metadata from the review text: one noun string per review
    review_texts = group.loc[group['reviewText'].notna(), 'reviewText']
    review_nouns = review_texts.apply(
        lambda text: " ".join(
            extract_nouns(chunk) for chunk in nlp(text).noun_chunks
        ).strip()
    ).unique().tolist()
    # Add whole noun strings: calling set.update() with a single joined string
    # would insert its individual characters instead
    accumulator.reviews.update(review_nouns)

    # Use RMS instead of the mean of the ratings accumulated so far, to give
    # slightly higher weight to good reviews
    accumulator.rating.extend(group.loc[group['overall'] > 0, 'overall'].tolist())
    product.ratings = np.sqrt(np.mean([r ** 2 for r in accumulator.rating]))

    # Combine the accumulated product metadata; sets are sorted so the text is
    # reproducible across runs, and split/join collapses repeated whitespace
    combined = (
        sorted(accumulator.reviews)
        + sorted(accumulator.sizes)
        + sorted(accumulator.colors)
    )
    product.metadata = " ".join(" ".join(combined).split())

    # Append the metadata of the last two reviewers' states for this product.
    # Only two are used because large metadata causes memory issues.
    recent = recent_reviewers.setdefault(product_asin, deque(maxlen=2))
    for reviewer in recent:
        product.metadata += " " + states[(product_asin, reviewer)].metadata

    # Record this reviewer as having reviewed the product
    accumulator.reviewers.add(reviewerId)
    recent.append(reviewerId)

    states[(product_asin, reviewerId)] = product

states_list = list(states.values())


In [None]:
# NOTE: adding the metadata of a product's previous reviewers to the current
# state is already done inside the state-building loop of the previous cell.
# It is deliberately not executed again here: at top level it would act only on
# the variables leaked from that loop (`product`, `product_asin`) and append
# duplicate metadata to the final state, growing further on every re-run.


# Create per-user records and enhance each state's metadata with the metadata
# of the (at most) two products the same reviewer reviewed immediately before
# it, following the order of `states_list`.
users = {}
# Snapshot every state's metadata before enhancement and append from the
# snapshot, so added text does not compound from one state to the next.
# NOTE(review): re-running this cell appends again; rebuild the states first.
metadata_before = {(s.product_asin, s.reviewerId): s.metadata for s in states_list}
for state in states_list:
    if state.reviewerId not in users:
        users[state.reviewerId] = Reviewer()
        users[state.reviewerId].products = set()
        # Ordered companion to `products`: a set has no notion of "last two"
        users[state.reviewerId].product_history = []
    user = users[state.reviewerId]
    for prod1 in user.product_history[-2:]:
        # Separate the appended text with a space so words do not run together
        state.metadata += " " + metadata_before[(prod1, state.reviewerId)]
    # Record the current product so later states of this user can refer to it
    user.products.add(state.product_asin)
    user.product_history.append(state.product_asin)

In [None]:
# Drop every state whose metadata is empty or whitespace-only
states_list = list(filter(lambda s: s.metadata.strip() != '', states_list))