In [34]:
import os
import gzip
import json
import pandas as pd
import collections

In [2]:
# Pretty-print helper: dicts are rendered as indented JSON, every other object
# (e.g. a DataFrame) is handed to `display`.
def pprint(x):
    """Pretty-print ``x``.

    Args:
        x: Object to show. A ``dict`` is printed as JSON with a 2-space
            indent; anything else is passed to ``display``.

    Returns:
        ``None`` for dicts (the result of ``print``); otherwise whatever
        ``display`` returns.

    Raises:
        TypeError: if a dict contains values ``json.dumps`` cannot serialise.
    """
    if isinstance(x, dict):
        return print(json.dumps(x, indent=2))
    # `display` is not imported in this notebook; presumably the built-in that
    # IPython/Jupyter kernels inject into the namespace.
    return display(x)

## Loading in Data

In [3]:
# Empty containers for the parsed review records and item-metadata records.
reviews, items = [], []

In [4]:
file = "data/All_Beauty.jsonl.gz"

# Build the list from scratch so that re-running this cell does not append a
# second copy of every review to an already-populated list.
# The encoding is given explicitly because text mode otherwise falls back to
# the platform's locale encoding.
with gzip.open(file, 'rt', encoding='utf-8') as fp:
    # One JSON document per line; blank lines are skipped because json.loads
    # would raise on an empty string.
    reviews = [json.loads(line) for line in map(str.strip, fp) if line]


In [5]:
file = "data/meta_All_Beauty.jsonl.gz"

# Build the list from scratch so that re-running this cell does not append a
# second copy of every item to an already-populated list.
# The encoding is given explicitly because text mode otherwise falls back to
# the platform's locale encoding.
with gzip.open(file, 'rt', encoding='utf-8') as fp:
    # One JSON document per line; blank lines are skipped because json.loads
    # would raise on an empty string.
    items = [json.loads(line) for line in map(str.strip, fp) if line]

## Exploratory Analysis

In [9]:
# Read the gzip-compressed JSON Lines review file into a DataFrame
# (one JSON object per line -> one row).
reviews_df = pd.read_json(
    "data/All_Beauty.jsonl.gz",
    lines=True,
    compression='gzip',
)

In [12]:
# Read the gzip-compressed JSON Lines item-metadata file into a DataFrame
# (one JSON object per line -> one row).
items_df = pd.read_json(
    "data/meta_All_Beauty.jsonl.gz",
    lines=True,
    compression='gzip',
)

#### Display basic summary stats of the data

In [None]:
def _print_summary(df):
    """Print the info report, descriptive statistics, first rows and shape of ``df``.

    Args:
        df: The pandas DataFrame to summarise.
    """
    print("############################## DATA INFO ############################## ")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() only appended a stray "None" line to the output.
    df.info()                     # DataFrame info

    print("############################## DATA DESCRIBE ############################## ")
    print(df.describe())          # Descriptive statistics

    print("############################## DATA HEAD ############################## ")
    print(df.head())              # First 5 rows

    print("############################## DATA SHAPE ############################## ")
    print(df.shape)               # Shape of DataFrame


# reviews summary stats
_print_summary(reviews_df)

# items summary stats
_print_summary(items_df)

############################## DATA INFO ############################## 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701528 entries, 0 to 701527
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   rating             701528 non-null  int64         
 1   title              701528 non-null  object        
 2   text               701528 non-null  object        
 3   images             701528 non-null  object        
 4   asin               701528 non-null  object        
 5   parent_asin        701528 non-null  object        
 6   user_id            701528 non-null  object        
 7   timestamp          701528 non-null  datetime64[ns]
 8   helpful_vote       701528 non-null  int64         
 9   verified_purchase  701528 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 48.8+ MB
None
############################## DATA DESCRIBE ###############

## Pre-Processing Dataset

In [27]:
# only keep verified purchases
# `verified_purchase` is a boolean flag, so it is tested directly rather than
# compared to True.
# NOTE(review): only the `reviews` list is filtered here; in the cells visible
# in this notebook `reviews_df` still contains unverified rows -- confirm that
# is intended.
reviews = [row for row in reviews if row['verified_purchase']]

In [None]:
# Missing-value counts per column, items first and then reviews.
print("######## NaNs ######## ")
print("Items: ")
item_nan_counts = items_df.isna().sum()
print(item_nan_counts)
print("\nReviews: ")
review_nan_counts = reviews_df.isna().sum()
print(review_nan_counts)

# Number of review rows that repeat an earlier row's (asin, user_id, timestamp).
print("######## Duplicates ########")
duplicate_key = ['asin', 'user_id', 'timestamp']
n_duplicates = reviews_df.duplicated(subset=duplicate_key).sum()
print(n_duplicates)

######## NaNs ######## 
Items: 
main_category           0
title                   0
average_rating          0
rating_number           0
features                0
description             0
price               94886
images                  0
videos                  0
store               11331
categories              0
details                 0
parent_asin             0
bought_together    112590
dtype: int64

Reviews: 
rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64
########Duplicates ########
7276


#### K-core Filtering (k=5)
- Retain users with ≥5 reviews and items with ≥5 reviews.
- Remove duplicate reviews, keeping the earliest for each `{user_id, parent_asin}` pair.

In [None]:
# Helper Functions

def load_ratings(rev):
    """Turn raw review records into interaction tuples.

    Args:
        rev: Iterable of mappings providing the keys ``'parent_asin'``,
            ``'user_id'``, ``'rating'`` and ``'timestamp'``.

    Returns:
        A list, in input order, of
        ``(user_id, parent_asin, float(rating), int(timestamp))`` tuples.
    """
    fields = ('parent_asin', 'user_id', 'rating', 'timestamp')

    def to_interaction(record):
        # Read the four fields, then reorder so the user comes first.
        item, user, rating, stamp = (record[name] for name in fields)
        return (user, item, float(rating), int(stamp))

    return [to_interaction(record) for record in rev]

def get_user2count(inters):
    """Count interactions per user.

    Args:
        inters: Iterable of indexable records whose element 0 is the user.

    Returns:
        ``collections.defaultdict(int)`` mapping each user to the number of
        records it appears in.
    """
    counts = collections.defaultdict(int)
    for record in inters:
        user = record[0]
        counts[user] = counts[user] + 1
    return counts


def get_item2count(inters):
    """Count interactions per item.

    Args:
        inters: Iterable of indexable records whose element 1 is the item.

    Returns:
        ``collections.defaultdict(int)`` mapping each item to the number of
        records it appears in.
    """
    counts = collections.defaultdict(int)
    for record in inters:
        item = record[1]
        counts[item] = counts[item] + 1
    return counts


def generate_candidates(unit2count, threshold):
    """Select the units whose count reaches ``threshold``.

    Args:
        unit2count: Mapping from a unit (user or item) to its count.
        threshold: Minimum count (inclusive) a unit needs to be kept.

    Returns:
        A ``(kept, n_dropped)`` tuple: the set of units with
        ``count >= threshold`` and the number of units that fell short.
    """
    kept = {unit for unit, count in unit2count.items() if count >= threshold}
    n_dropped = len(unit2count) - len(kept)
    return kept, n_dropped

In [47]:
# Sort each user's interactions chronologically and drop repeated reviews of the same item

def make_inters_in_order(inters):
    """Order each user's interactions by timestamp and drop repeated items.

    Args:
        inters: Iterable of ``(user, item, rating, timestamp)`` 4-tuples.

    Returns:
        A list of ``(user, item, rating, timestamp)`` tuples. Users appear in
        the order they are first seen in ``inters``; within a user the records
        are sorted by ascending timestamp, and only the earliest record is
        kept for each item.
    """
    per_user = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        per_user[user].append((user, item, rating, timestamp))

    ordered = []
    for history in per_user.values():
        seen_items = set()
        # sorted() is stable, so records with equal timestamps keep input order.
        for record in sorted(history, key=lambda rec: rec[3]):
            item = record[1]
            if item not in seen_items:
                seen_items.add(item)
                ordered.append(record)
    return ordered

# Iterative k-core filtering (thresholds are parameters; this notebook calls it with k=5)
def filter_inters(inters, user_k_core_threshold=0, item_k_core_threshold=0):
    """Repeatedly drop users and items with too few interactions.

    Args:
        inters: Records whose element 0 is the user and element 1 the item.
        user_k_core_threshold: Minimum number of interactions a user must have.
        item_k_core_threshold: Minimum number of interactions an item must have.

    Returns:
        ``inters`` itself when both thresholds are falsy; otherwise the list of
        records left once a pass finds no user and no item below its threshold.
        Progress is printed after every filtering pass.
    """
    # No threshold requested: nothing to filter.
    if not (user_k_core_threshold or item_k_core_threshold):
        return inters

    print('\nFiltering by k-core:')
    user2count = get_user2count(inters)
    item2count = get_item2count(inters)
    epoch = 0
    while True:
        users, n_filtered_users = generate_candidates(
            user2count, user_k_core_threshold)
        items, n_filtered_items = generate_candidates(
            item2count, item_k_core_threshold)
        # Fixed point: every remaining user and item meets its threshold.
        if n_filtered_users == 0 and n_filtered_items == 0:
            return inters

        # Removing a record can push its user/item below the threshold, so the
        # counts are recomputed from the survivors and checked again.
        inters = [unit for unit in inters
                  if unit[0] in users and unit[1] in items]
        user2count = get_user2count(inters)
        item2count = get_item2count(inters)
        epoch += 1
        print('    Epoch %d The number of inters: %d, users: %d, items: %d'
                % (epoch, len(inters), len(user2count), len(item2count)))


In [48]:
# Preprocessing step

def preprocess_rating(inters):
    """Run the rating pre-processing pipeline on raw review records.

    Steps: convert the records to tuples with ``load_ratings``, order and
    de-duplicate them with ``make_inters_in_order``, then apply
    ``filter_inters`` with both thresholds set to 5.

    Args:
        inters: Raw review records, as accepted by ``load_ratings``.

    Returns:
        ``(kcore_inters, ordered_inters)``: the interactions left after
        filtering, and the ordered, de-duplicated interactions before
        filtering. Both are lists of ``(user_ID, item_ID, rating, timestamp)``.
    """
    print('Process rating data: ')
    print(' Dataset: reviews')

    # Raw records -> tuples -> per-user chronological order without repeats.
    ordered_inters = make_inters_in_order(load_ratings(inters))

    print('The number of raw inters: ', len(ordered_inters))
    kcore_inters = filter_inters(
        ordered_inters,
        user_k_core_threshold=5,
        item_k_core_threshold=5,
    )
    return kcore_inters, ordered_inters

In [49]:
# preprocess_rating returns (k-core filtered, de-duplicated-only) interactions.
# The second value gets a real name rather than `_`, which IPython uses for
# the previous cell's output.
preprocessed_inters, deduped_inters = preprocess_rating(reviews)

Process rating data: 
 Dataset: reviews


KeyError: 'sortTimestamp'

## Purchasing Prediction
In this project, I will develop several baseline models and iteratively improve upon them using advanced techniques discussed in class, as well as deep learning methods. These models will include Most Popular, item-based collaborative filtering, Bayesian Personalized Ranking (BPR), and a deep learning model implemented with PyTorch.



#### Most Popular

In [None]:
# necessities (for preprocessing):
# 