In [None]:
import os
import gzip
import json
import pandas as pd
import random
import collections
from tqdm import tqdm

In [25]:
# Pretty-printer for notebook use: JSON text for dicts, rich display for everything else.
def pprint(x):
    """Print `x` as 2-space-indented JSON when it is a dict, otherwise hand it to `display`."""
    if not isinstance(x, dict):
        # `display` is not imported in this notebook; it is expected to be the
        # builtin that IPython/Jupyter kernels inject.
        return display(x)
    return print(json.dumps(x, indent=2))

## Loading in Data

In [3]:
# Containers for the parsed review records and item-metadata records loaded below.
reviews, items = [], []

In [4]:
file = "data/Software.jsonl.gz"

# Build the list in a single assignment rather than appending to a list that
# was created in a different cell: re-running this cell then yields the same
# `reviews` instead of a second copy of every record.
# The encoding is stated explicitly so parsing does not depend on the
# platform's default text encoding.
with gzip.open(file, 'rt', encoding='utf-8') as fp:
    # One JSON object per line; whitespace-only lines carry no record and
    # would make json.loads raise, so they are skipped.
    reviews = [json.loads(line) for line in fp if line.strip()]


In [5]:
file = "data/meta_Software.jsonl.gz"

# Build the list in a single assignment rather than appending to a list that
# was created in a different cell: re-running this cell then yields the same
# `items` instead of a second copy of every record.
# The encoding is stated explicitly so parsing does not depend on the
# platform's default text encoding.
with gzip.open(file, 'rt', encoding='utf-8') as fp:
    # One JSON object per line; whitespace-only lines carry no record and
    # would make json.loads raise, so they are skipped.
    items = [json.loads(line) for line in fp if line.strip()]

## Exploratory Analysis

In [5]:
# Review records as a DataFrame, read from the gzip-compressed JSON Lines file.
reviews_df = pd.read_json(
    "data/Software.jsonl.gz",
    compression='gzip',
    lines=True,
)

In [6]:
# Item metadata as a DataFrame, read from the gzip-compressed JSON Lines file.
items_df = pd.read_json(
    "data/meta_Software.jsonl.gz",
    compression='gzip',
    lines=True,
)

#### Display basic summary stats of the data

In [7]:
def _print_summary(df):
    """Print the info report, descriptive statistics, first rows, and shape of `df`."""
    print("############################## DATA INFO ############################## ")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    df.info()                     # DataFrame info

    print("############################## DATA DESCRIBE ############################## ")
    print(df.describe())          # Descriptive statistics

    print("############################## DATA HEAD ############################## ")
    print(df.head())              # First 5 rows

    print("############################## DATA SHAPE ############################## ")
    print(df.shape)               # Shape of DataFrame


# reviews summary stats
_print_summary(reviews_df)

# items summary stats
_print_summary(items_df)

############################## DATA INFO ############################## 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4880181 entries, 0 to 4880180
Data columns (total 10 columns):
 #   Column             Dtype         
---  ------             -----         
 0   rating             int64         
 1   title              object        
 2   text               object        
 3   images             object        
 4   asin               object        
 5   parent_asin        object        
 6   user_id            object        
 7   timestamp          datetime64[ns]
 8   helpful_vote       int64         
 9   verified_purchase  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(6)
memory usage: 339.7+ MB
None
############################## DATA DESCRIBE ############################## 
             rating                      timestamp  helpful_vote
count  4.880181e+06                        4880181  4.880181e+06
mean   3.935087e+00  2016-08-11 20:56:15.320964864  4.

## Pre-Processing Dataset

In [None]:
# Per-column count of missing values, items first and then reviews.
print("######## NaNs ######## ")
print("Items: ", items_df.isnull().sum(), sep="\n")
print("\nReviews: ", reviews_df.isnull().sum(), sep="\n")

# Number of review rows flagged as duplicates on (asin, user_id, timestamp).
print("######## Duplicates ########")
print(reviews_df.duplicated(['asin', 'user_id', 'timestamp']).sum())

######## NaNs ######## 
Items: 
main_category           0
title                   0
average_rating          0
rating_number           0
features                0
description             0
price               94886
images                  0
videos                  0
store               11331
categories              0
details                 0
parent_asin             0
bought_together    112590
dtype: int64

Reviews: 
rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64
######## Duplicates ########
7276


701528

#### K-core Filtering (k=5)
- Retain users with ≥5 reviews and items with ≥5 reviews.
- Remove duplicate reviews, keeping the earliest review for each `(user_id, parent_asin)` pair.

In [6]:
# Helper Functions

def load_ratings(rev):
    """Turn raw review records into (user, item, rating, timestamp) tuples.

    Each element of `rev` is read through the keys 'parent_asin', 'user_id',
    'rating' and 'timestamp'. The rating is cast to float and the timestamp
    to int; the result keeps the input order.
    """
    def to_inter(record):
        item, user = record['parent_asin'], record['user_id']
        rating, time = record['rating'], record['timestamp']
        return (user, item, float(rating), int(time))

    return [to_inter(record) for record in rev]

def get_user2count(inters):
    """Count interactions per user, where the user is element 0 of each interaction.

    Returns a defaultdict(int) mapping user -> number of interactions.
    """
    counts = collections.defaultdict(int)
    for user in (inter[0] for inter in inters):
        counts[user] = counts[user] + 1
    return counts


def get_item2count(inters):
    """Count interactions per item, where the item is element 1 of each interaction.

    Returns a defaultdict(int) mapping item -> number of interactions.
    """
    counts = collections.defaultdict(int)
    for item in (inter[1] for inter in inters):
        counts[item] = counts[item] + 1
    return counts


def generate_candidates(unit2count, threshold):
    """Select the keys of `unit2count` whose count is at least `threshold`.

    Returns a pair: the set of kept keys, and how many keys were left out.
    """
    kept = {unit for unit, count in unit2count.items() if count >= threshold}
    n_dropped = len(unit2count) - len(kept)
    return kept, n_dropped

In [7]:
# Make the interactions in order and remove duplicate reviews

def make_inters_in_order(inters):
    """Return interactions as a flat list, time-ordered within each user and de-duplicated.

    Interactions are grouped by user (users appear in order of first
    occurrence), each group is sorted by timestamp (element 3), and only the
    earliest interaction per (user, item) is kept.
    """
    per_user = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        per_user[user].append((user, item, rating, timestamp))

    ordered = []
    for history in per_user.values():
        seen_items = set()
        for inter in sorted(history, key=lambda d: d[3]):
            item = inter[1]
            if item not in seen_items:
                seen_items.add(item)
                ordered.append(inter)
    return ordered

# filter by k-core (5 in this case)
def filter_inters(inters, user_k_core_threshold=0, item_k_core_threshold=0):
    """Repeatedly drop interactions of under-threshold users/items until none remain.

    When both thresholds are falsy the input is returned untouched. Otherwise
    each pass keeps only interactions whose user and item both meet their
    threshold, recounts, prints a progress line, and stops as soon as a pass
    would remove no user and no item.

    Returns the surviving list of interactions.
    """
    if not (user_k_core_threshold or item_k_core_threshold):
        return inters

    print('\nFiltering by k-core:')
    user2count = get_user2count(inters)
    item2count = get_item2count(inters)
    epoch = 0
    while True:
        users, n_filtered_users = generate_candidates(
            user2count, user_k_core_threshold)
        items, n_filtered_items = generate_candidates(
            item2count, item_k_core_threshold)
        # Fixed point reached: every remaining user and item meets its threshold.
        if not (n_filtered_users or n_filtered_items):
            return inters

        inters = [unit for unit in inters
                  if unit[0] in users and unit[1] in items]
        # Removing interactions can push other users/items under the
        # threshold, so the counts are rebuilt from the survivors.
        user2count = get_user2count(inters)
        item2count = get_item2count(inters)
        epoch += 1
        print('    Epoch %d The number of inters: %d, users: %d, items: %d'
              % (epoch, len(inters), len(user2count), len(item2count)))


In [8]:
# Preprocessing step

def preprocess_rating(inters, user_k_core_threshold=5, item_k_core_threshold=5):
    """Turn raw review records into de-duplicated, k-core-filtered interactions.

    Parameters:
        inters: iterable of raw review records, passed to `load_ratings`.
        user_k_core_threshold (int): minimum interactions per user (default 5).
        item_k_core_threshold (int): minimum interactions per item (default 5).

    Returns:
        tuple: (kcore_rating_inters, rating_inters), both lists of
        (user_ID, item_ID, rating, timestamp); the second is the
        de-duplicated list before k-core filtering.

    Raises:
        TypeError: if `make_inters_in_order` did not return a list.
    """
    print('Process rating data: ')
    print(' Dataset: reviews')

    # load ratings
    rating_inters = load_ratings(inters)

    # Sort and remove repeated reviews
    rating_inters = make_inters_in_order(rating_inters)

    # A later cell in this notebook redefines `make_inters_in_order` to return
    # a per-user dict. Feeding that into the k-core filter would iterate over
    # user ids instead of interactions, so fail loudly instead.
    if not isinstance(rating_inters, list):
        raise TypeError(
            'make_inters_in_order returned %s, expected a list of interactions; '
            're-run the cell defining the list-returning version'
            % type(rating_inters).__name__)

    # K-core filtering;
    print('The number of raw inters: ', len(rating_inters))
    kcore_rating_inters = filter_inters(
        rating_inters,
        user_k_core_threshold=user_k_core_threshold,
        item_k_core_threshold=item_k_core_threshold)

    # return: list of (user_ID, item_ID, rating, timestamp)
    return kcore_rating_inters, rating_inters

In [None]:
# Keep only the k-core-filtered interactions. Indexing the result, instead of
# unpacking the unused second element into `_`, avoids holding the full
# pre-filter list in a live variable and leaves IPython's `_` (last output) alone.
inters = preprocess_rating(reviews)[0]


Process rating data: 
 Dataset: reviews
The number of raw inters:  4828480

Filtering by k-core:
    Epoch 1 The number of inters: 1353435, users: 157062, items: 32850
    Epoch 2 The number of inters: 1302721, users: 152319, items: 18486
    Epoch 3 The number of inters: 1281240, users: 146980, items: 18143
    Epoch 4 The number of inters: 1278612, users: 146779, items: 17654
    Epoch 5 The number of inters: 1277242, users: 146453, items: 17635
    Epoch 6 The number of inters: 1277020, users: 146436, items: 17596
    Epoch 7 The number of inters: 1276876, users: 146402, items: 17594
    Epoch 8 The number of inters: 1276864, users: 146399, items: 17594
    Epoch 9 The number of inters: 1276852, users: 146399, items: 17591
    Epoch 10 The number of inters: 1276840, users: 146396, items: 17591


#### Last-out Split
We split our dataset into training, validation, and test using the "leave-last-out data split" method. 

- Training part: the first N-2 items;
- Validation part: the (N-1)-th item;
- Testing part: the N-th item.



In [26]:
# helper method

def make_inters_in_order(inters):
    """Group interactions by user, time-ordered, keeping each item's earliest occurrence.

    NOTE(review): this definition reuses the name of the list-returning helper
    defined earlier in the notebook and replaces it once this cell runs. This
    version returns a defaultdict(list) mapping user -> list of
    (user, item, rating, timestamp) tuples sorted by timestamp (element 3).
    """
    grouped = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        grouped[user].append((user, item, rating, timestamp))

    deduped = collections.defaultdict(list)
    for user, history in grouped.items():
        seen_items = set()
        for inter in sorted(history, key=lambda d: d[3]):
            item = inter[1]
            if item not in seen_items:
                seen_items.add(item)
                deduped[user].append(inter)
    return deduped


In [28]:
def last_out_split(inters):
    """Leave-last-out split of interactions into train / validation / test lists.

    Relies on `make_inters_in_order` returning a mapping of user -> that
    user's interactions in chronological order. For each user the last
    interaction goes to test, the second-to-last (when there is one) to
    validation, and everything before those to train.

    Returns (train_data, valid_data, test_data), each a list of 4-tuples.
    """
    def as_row(inter):
        # Rebuild the 4-tuple from its first four fields.
        return (inter[0], inter[1], inter[2], inter[3])

    train_data, valid_data, test_data = [], [], []

    # Order the inters
    ordered_inters = make_inters_in_order(inters=inters)

    for user in tqdm(ordered_inters, desc='Creating train/valid/test lists'):
        history = ordered_inters[user]
        n_inters = len(history)

        # The most recent interaction is always held out for testing.
        test_data.append(as_row(history[-1]))

        # The one before it, when present, is held out for validation.
        if n_inters > 1:
            valid_data.append(as_row(history[-2]))

        # Whatever precedes those two is training data (empty slice otherwise).
        train_data.extend(as_row(inter) for inter in history[:-2])
    return train_data, valid_data, test_data


In [29]:
# Leave-last-out split of the k-core interactions into the three partitions.
train_data, valid_data, test_data = last_out_split(inters=inters)

Creating train/valid/test lists: 100%|██████████| 146396/146396 [00:02<00:00, 57542.27it/s]


## Purchasing Prediction
In this section, I will develop several baseline models and iteratively improve upon them using advanced techniques discussed in class, as well as deep learning methods. These models will include Most Popular, item-to-item based collaborative filtering, Bayesian Personalized Ranking (BPR), and a deep learning model implemented with PyTorch.


#### Most Popular Model

In [23]:
from collections import Counter

def mostPopularClassification(data, threshold=0.75):
    """
    Collect the most frequent items until they cover `threshold` of all interactions.

    Parameters:
        data (iterable): Interaction records indexable by position, with the
            item identifier at index 1, e.g. (user, item, rating, timestamp) tuples.
        threshold (float): The proportion of total occurrences the popular set must cover.

    Returns:
        set: The most popular items. Empty when `data` is empty; otherwise it
        always contains at least the single most frequent item.
    """
    # Count occurrences of each item (position 1 of every record)
    item_counts = Counter(entry[1] for entry in data)

    # Occurrence count the popular set has to reach; computed once, not per iteration
    cutoff = threshold * sum(item_counts.values())

    # Walk items from most to least frequent until the cutoff is reached
    popular_items = set()
    cumulative_count = 0
    for item, count in item_counts.most_common():
        cumulative_count += count
        popular_items.add(item)
        if cumulative_count >= cutoff:
            break

    return popular_items

In [None]:
# Popular-item set built from the training interactions only.
popular_items = mostPopularClassification(train_data, threshold=0.75)

{'B006HVARCC',
 'B07Q45YDY2',
 'B00ML3I0DS',
 'B00RMJ8IB0',
 'B004VS0JPI',
 'B00SYRLNFW',
 'B01LZM2XTW',
 'B00J9IH1RO',
 'B005EZMCGQ',
 'B00F9F1G4U',
 'B00CAK052Q',
 'B00M51V8A8',
 'B009S984J8',
 'B006PK2TEE',
 'B006QMYF0W',
 'B00H526IU2',
 'B00MY9VE8C',
 'B097PJZ6KR',
 'B01MSBCRDY',
 'B015YFGHMK',
 'B0155X6WVW',
 'B005J0VPNC',
 'B015K3XX8W',
 'B017PKJDEG',
 'B01M8IXC16',
 'B00O5IUQ7U',
 'B017TLHABO',
 'B01883XCYG',
 'B00UC7DG6Q',
 'B07GHTZ8G8',
 'B071CLP46T',
 'B00O1DVM86',
 'B007SPMWOM',
 'B00HHBXYGM',
 'B07H7RY9F3',
 'B004GL88GW',
 'B00CLR1RNE',
 'B00GUG227C',
 'B00B8KRVHG',
 'B071P6LWP8',
 'B01GKJX51A',
 'B00IVD2HMW',
 'B00P77ZAN8',
 'B004V5A7K8',
 'B006K2VNC6',
 'B00MDWUIAK',
 'B00BUR9KJE',
 'B00B2V66VS',
 'B01C449SWU',
 'B00IG2DOKM',
 'B00GAQ3W1M',
 'B007TBAQCK',
 'B006HJKKCG',
 'B00AKJRPC2',
 'B07816NG7B',
 'B00IJLO1IY',
 'B019CVAKMK',
 'B006GWE5PM',
 'B00FI1759K',
 'B0096DFBZQ',
 'B00ETO0UM6',
 'B005V0QA62',
 'B00P31G9PQ',
 'B01DKLHLXY',
 'B00HVYEPAY',
 'B005ZXWMUS',
 'B005DO1N

In [44]:
# Sanity check: show the container type of popular_items.
type(popular_items)

set

In [None]:
# Measuring performance of model for Most Popular model
def accuracy(pred, actual):
    """Return the fraction of positions at which `pred` and `actual` agree.

    Parameters:
        pred: sequence of predicted labels.
        actual: sequence of true labels, same length as `pred`.

    Returns:
        float: matches / len(pred), or 0.0 when both are empty.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(pred) != len(actual):
        raise ValueError(
            "pred and actual must have the same length (%d != %d)"
            % (len(pred), len(actual)))
    if len(pred) == 0:
        return 0.0  # Avoid division by zero
    # Compare element by element: `pred == actual` on plain lists is a single
    # bool, not a per-position comparison.
    matches = sum(1 for p, a in zip(pred, actual) if p == a)
    return matches / len(pred)

def precision_at_k(recommendations, relevant_items):
    """Return the share of recommended items that are relevant.

    Both arguments may be any iterable of hashable items; they are converted
    to sets, matching `recall_at_k`. Returns 0 when there are no
    recommendations.
    """
    recommended = set(recommendations)
    if len(recommended) == 0:
        return 0  # Avoid division by zero
    return len(recommended & set(relevant_items)) / len(recommended)

def recall_at_k(most_popular, relevant_items):
    """Return the share of `relevant_items` that also appear in `most_popular`.

    The denominator is len(relevant_items); an empty `relevant_items` gives 0.
    """
    n_relevant = len(relevant_items)
    if n_relevant == 0:
        return 0  # Avoid division by zero
    hits = set(most_popular).intersection(relevant_items)
    return len(hits) / n_relevant

In [72]:
# Observed (user, item) pairs from train and validation data; test_data is
# deliberately left out.
purchased_set = {(inter[0], inter[1]) for inter in train_data + valid_data}

# Every user and every item that occurs in at least one observed pair.
user_set = {user for user, _item in purchased_set}
item_set = {item for _user, item in purchased_set}

In [None]:
# Make validation dataset include negative sets at an equal rate as positive set

def generate_negatives(user_set, item_set, purchased_set, num_negatives=1, seed=None):
    """Sample, for every user, items that user has no observed interaction with.

    Parameters:
        user_set: users to sample negatives for.
        item_set: candidate items.
        purchased_set: observed (user, item) pairs; these are never sampled.
        num_negatives (int): maximum number of negatives per user, capped at
            the number of items the user has not interacted with. Defaults to
            1, i.e. one negative per user to pair with the single validation
            interaction each user contributes. The earlier default,
            len(valid_data), was evaluated once at definition time and,
            being applied per user, requested nearly every unseen item for
            every user.
        seed: optional seed for reproducible sampling; None uses the global
            `random` state.

    Returns:
        list: (user, item) negative pairs, grouped by user in sorted user order.
    """
    rng = random if seed is None else random.Random(seed)

    # Index purchases by user once, instead of rescanning every observed pair
    # for every user.
    purchased_by_user = collections.defaultdict(set)
    for user, item in purchased_set:
        purchased_by_user[user].add(item)

    # random.sample needs a sequence (sets are rejected on Python 3.11+).
    # Sorting also makes the draw independent of set iteration order, so a
    # fixed seed reproduces the same negatives.
    item_pool = sorted(item_set)

    negatives = []
    for user in sorted(user_set):
        seen = purchased_by_user.get(user, ())
        candidates = [item for item in item_pool if item not in seen]
        n_draw = min(num_negatives, len(candidates))
        negatives.extend((user, item) for item in rng.sample(candidates, n_draw))
    return negatives

('AFSKPY37N3C43SOI5IEXEK5JSIYA', 'B002I7PGT8', 1.0, 1366378695000)


146396

In [None]:
# Testing against validation dataset to optimize threshold
valid_data_set = {x[1] for x in valid_data}

precision = precision_at_k(popular_items, valid_data_set)
recall = recall_at_k(popular_items, valid_data_set)

# loop through valid_dataset
for review in 
# Check if its in if so its 1, check if its equal to actual, if so then add it to correct


In [61]:
print(f"Precision: {precision}")
# A stray "f" inside the literal used to be printed verbatim ("Recall: f0.15...").
print(f"Recall: {recall}")

Precision: 1.0
Recall: f0.15030674846625766
