#### Config

In [1]:
# All generated artifacts for one dataset live in a single directory named
# after that dataset, so the interaction/item files end up next to the JSON
# mappings instead of in another dataset's folder.
DATASET_NAME = 'Amazon_Sports_and_Outdoors'
OUTPUT_DIR = f'../{DATASET_NAME}'

# Inputs: JSONL files read by the loading cells.
REVIEWS_PATH = './Sports_and_Outdoors.jsonl'
META_PATH = './meta_Sports_and_Outdoors.jsonl'

# Outputs: tab-separated interaction (.inter) and item (.item) tables.
SAVE_PATH = f'{OUTPUT_DIR}/{DATASET_NAME}.inter'
META_SAVE_PATH = f'{OUTPUT_DIR}/{DATASET_NAME}.item'

# Outputs: JSON mappings between original string IDs and integer codes.
ITEM_MAPPING_PATH = f'{OUTPUT_DIR}/item_mapping_{DATASET_NAME}.json'
USER_MAPPING_PATH = f'{OUTPUT_DIR}/user_mapping_{DATASET_NAME}.json'
ITEM_REVERSE_MAPPING_PATH = f'{OUTPUT_DIR}/item_reverse_mapping_{DATASET_NAME}.json'
USER_REVERSE_MAPPING_PATH = f'{OUTPUT_DIR}/user_reverse_mapping_{DATASET_NAME}.json'

#MIN_USER_OCCURENCE = 5

# Preprocessing
###### Authors: Piotr Stachowicz, Jakub Malczak

## (.inter)

### Reading the JSONL 'review' file

In [2]:
import pandas as pd
import json

# Output column name -> key to read from each raw review record.
# Keys missing from a record yield None (dict.get semantics).
review_fields = {
    "user_id:token": "user_id",
    "item_id:token": "parent_asin",
    "rating:float": "rating",
    "timestamp:float": "timestamp",
}

# One JSON document per line; keep only the four fields listed above.
with open(REVIEWS_PATH, 'r', encoding='utf-8') as reviews_file:
    data = [
        {column: review.get(key) for column, key in review_fields.items()}
        for review in map(json.loads, reviews_file)
    ]

df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B0BGFR76CF,5.0,1677321053520
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B00NXQLFQQ,5.0,1523093771676
2,AGGZ357AO26RQZVRLGU4D4N52DZQ,B0957WLR63,5.0,1653526919105
3,AGGZ357AO26RQZVRLGU4D4N52DZQ,B00IET8S80,5.0,1627330911189
4,AGGZ357AO26RQZVRLGU4D4N52DZQ,B01C2SW7XA,5.0,1617831811976


### Map string IDs

In [3]:
# Replace the string IDs with dense integer codes. `*_index` holds the unique
# original IDs, positioned by code (eg. user_index[42] -> "ASFAFASFASF").
user_codes, user_index = pd.factorize(df["user_id:token"])
item_codes, item_index = pd.factorize(df["item_id:token"])
df["user_id:token"] = user_codes
df["item_id:token"] = item_codes

# Inverse lookups: original ID -> integer code (eg. "ASFAFASFASF" -> 42).
reverse_item_index = dict(zip(item_index, range(len(item_index))))
reverse_user_index = dict(zip(user_index, range(len(user_index))))

### Filter entries

##### By minimum occurrence

In [4]:
# Disabled step: when enabled, keeps only users whose number of rows is
# strictly greater than MIN_USER_OCCURENCE. That constant is also commented
# out in the Config cell, so both lines have to be re-enabled together.
#df = df.groupby('user_id:token').filter(lambda x: len(x) > MIN_USER_OCCURENCE)

### Validate dataset

##### Check for null values

In [5]:
# Missing values per column.
# The ID columns were replaced by pd.factorize codes earlier in the notebook;
# factorize marks missing values with the sentinel -1 instead of NaN, so
# isna() alone would always report 0 for them. Add the sentinel rows to the
# per-column totals so missing IDs are actually visible here.
null_counts = df.isna().sum()
id_cols = ["user_id:token", "item_id:token"]
null_counts[id_cols] += df[id_cols].eq(-1).sum()
null_counts

user_id:token      0
item_id:token      0
rating:float       0
timestamp:float    0
dtype: int64

##### Check for invalid IDs

In [6]:
# Report empty-string values ("puste stringi" = "empty strings").
# Any column of df that still has object dtype is scanned directly.
string_cols = df.select_dtypes(include="object").columns
for col in string_cols:
    print(col, "=> puste stringi:", (df[col] == "").sum())

# The ID columns hold integer codes at this point (they were overwritten with
# the pd.factorize output), so the scan above never sees them. Look the empty
# string up in the original-ID -> code mappings and count the rows that carry
# that code instead.
for col, original_to_code in (
    ("user_id:token", reverse_user_index),
    ("item_id:token", reverse_item_index),
):
    if col in string_cols:
        continue  # still a string column: already reported by the loop above
    empty_code = original_to_code.get("")
    empty_rows = 0 if empty_code is None else int((df[col] == empty_code).sum())
    print(col, "=> puste stringi:", empty_rows)

##### Check for invalid types

In [7]:
# Summary statistics of the numeric columns, rendered as whole numbers with
# thousands separators so out-of-range values are easy to spot.
numeric_summary = df[["rating:float", "timestamp:float"]].describe()
numeric_summary.map("{:,.0f}".format)

Unnamed: 0,rating:float,timestamp:float
count,19595170,19595170
mean,4,1542144400806
std,1,91428989142
min,1,957208301000
25%,4,1474241284500
50%,5,1557174492773
75%,5,1616176559383
max,5,1694670041162


### Save dataset to .inter file

In [8]:
# Write the interaction table as tab-separated text without the row index.
df.to_csv(SAVE_PATH, index=False, sep="\t")

### Save mappings to .json file

In [9]:
# (destination path, JSON payload) pairs, written in the order listed.
# The forward mappings are lists of original IDs positioned by integer code;
# the reverse mappings are dicts from original ID to integer code.
mapping_dumps = (
    (ITEM_MAPPING_PATH, list(item_index)),
    (ITEM_REVERSE_MAPPING_PATH, reverse_item_index),
    (USER_MAPPING_PATH, list(user_index)),
    (USER_REVERSE_MAPPING_PATH, reverse_user_index),
)

for target_path, payload in mapping_dumps:
    with open(target_path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file)

## (.item)

### Reading the JSONL 'meta' file

In [10]:
def safe_float(x):
    """Convert ``x`` to ``float``, falling back to NaN when that fails.

    Returns ``float(x)`` when the conversion succeeds. If ``float`` raises
    ``TypeError`` or ``ValueError`` (e.g. ``None`` or a non-numeric string),
    NaN is returned instead of propagating the error.
    """
    try:
        value = float(x)
    except (ValueError, TypeError):
        value = float("nan")
    return value

# Build the item table from the 'meta' file: one row per metadata record
# whose parent_asin has an integer code in reverse_item_index.
data = []
with open(META_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)

        # Does not exist in 'review' file -> skip before doing any other work
        # on the record. The lookup also yields the integer item code; 0 is a
        # valid code, hence the explicit comparison against None.
        item_code = reverse_item_index.get(record.get("parent_asin"))
        if item_code is None:
            continue

        # A record whose `details` is missing or null is treated as having no
        # rank information instead of raising on `.get`.
        best_sellers = (record.get("details") or {}).get("Best Sellers Rank") or {}
        max_key, max_value = None, float('nan')
        if best_sellers:
            # Keep the category with the numerically largest rank value.
            max_key = max(best_sellers, key=best_sellers.get)
            max_value = best_sellers[max_key]

        # Categories are rendered as a single-quoted, comma-separated list
        # ('A', 'B', ...); an absent or empty list becomes the empty string.
        categories = record.get("categories")
        categories_text = ", ".join(f"'{category}'" for category in categories) if categories else ''

        data.append(
            {
                "item_id:token": item_code,
                "title:token": record.get("title"),
                "price:float": safe_float(record.get("price")),
                # `store` is None 35109 times vs. 572787 times for
                # details -> Brand Name, so `store` is used as the brand.
                "brand:token": record.get("store"),
                "categories:token_seq": categories_text,
                "sales_type:token": max_key,
                "sales_rank:float": max_value,
            }
        )

df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,item_id:token,title:token,price:float,brand:token,categories:token_seq,sales_type:token,sales_rank:float
0,884509,Sure-Grip Zombie Wheels Low 59mm 4 Pack,55.0,Sure-Grip,"'Sports & Outdoors', 'Sports', 'Skates, Skateb...",Sports & Outdoors,295175.0
1,561856,USGI Wet Weather Bag (Fоur Paсk),,USGI,"'Sports & Outdoors', 'Sports', 'Boating & Sail...",Sports & Outdoors,962400.0
2,239749,NHL San Jose Sharks Team Logo Post Earrings,18.99,Aminco,"'Sports & Outdoors', 'Fan Shop', 'Jewelry & Wa...",Sports & Outdoors,721263.0
3,55030,Bont Skates - Prostar Purple Suede Professiona...,209.0,Bont,"'Sports & Outdoors', 'Sports', 'Skates, Skateb...",Sports & Outdoors,213685.0
4,1277121,Team Golf Alamaba Crimson Tide Embroidered Tow...,,Team Golf,"'Sports & Outdoors', 'Fan Shop', 'Sports Equip...",Sports & Outdoors,1621084.0


### Validate dataset

##### Check for null values

In [11]:
# Per-column count of missing entries in the item table.
df.isnull().sum()

item_id:token                 0
title:token                   0
price:float             1102717
brand:token               35108
categories:token_seq          0
sales_type:token         963077
sales_rank:float         963077
dtype: int64

##### Check for invalid IDs

In [12]:
# For every object-dtype column, print how many rows hold the empty string
# ("puste stringi" = "empty strings").
string_cols = df.select_dtypes(include="object").columns
for name in string_cols:
    n_empty = df[name].eq("").sum()
    print(name, "=> puste stringi:", n_empty)

title:token => puste stringi: 112
brand:token => puste stringi: 0
categories:token_seq => puste stringi: 89817
sales_type:token => puste stringi: 0


##### Check for invalid types

In [13]:
# Summary statistics of the numeric item columns, rendered as whole numbers
# with thousands separators.
item_numeric_summary = df[["price:float", "sales_rank:float"]].describe()
item_numeric_summary.map("{:,.0f}".format)

Unnamed: 0,price:float,sales_rank:float
count,484502,624142
mean,57,735503
std,162,614592
min,0,1
25%,13,267574
50%,24,589380
75%,50,1054191
max,22000,13516189


### Save dataset to .item file

In [14]:
# Write the item table as tab-separated text without the row index; missing
# values are emitted as empty fields.
df.to_csv(META_SAVE_PATH, index=False, na_rep="", sep="\t")