In [1]:
import random

import pandas as pd

from tqdm.auto import tqdm

In [2]:
# Load the per-item category table from a JSON-lines file (one JSON object per line).
categories = pd.read_json('/pio/scratch/1/recommender_systems/interim/Amazon/meta_Clothing_Shoes_and_Jewelry_categories.json', lines=True)

In [105]:
def prepare_test_file(categories, path, test_set_size = 10000,
                      category_type = "category_1", weighted_sample = True,
                      min_counts = 0, seed = None):
    """Sample ABX triplets from a category table and write them as JSON lines.

    Every record holds two distinct items ``A`` and ``X`` drawn from one
    ("positive") category and one item ``B`` drawn from a different
    ("negative") category, plus both category labels (``category_AX`` and
    ``category_B``).

    Parameters
    ----------
    categories : pandas.DataFrame
        Table with an ``asin`` column and the column named by ``category_type``.
    path : str or path-like
        Destination file; it is overwritten.
    test_set_size : int
        Number of triplets to generate.
    category_type : str
        Column of ``categories`` whose values define the categories.
    weighted_sample : bool
        If True, categories are drawn with probability proportional to their
        item count; otherwise every eligible category is equally likely.
    min_counts : int
        Only categories with strictly more than ``min_counts`` items are used.
        Categories with fewer than two items are always skipped, because a
        positive category has to supply two distinct items.
    seed : int or None
        Seed for reproducible output. With None (the default) the global
        ``random`` / NumPy random state is used.

    Raises
    ------
    ValueError
        If fewer than two categories remain after filtering.
    """
    counts = categories[category_type].value_counts()
    # Single-item categories cannot provide both A and X; drawing one as the
    # positive category would make ``sample(2)`` fail in the middle of the run.
    counts = counts[(counts > min_counts) & (counts >= 2)]
    if len(counts) < 2:
        raise ValueError(
            "need at least two categories with two or more items in column "
            f"{category_type!r} (min_counts={min_counts}), found {len(counts)}"
        )

    # Keep only the item ids of the categories that can actually be drawn.
    eligible = set(counts.index)
    items_by_category = {
        category: asins
        for category, asins in categories.groupby(category_type)["asin"]
        if category in eligible
    }

    rng = random if seed is None else random.Random(seed)

    def _random_state():
        # A fresh integer per draw: reusing one fixed ``random_state`` would
        # make every pandas ``sample`` call return the same rows.
        return None if seed is None else rng.randrange(2 ** 32)

    weights = counts if weighted_sample else None

    test_set = []
    for _ in tqdm(range(test_set_size)):

        # choosing two distinct categories for ABX
        category_sample = counts.sample(2, weights=weights,
                                        random_state=_random_state())

        # choosing which category to treat as positive & negative
        positive, negative = category_sample.index
        if rng.random() < 0.5:
            positive, negative = negative, positive

        # choosing items for ABX from the positive & negative categories
        positive_items = items_by_category[positive].sample(
            2, random_state=_random_state()).values
        negative_item = items_by_category[negative].sample(
            1, random_state=_random_state()).values

        # appending record
        test_set.append({"A": positive_items[0],
                         "B": negative_item[0],
                         "X": positive_items[1],
                         "category_AX": positive,
                         "category_B": negative})

    with open(path, 'w') as file:
        # ``orient`` is passed by keyword: positional use is deprecated in pandas.
        pd.DataFrame(test_set).to_json(file, orient='records', lines=True)

In [106]:
# Generate 10000 ABX triplets over `category_1` (weighted sampling is the default) and write them to disk.
prepare_test_file(categories, '/pio/scratch/1/i313924/test_ABX', category_type='category_1', test_set_size = 10000)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [92]:
# Number of items per `category_2` label, most frequent first.
counts = categories["category_2"].value_counts()
counts

Clothing                                                                1044952
Shoes                                                                    466895
Jewelry                                                                  399823
Accessories                                                              186716
Watches                                                                  123775
                                                                         ...   
Size: 0.20 x inch                                                             1
Size details ,please see the size table pictures with tshirt photoes          1
Material: Copper, zircon                                                      1
Soft woven canvas fabric                                                      1
                                                                              1
Name: category_2, Length: 9908, dtype: int64

In [93]:
# Draw two categories with probability proportional to their item count.
counts.sample(2, weights=counts)

Accessories    186716
Men             17361
Name: category_2, dtype: int64

In [94]:
# Draw two categories uniformly at random, ignoring how many items each one has.
counts.sample(2)

Om Namah Shivai Prayer Shawl of Lord Shiva    1
Earring size:13*15mm/0.51" x 0.59".           1
Name: category_2, dtype: int64

## Debugging

In [3]:
# Reload a previously generated ABX test file (JSON lines) for inspection.
abx_tests = pd.read_json('/pio/scratch/1/i313924/test_data/test_ABX_weighted.json', lines=True)

In [4]:
# Show the loaded ABX triplets.
abx_tests

Unnamed: 0,A,B,X,category_AX,category_B
0,B00Z730M1M,B00HZOJFWI,B00BXA6ZJQ,Women,Novelty & More
1,B00E3V35YK,B004LOFDB2,B00GXKDM1A,Women,Novelty & More
2,B01E5A13LA,B003XDTBS8,B01ERHV7R6,Women,Men
3,B004VQ19AY,B00JFAMADQ,B01D9O8Z3M,Women,Luggage & Travel Gear
4,B016Y82UPO,B005GTQQCQ,B016S2Z5HG,Men,Women
...,...,...,...,...,...
9995,B016ZNTZDS,B00AYCKUMW,B001KYU04G,Luggage & Travel Gear,Women
9996,B01CRKBVA8,B008FXIP8S,B00D8BR36M,Women,Men
9997,B014ERCUSE,B0016HSH6K,B0060QUJPY,Girls,Men
9998,B001T6IK42,B006NLKZD2,B007I7KE9A,Men,Novelty & More


In [7]:
# Category labels of item A from ABX row 0.
categories.loc[categories["asin"] == "B00Z730M1M"]

Unnamed: 0,asin,category_1,category_2
1973909,B00Z730M1M,Women,Shoes


In [8]:
# Category labels of item B from ABX row 0.
categories.loc[categories["asin"] == "B00HZOJFWI"]

Unnamed: 0,asin,category_1,category_2
1193885,B00HZOJFWI,Novelty & More,Clothing


In [9]:
# Category labels of item X from ABX row 0.
categories.loc[categories["asin"] == "B00BXA6ZJQ"]

Unnamed: 0,asin,category_1,category_2
809153,B00BXA6ZJQ,Women,Handbags & Wallets


In [10]:
# Category labels of item A from ABX row 9998.
categories.loc[categories["asin"] == "B001T6IK42"]

Unnamed: 0,asin,category_1,category_2
144195,B001T6IK42,Men,Accessories


In [11]:
# Category labels of item B from ABX row 9998.
categories.loc[categories["asin"] == "B006NLKZD2"]

Unnamed: 0,asin,category_1,category_2
481418,B006NLKZD2,Novelty & More,Clothing


In [12]:
# Category labels of item X from ABX row 9998.
categories.loc[categories["asin"] == "B007I7KE9A"]

Unnamed: 0,asin,category_1,category_2
534416,B007I7KE9A,Men,Made from 100% Performance Polyester


In [14]:
# How often each category was used as the positive (A/X) category.
abx_tests["category_AX"].value_counts()

Women                                3819
Men                                  2420
Novelty & More                       1695
Girls                                 560
Baby                                  386
Luggage & Travel Gear                 350
Boys                                  338
Costumes & Accessories                301
Shoe, Jewelry & Watch Accessories      97
Uniforms, Work & Safety                18
Traditional & Cultural Wear            16
Name: category_AX, dtype: int64

In [15]:
# How often each category was used as the negative (B) category.
abx_tests["category_B"].value_counts()

Women                                3819
Men                                  2390
Novelty & More                       1729
Girls                                 519
Baby                                  398
Luggage & Travel Gear                 351
Boys                                  342
Costumes & Accessories                298
Shoe, Jewelry & Watch Accessories     114
Traditional & Cultural Wear            22
Uniforms, Work & Safety                18
Name: category_B, dtype: int64