In [2]:
import csv
import json
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np

cat_file = r"PATH TO CATEGORY FILE"


# Load categories. Every CSV row is unpacked as
# (category number, category name, category group); only the number and the
# whitespace-stripped group are kept.
# newline='' is what the csv module asks for so it can handle line endings itself.
with open(cat_file, newline='') as category_file:
    cat_dict = {}       # int category number -> stripped group name
    group_order = {}    # dict keys used as an insertion-ordered set of group names
    line_count = 0      # number of category rows read
    for row in csv.reader(category_file, delimiter=','):
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on the three-way unpack below.
            continue
        cat_number, _cat_name, cat_group = row
        group = cat_group.strip()
        group_order.setdefault(group, None)
        cat_dict[int(cat_number)] = group
        line_count += 1

# Distinct group names, in order of first appearance in the file.
cat_groups = list(group_order)

print(f'Total {len(cat_groups)} groups.')
print(f'Total {line_count} categories.')

Total 12 groups.
Total 224 categories.


In [5]:
dataset_path_train = r"PATH TO train.json"
dataset_path_valid = r"PATH TO valid.json"
dataset_path_test = r"PATH TO test.json"
dataset_root = r"PATH TO polyvore_outfits/ folder"

# Read the item metadata JSON that sits directly under the dataset root.
metadata_path = Path(dataset_root) / "polyvore_item_metadata.json"
with metadata_path.open() as metadata_file:
    metadata = json.load(metadata_file)

# Shared accumulator: get_sets() appends the semantic category of every item it reads.
compositions = []

# Get sets of outfits, items and item counts
def get_sets(dataset_path):
    """Load one split file and summarise its outfits.

    Reads the JSON list of outfits stored at ``dataset_path``, prints summary
    statistics of the outfit sizes and, as a side effect, appends the
    ``semantic_category`` of every item occurrence (looked up in the
    module-level ``metadata`` dict) to the module-level ``compositions`` list.

    Args:
        dataset_path: Location of the split's JSON file, given as a ``str``
            or as any path-like object accepted by ``open``.

    Returns:
        A tuple ``(outfits_set, items_set, item_counts)``: the set of integer
        outfit ids, the set of distinct item ids, and a list holding the
        number of items of each outfit in file order.

    Raises:
        KeyError: If an item id is not present in ``metadata``.
    """
    with open(dataset_path) as json_file:
        raw_json = json.load(json_file)
    # Formatting (rather than "+" concatenation) lets pathlib.Path arguments
    # work as well as plain strings; the printed text is the same for str.
    print(f"Loaded {len(raw_json)} outfits from {dataset_path}", flush=True)

    items_set = set()
    outfits_set = set()
    item_counts = []

    # Collect outfit ids, item ids and per-outfit sizes for this split.
    for outfit in raw_json:
        outfit_items = outfit["items"]
        compositions.extend(
            [metadata[item["item_id"]]["semantic_category"] for item in outfit_items]
        )
        outfits_set.add(int(outfit["set_id"]))
        item_counts.append(len(outfit_items))
        items_set.update(item["item_id"] for item in outfit_items)

    print(pd.Series(item_counts).describe())

    # Histogram {outfit size: number of outfits}, ordered by outfit size.
    size_histogram = Counter(item_counts)
    print({size: size_histogram[size] for size in sorted(size_histogram)})

    print(f"Loaded {len(items_set)} items from {dataset_path}", flush=True)
    return outfits_set, items_set, item_counts
            
# Load each split; get_sets also appends every item's semantic category to `compositions`.
train_o, train_i, train_counts = get_sets(dataset_path_train)
valid_o, valid_i, valid_counts = get_sets(dataset_path_valid)
test_o, test_i, test_counts = get_sets(dataset_path_test)

# Unique outfit ids across the three splits.
total_o = train_o.union(valid_o, test_o)

# Build a NEW list: binding total_counts to train_counts and extending it would
# silently append the valid/test counts to train_counts as well.
total_counts = train_counts + valid_counts + test_counts

# Tally how often each semantic category occurs among all loaded items.
outfits_count = len(total_o)
counter = Counter(compositions)
print(counter)

# Rescale every tally in place by the number of unique outfit ids.
for cat, occurrences in list(counter.items()):
    counter[cat] = occurrences / outfits_count
print(counter)

# Distinct item ids over all three splits, followed by the headline sizes.
total_i = train_i.union(valid_i, test_i)
print(f"The dataset contains {len(total_o)} outfits")
print(f"The dataset contains {len(total_i)} items")
print(f"The metadata contains {len(metadata)} items")

# Distinct integer category ids referenced by the metadata entries.
categories = {int(entry["category_id"]) for entry in metadata.values()}
print(f"The metadata contains {len(categories)} categories")

Loaded 10000 outfits from D:\David\Škola\RP\dataset\polyvore_outfits\nondisjoint\test.json
count    10000.000000
mean         5.350600
std          1.624729
min          2.000000
25%          4.000000
50%          5.000000
75%          6.000000
max         17.000000
dtype: float64
{2: 174, 3: 917, 4: 2036, 5: 2656, 6: 2078, 7: 1199, 8: 585, 9: 200, 10: 99, 11: 36, 12: 10, 13: 7, 14: 2, 17: 1}
Loaded 47854 items from D:\David\Škola\RP\dataset\polyvore_outfits\nondisjoint\test.json
Counter({'shoes': 9613, 'jewellery': 8896, 'bags': 8694, 'tops': 6669, 'bottoms': 6134, 'outerwear': 3467, 'all-body': 3451, 'sunglasses': 2906, 'hats': 1401, 'accessories': 1361, 'scarves': 914})
The metadata contains 251008 items
The metadata contains 153 categories
