In [17]:
import csv
import json
import sys
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np

cat_file = r"D:\David\Škola\RP\dataset\polyvore_outfits\categories.csv"

# Parse the category table. Every non-empty row is unpacked as
# (category number, category name, category group).
# newline="" is what the csv module asks for when it is handed a file object,
# so that line endings embedded in quoted fields are interpreted correctly.
with open(cat_file, newline="") as categories:
    csv_reader = csv.reader(categories, delimiter=',')
    line_count = 0
    cat_groups = []  # distinct category groups, in first-seen order
    cat_dict = {}    # category number (int) -> stripped category group
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line; unpacking it would raise.
            continue
        cat_number, cat, cat_group = row
        cat_group = cat_group.strip()
        if cat_group not in cat_groups:
            cat_groups.append(cat_group)

        cat_dict[int(cat_number)] = cat_group

        line_count += 1

    print(f'Total {len(cat_groups)} groups.')
    print(f'Total {line_count} categories.')

Total 12 groups.
Total 224 categories.


In [22]:
# Locations of the dataset files used by this notebook. All paths are absolute
# and machine-specific, so they must be edited before running elsewhere.
# The three split files are passed to get_sets() further down in this cell.
dataset_path_train = r"D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\train.json"
dataset_path_valid = r"D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\valid.json"
dataset_path_test = r"D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\test.json"
# Fill-in-the-blank questions file; opened by the FITB inspection cell.
fitb_filepath = r"D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\fill_in_blank_valid.json"
# Dataset root directory; polyvore_item_metadata.json is read from here.
dataset_root = r"D:\David\Škola\RP\dataset\polyvore_outfits"

def key_from_fitb_string(string):
    """Turn an underscore-separated FITB identifier into a pair of ints.

    Only the first two underscore-delimited fields are used; any further
    fields are ignored. Returns a 2-tuple ``(int(field0), int(field1))``.
    """
    fields = string.split("_")
    return tuple(int(fields[position]) for position in (0, 1))

# Item metadata, keyed by item id. Each entry is read below for its
# "semantic_category" and "category_id" fields.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than inherited from the platform locale.
with open(Path(dataset_root, "polyvore_item_metadata.json"), encoding="utf-8") as json_file:
    metadata = json.load(json_file)

# Accumulates the semantic category of every item of every outfit processed by
# get_sets(); reset here so that re-running this cell starts from empty.
compositions = []

def get_sets(dataset_path, item_metadata=None, category_log=None):
    """Load one outfit split and print summary statistics about it.

    Parameters
    ----------
    dataset_path : str or os.PathLike
        JSON file containing a list of outfits. Each outfit is read for its
        "set_id" and for the "item_id" of every entry in its "items" list.
    item_metadata : dict, optional
        Mapping item id -> dict with a "semantic_category" entry. Defaults to
        the notebook-level ``metadata``.
    category_log : list, optional
        List that receives the semantic category of every item of every
        outfit. Defaults to the notebook-level ``compositions``.

    Returns
    -------
    tuple
        ``(outfits_set, items_set, item_counts)``: the set of integer set ids,
        the set of item ids, and a list with the number of items per outfit
        in file order.

    Side effects
    ------------
    Extends ``category_log`` and prints the outfit count, a
    ``Series.describe()`` summary of the outfit sizes, an outfit-size
    histogram and the number of distinct items.
    """
    if item_metadata is None:
        item_metadata = metadata
    if category_log is None:
        category_log = compositions

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(dataset_path, encoding="utf-8") as json_file:
        raw_json = json.load(json_file)
    # f-strings so that a pathlib.Path argument works as well as a str.
    print(f"Loaded {len(raw_json)} outfits from {dataset_path}", flush=True)

    items_set = set()
    outfits_set = set()
    item_counts = []

    # Collect set ids, item ids and outfit sizes for this split.
    for outfit in raw_json:
        outfit_items = outfit["items"]
        # Build the full list first so a missing metadata entry leaves
        # category_log untouched for this outfit.
        category_log.extend(
            [item_metadata[item["item_id"]]["semantic_category"] for item in outfit_items]
        )
        outfits_set.add(int(outfit["set_id"]))
        item_counts.append(len(outfit_items))
        for item in outfit_items:
            items_set.add(item["item_id"])

    series = pd.Series(item_counts)
    print(series.describe())
    # Histogram {outfit size: number of outfits}; sizes that never occur are dropped.
    bcounts = np.bincount(item_counts)
    print(dict(zip(np.unique(item_counts), bcounts[bcounts.nonzero()])))

    print(f"Loaded {len(items_set)} items from {dataset_path}", flush=True)
    return outfits_set, items_set, item_counts
            
# Summarise each split. get_sets() prints per-split statistics and also appends
# every item's semantic category to the module-level `compositions` list.
train_o, train_i, train_counts = get_sets(dataset_path_train)
valid_o, valid_i, valid_counts = get_sets(dataset_path_valid)
test_o, test_i, test_counts = get_sets(dataset_path_test)

total_o = train_o.union(valid_o).union(test_o)
# Concatenate into a NEW list. Binding total_counts to train_counts and then
# extending it would mutate train_counts in place, so it would silently end up
# holding the outfit sizes of all three splits.
total_counts = train_counts + valid_counts + test_counts

# Outfit-size histogram over all splits: {size: number of outfits}.
bcounts = np.bincount(total_counts)
print(dict(zip(np.unique(total_counts), bcounts[bcounts.nonzero()])))

# Occurrences of each semantic category divided by the number of distinct outfits.
outfits_count = len(total_o)
counter = Counter(compositions)
for cat in counter:
    counter[cat] = counter[cat] / outfits_count
print(counter)

total_i = train_i.union(valid_i).union(test_i)
print("The dataset contains " + str(len(total_o)) + " outfits")
print("The dataset contains " + str(len(total_i)) + " items")
print("The metadata contains " + str(len(metadata)) + " items")

# Distinct category ids referenced by the item metadata.
categories = {int(item["category_id"]) for item in metadata.values()}
print("The metadata contains " + str(len(categories)) + " categories")

Loaded 16995 outfits from D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\train.json
count    16995.000000
mean         5.056487
std          1.479326
min          2.000000
25%          4.000000
50%          5.000000
75%          6.000000
max         16.000000
dtype: float64
{2: 346, 3: 1842, 4: 4178, 5: 4850, 6: 3208, 7: 1581, 8: 639, 9: 231, 10: 90, 11: 18, 12: 9, 13: 1, 14: 1, 16: 1}
Loaded 71967 items from D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\train.json
Loaded 3000 outfits from D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\valid.json
count    3000.000000
mean        5.096000
std         1.485099
min         2.000000
25%         4.000000
50%         5.000000
75%         6.000000
max        14.000000
dtype: float64
{2: 52, 3: 316, 4: 722, 5: 866, 6: 568, 7: 302, 8: 107, 9: 45, 10: 11, 11: 9, 12: 1, 14: 1}
Loaded 14657 items from D:\David\Škola\RP\dataset\polyvore_outfits\disjoint\valid.json
Loaded 15145 outfits from D:\David\Škola\RP\dataset\polyvore_outfits\

In [None]:
# Inspect the fill-in-the-blank (FITB) questions: for every task, look up the
# category of each question item and each answer candidate, and print the tasks
# whose answer candidates do not all share a single category.
#
# NOTE(review): `items` is not defined in any cell visible here, and this cell
# has no execution count. It is indexed with the tuple returned by
# key_from_fitb_string() and the looked-up value is treated as a category --
# presumably a dict built elsewhere; confirm it exists before this cell runs,
# otherwise a fresh kernel raises NameError.
# NOTE(review): `examples`, `inputs`, `targets` and `target_pos` are assigned
# but never read within the visible part of this cell.
examples = []
with open(fitb_filepath) as fitb_file:
    raw_json = json.load(fitb_file)
    print("Loaded " + str(len(raw_json)) + " questions", flush=True)

    # Compose questions from FITB file and test items dict
    for task in raw_json:
        set_id = None
        inputs = []
        input_categories = []
        targets = []
        target_categories = []
        target_pos = None

        # Question items: record each item's category. set_id ends up as the
        # first key component of the LAST question item.
        for question_item_str in task["question"]:
            q_key = key_from_fitb_string(question_item_str)
            item_category = items[q_key]
            input_categories.append(item_category)
            set_id = q_key[0]
        pos = 0

        # Answer candidates: record each category and remember the position of
        # a candidate whose first key component equals set_id (if several
        # match, the last one wins).
        for question_item_str in task["answers"]:
            q_key = key_from_fitb_string(question_item_str)
            if q_key[0] == set_id:
                target_pos = pos
            item_category = items[q_key]
            target_categories.append(item_category)
            pos += 1
        
        # Report tasks whose answer candidates span more than one category.
        if len(set(target_categories)) > 1:
            print(set_id)
            print(target_categories)
        