In [6]:
import pandas as pd
import numpy as np
import json

import warnings
warnings.filterwarnings('ignore')

In [7]:
def data_preparation(row):
# getting loading_* params
    lw = row['data_result']['cargo_space']['loading_size']['width']
    lh = row['data_result']['cargo_space']['loading_size']['height']
    ll = row['data_result']['cargo_space']['loading_size']['length']
    
# normalizing box size params
    w = np.array([x['size']['width'] for x in row['data_result']['boxes']]) / lw
    h = np.array([x['size']['height'] for x in row['data_result']['boxes']]) / lh
    l = np.array([x['size']['length'] for x in row['data_result']['boxes']]) / ll
    
    # getting volume
    v = w * h * l
#     ids = np.array([x['id'] for x in row['data_result']['boxes']])

# getting other params
    t = np.array([int(x['turnover']) for x in row['data_result']['boxes']])
    s = np.array([int(x['stacking']) for x in row['data_result']['boxes']])
    
# make custom params
    w_t = w[t == 1]
    h_t = h[t == 1]
    l_t = l[t == 1]
    v_t = w_t * h_t * l_t 
    w_nt = w[t == 0]
    h_nt = h[t == 0]
    l_nt = l[t == 0]
    v_nt = w_nt * h_nt * l_nt

    w_s = w[s == 1]
    h_s = h[s == 1]
    l_s = l[s == 1]
    v_s = w_s * h_s * l_s
    w_ns = w[s == 0]
    h_ns = h[s == 0]
    l_ns = l[s == 0]
    v_ns = w_ns * h_ns * l_ns


# getting labels
    d = row['data_result']['cargo_space']['calculation_info']['density_percent'] / 100
    
# used quantiles
    qs = [0, 0.2, 0.4, 0.6, 0.8, 1]
    return {
#         "n": len(row['data_result']['boxes']),
# various aggregated statistics of sizes, volumes and other params
        "mean_width": w.mean(),
        "mean_height": h.mean(),
        "mean_length": l.mean(),
        "mean_volume": v.mean(),
        "std_width" : w.std(),
        "std_height" : h.std(),
        "std_length" : l.std(),
        "std_volume" : v.std(),
        "sum_width": w.sum(),
        "sum_height": h.sum(),
        "sum_length": l.sum(),
        "sum_volume": v.sum(),
        "mean_turnover": t.mean(),
        "mean_stacking": s.mean(),
        
        "mean_width_turnover": w_t.mean(),
        "mean_height_turnover": h_t.mean(),
        "mean_length_turnover": l_t.mean(),
        "mean_volume_turnover": v_t.mean(),
        "std_width_turnover" : w_t.std(),
        "std_height_turnover" : h_t.std(),
        "std_length_turnover" : l_t.std(),
        "std_volume_turnover" : v_t.std(),
        "sum_width_turnover": w_t.sum(),
        "sum_height_turnover": h_t.sum(),
        "sum_length_turnover": l_t.sum(),
        "sum_volume_turnover": v_t.sum(),
        
        "mean_width_no_turnover": w_nt.mean(),
        "mean_height_no_turnover": h_nt.mean(),
        "mean_length_no_turnover": l_nt.mean(),
        "mean_volume_no_turnover": v_nt.mean(),
        "std_width_no_turnover" : w_nt.std(),
        "std_height_no_turnover" : h_nt.std(),
        "std_length_no_turnover" : l_nt.std(),
        "std_volume_no_turnover" : v_nt.std(),
        "sum_width_no_turnover": w_nt.sum(),
        "sum_height_no_turnover": h_nt.sum(),
        "sum_length_no_turnover": l_nt.sum(),
        "sum_volume_no_turnover": v_nt.sum(),
        
        "mean_width_stacking": w_s.mean(),
        "mean_height_stacking": h_s.mean(),
        "mean_length_stacking": l_s.mean(),
        "mean_volume_stacking": v_s.mean(),        
        "std_width_stacking" : w_s.std(),
        "std_height_stacking" : h_s.std(),
        "std_length_stacking" : l_s.std(),
        "std_volume_stacking" : v_s.std(),
        "sum_width_stacking": w_s.sum(),
        "sum_height_stacking": h_s.sum(),
        "sum_length_stacking": l_s.sum(),
        "sum_volume_stacking": v_s.sum(),
        
        "mean_width_no_stacking": w_ns.mean(),
        "mean_height_no_stacking": h_ns.mean(),
        "mean_length_no_stacking": l_ns.mean(),
        "mean_volume_no_stacking": v_ns.mean(),
        "std_width_no_stacking" : w_ns.std(),
        "std_height_no_stacking" : h_ns.std(),
        "std_length_no_stacking" : l_ns.std(),
        "std_volume_no_stacking" : v_ns.std(),
        "sum_width_no_stacking": w_ns.sum(),
        "sum_height_no_stacking": h_ns.sum(),
        "sum_length_no_stacking": l_ns.sum(),
        "sum_volume_no_stacking": v_ns.sum(),
        **{
            "width_p{}".format(int(q * 100)): np.quantile(w, q) for q in qs
        },
        **{
            "heigth_p{}".format(int(q * 100)): np.quantile(h, q) for q in qs
        },
        **{
            "length_p{}".format(int(q * 100)): np.quantile(l, q) for q in qs
        },
        **{
            "volume_p{}".format(int(q * 100)): np.quantile(v, q) for q in qs
        },
        **{
            "width_turning_p{}".format(int(q * 100)): -1 if len(w_t) == 0 else np.quantile(w_t, q) for q in qs
        },
        **{
            "heigth_turning_p{}".format(int(q * 100)): -1 if len(h_t) == 0 else np.quantile(h_t, q) for q in qs
        },
        **{
            "length_turning_p{}".format(int(q * 100)): -1 if len(l_t) == 0 else np.quantile(l_t, q) for q in qs
        },
        **{
            "volume_turning_p{}".format(int(q * 100)): -1 if len(v_t) == 0 else np.quantile(v_t, q) for q in qs
        },
        **{
            "width_stacking_p{}".format(int(q * 100)): -1 if len(w_s) == 0 else np.quantile(w_s, q) for q in qs
        },
        **{
            "heigth_stacking_p{}".format(int(q * 100)): -1 if len(h_s) == 0 else np.quantile(h_s, q) for q in qs
        },
        **{
            "length_stacking_p{}".format(int(q * 100)): -1 if len(l_s) == 0 else np.quantile(l_s, q) for q in qs
        },
        **{
            "volume_stacking_p{}".format(int(q * 100)): -1 if len(v_s) == 0 else np.quantile(v_s, q) for q in qs
        },
#         "loading_width": lw,
#         "loading_height": lh,
#         "loading_length": ll,
        
# labels
        "density_percent": d,
    }

In [8]:
def distrib(x):
# used for bootstrapping
    return np.random.normal(x/50)

In [9]:
def bootstrap(row):
# used for increasing dataset size
    row['data_result']['cargo_space']['loading_size']['width'] += distrib(row['data_result']['cargo_space']['loading_size']['width'])
    row['data_result']['cargo_space']['loading_size']['height'] += distrib(row['data_result']['cargo_space']['loading_size']['height'])
    row['data_result']['cargo_space']['loading_size']['length'] += distrib(row['data_result']['cargo_space']['loading_size']['length'])
    
#     for i in range(len(row['data_result']['boxes'])):
#         row['data_result']['boxes'][i]['size']['width'] += distrib(row['data_result']['boxes'][i]['size']['width'])
#         row['data_result']['boxes'][i]['size']['height'] += distrib(row['data_result']['boxes'][i]['size']['height'])
#         row['data_result']['boxes'][i]['size']['length'] += distrib(row['data_result']['boxes'][i]['size']['length'])
    
    return row
    

In [10]:
import os

folder_path = "./ALGORITM"

file_list = os.listdir(folder_path)

data = []
test_data = []
test_size = 0.4

# reading all json files and putting info into csv with pandas
for file in file_list:
    if file.endswith('.json'):
        path = os.path.join(folder_path, file)
        with open(path, 'r', encoding='utf-8') as f:
            raw = json.load(f)
            if np.random.random() > test_size:
                # for train
                data.append(data_preparation(bootstrap(raw)))
                data.append(data_preparation(raw))
            else:
                # for test
                test_data.append(data_preparation(raw))

df = pd.DataFrame(data)
# fill empty
df[df.isna()] = -1
df.to_csv("data_custom_train.csv", index=False)
df_test = pd.DataFrame(test_data)
# fill empty
df_test[df_test.isna()] = -1
df_test.to_csv("data_custom_test.csv", index=False)
df.head()

Unnamed: 0,mean_width,mean_height,mean_length,mean_volume,std_width,std_height,std_length,std_volume,sum_width,sum_height,...,length_stacking_p60,length_stacking_p80,length_stacking_p100,volume_stacking_p0,volume_stacking_p20,volume_stacking_p40,volume_stacking_p60,volume_stacking_p80,volume_stacking_p100,density_percent
0,0.160545,0.08626,0.175225,0.002852,0.055125,0.02278,0.046782,0.002357,43.347088,23.290174,...,0.186169,0.212788,0.32498,0.000589,0.00091,0.001289,0.003574,0.003691,0.015735,0.817642
1,0.160545,0.08626,0.175225,0.002852,0.055125,0.02278,0.046782,0.002357,43.347088,23.290174,...,0.186169,0.212788,0.32498,0.000589,0.00091,0.001289,0.003574,0.003691,0.015735,0.817642
2,0.313951,0.723633,0.156861,0.035636,0.0,0.0,0.0,0.0,5.337171,12.301762,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.944444
3,0.313951,0.723633,0.156861,0.035636,0.0,0.0,0.0,0.0,5.337171,12.301762,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.944444
4,0.341128,0.160328,0.339351,0.019207,0.050023,0.02732,0.059937,0.007002,11.257229,5.290818,...,0.343314,0.408707,0.408707,0.009513,0.009989,0.01873,0.020335,0.026757,0.026757,0.740278


In [11]:
df.head()

Unnamed: 0,mean_width,mean_height,mean_length,mean_volume,std_width,std_height,std_length,std_volume,sum_width,sum_height,...,length_stacking_p60,length_stacking_p80,length_stacking_p100,volume_stacking_p0,volume_stacking_p20,volume_stacking_p40,volume_stacking_p60,volume_stacking_p80,volume_stacking_p100,density_percent
0,0.160545,0.08626,0.175225,0.002852,0.055125,0.02278,0.046782,0.002357,43.347088,23.290174,...,0.186169,0.212788,0.32498,0.000589,0.00091,0.001289,0.003574,0.003691,0.015735,0.817642
1,0.160545,0.08626,0.175225,0.002852,0.055125,0.02278,0.046782,0.002357,43.347088,23.290174,...,0.186169,0.212788,0.32498,0.000589,0.00091,0.001289,0.003574,0.003691,0.015735,0.817642
2,0.313951,0.723633,0.156861,0.035636,0.0,0.0,0.0,0.0,5.337171,12.301762,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.944444
3,0.313951,0.723633,0.156861,0.035636,0.0,0.0,0.0,0.0,5.337171,12.301762,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.944444
4,0.341128,0.160328,0.339351,0.019207,0.050023,0.02732,0.059937,0.007002,11.257229,5.290818,...,0.343314,0.408707,0.408707,0.009513,0.009989,0.01873,0.020335,0.026757,0.026757,0.740278
