In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
def data_preparation(row):
# getting loading_* params
    lw = row['data_result']['cargo_space']['loading_size']['width']
    lh = row['data_result']['cargo_space']['loading_size']['height']
    ll = row['data_result']['cargo_space']['loading_size']['length']
    
# normalizing box size params
    w = np.array([x['size']['width'] for x in row['data_result']['boxes']]) / lw
    h = np.array([x['size']['height'] for x in row['data_result']['boxes']]) / lh
    l = np.array([x['size']['length'] for x in row['data_result']['boxes']]) / ll
    
    # getting volume
    v = w * h * l
#     ids = np.array([x['id'] for x in row['data_result']['boxes']])

# getting other params
    t = np.array([int(x['turnover']) for x in row['data_result']['boxes']])
    s = np.array([int(x['stacking']) for x in row['data_result']['boxes']])
    
# getting labels
    d = row['data_result']['cargo_space']['calculation_info']['density_percent'] / 100
    
# used quantiles
    qs = [0, 0.2, 0.4, 0.6, 0.8, 1]
    return {
#         "n": len(row['data_result']['boxes']),
# various aggregated statistics of sizes, volumes and other params
        "mean_width": w.mean(),
        "mean_height": h.mean(),
        "mean_length": l.mean(),
        "mean_volume": v.mean(),
        "sum_width": w.sum(),
        "sum_height": h.sum(),
        "sum_length": l.sum(),
        "sum_volume": v.sum(),
        "mean_turnover": t.mean(),
        "mean_stacking": s.mean(),
        **{
            "width_p{}".format(int(q * 100)): np.quantile(w, q) for q in qs
        },
        **{
            "heigth_p{}".format(int(q * 100)): np.quantile(h, q) for q in qs
        },
        **{
            "length_p{}".format(int(q * 100)): np.quantile(l, q) for q in qs
        },
        **{
            "volume_p{}".format(int(q * 100)): np.quantile(v, q) for q in qs
        },
#         "loading_width": lw,
#         "loading_height": lh,
#         "loading_length": ll,
        
# labels
        "density_percent": d,
    }

In [3]:
def distrib(x):
# used for bootstrapping
    return np.random.normal(x / 50)

In [4]:
def bootstrap(row):
# used for increasing dataset size
    row['data_result']['cargo_space']['loading_size']['width'] += distrib(row['data_result']['cargo_space']['loading_size']['width'])
    row['data_result']['cargo_space']['loading_size']['height'] += distrib(row['data_result']['cargo_space']['loading_size']['height'])
    row['data_result']['cargo_space']['loading_size']['length'] += distrib(row['data_result']['cargo_space']['loading_size']['length'])
    
#     for i in range(len(row['data_result']['boxes'])):
#         row['data_result']['boxes'][i]['size']['width'] += distrib(row['data_result']['boxes'][i]['size']['width'])
#         row['data_result']['boxes'][i]['size']['height'] += distrib(row['data_result']['boxes'][i]['size']['height'])
#         row['data_result']['boxes'][i]['size']['length'] += distrib(row['data_result']['boxes'][i]['size']['length'])
    
    return row
    

In [5]:
import os

folder_path = "./ALGORITM"

file_list = os.listdir(folder_path)

data = []
test_data = []
test_size = 0.4

# reading all json files and putting info into csv with pandas
for file in file_list:
    if file.endswith('.json'):
        path = os.path.join(folder_path, file)
        with open(path, 'r', encoding='utf-8') as f:
            raw = json.load(f)
            if np.random.random() > test_size:
                # for train
                data.append(data_preparation(bootstrap(raw)))
                data.append(data_preparation(bootstrap(raw)))
                data.append(data_preparation(bootstrap(raw)))
                data.append(data_preparation(bootstrap(raw)))
                data.append(data_preparation(bootstrap(raw)))
            else:
                # for test
                test_data.append(data_preparation(raw))
df = pd.DataFrame(data)
df.to_csv("data_train.csv", index=False)
df_test = pd.DataFrame(test_data)
df_test.to_csv("data_test.csv", index=False)
df.head()

Unnamed: 0,mean_width,mean_height,mean_length,mean_volume,sum_width,sum_height,sum_length,sum_volume,mean_turnover,mean_stacking,...,length_p60,length_p80,length_p100,volume_p0,volume_p20,volume_p40,volume_p60,volume_p80,volume_p100,density_percent
0,0.160555,0.086225,0.175294,0.002852,43.349719,23.280809,47.329344,0.769962,1.0,1.0,...,0.186243,0.212872,0.325108,0.000589,0.00091,0.001289,0.003575,0.003691,0.015736,0.817642
1,0.15747,0.084525,0.171916,0.002689,42.516809,22.821659,46.417321,0.72601,1.0,1.0,...,0.182654,0.20877,0.318843,0.000555,0.000858,0.001215,0.003371,0.00348,0.014838,0.817642
2,0.154556,0.082953,0.168494,0.002539,41.730026,22.397271,45.493461,0.685405,1.0,1.0,...,0.179018,0.204615,0.312497,0.000524,0.00081,0.001147,0.003182,0.003286,0.014008,0.817642
3,0.151432,0.081372,0.165293,0.002393,40.886747,21.970381,44.62912,0.646239,1.0,1.0,...,0.175617,0.200727,0.30656,0.000494,0.000763,0.001082,0.003,0.003098,0.013207,0.817642
4,0.14831,0.079753,0.162191,0.002254,40.043834,21.533418,43.791495,0.608686,1.0,1.0,...,0.172321,0.19696,0.300806,0.000466,0.000719,0.001019,0.002826,0.002918,0.01244,0.817642


In [6]:
df.head()

Unnamed: 0,mean_width,mean_height,mean_length,mean_volume,sum_width,sum_height,sum_length,sum_volume,mean_turnover,mean_stacking,...,length_p60,length_p80,length_p100,volume_p0,volume_p20,volume_p40,volume_p60,volume_p80,volume_p100,density_percent
0,0.160555,0.086225,0.175294,0.002852,43.349719,23.280809,47.329344,0.769962,1.0,1.0,...,0.186243,0.212872,0.325108,0.000589,0.00091,0.001289,0.003575,0.003691,0.015736,0.817642
1,0.15747,0.084525,0.171916,0.002689,42.516809,22.821659,46.417321,0.72601,1.0,1.0,...,0.182654,0.20877,0.318843,0.000555,0.000858,0.001215,0.003371,0.00348,0.014838,0.817642
2,0.154556,0.082953,0.168494,0.002539,41.730026,22.397271,45.493461,0.685405,1.0,1.0,...,0.179018,0.204615,0.312497,0.000524,0.00081,0.001147,0.003182,0.003286,0.014008,0.817642
3,0.151432,0.081372,0.165293,0.002393,40.886747,21.970381,44.62912,0.646239,1.0,1.0,...,0.175617,0.200727,0.30656,0.000494,0.000763,0.001082,0.003,0.003098,0.013207,0.817642
4,0.14831,0.079753,0.162191,0.002254,40.043834,21.533418,43.791495,0.608686,1.0,1.0,...,0.172321,0.19696,0.300806,0.000466,0.000719,0.001019,0.002826,0.002918,0.01244,0.817642


In [7]:
df_test.head()

Unnamed: 0,mean_width,mean_height,mean_length,mean_volume,sum_width,sum_height,sum_length,sum_volume,mean_turnover,mean_stacking,...,length_p60,length_p80,length_p100,volume_p0,volume_p20,volume_p40,volume_p60,volume_p80,volume_p100,density_percent
0,0.375,0.125,0.333333,0.015625,12.375,4.125,11.0,0.515625,0.0,1.0,...,0.333333,0.333333,0.333333,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.825
1,0.32,0.738095,0.16,0.03779,5.44,12.547619,2.72,0.642438,0.0,0.0,...,0.16,0.16,0.16,0.03779,0.03779,0.03779,0.03779,0.03779,0.03779,0.944444
2,0.070565,0.07334,0.181099,0.000935,68.165323,70.846154,174.941176,0.902738,0.0,1.0,...,0.183824,0.220588,0.220588,0.00059,0.00059,0.000998,0.000998,0.001361,0.001361,0.922083
3,0.669048,0.638857,0.170492,0.063761,4.683333,4.472,1.193443,0.446328,0.0,0.428571,...,0.162295,0.214754,0.227869,0.033892,0.044594,0.057519,0.076906,0.083156,0.084718,0.557213
4,0.669048,0.638857,0.170492,0.063761,4.683333,4.472,1.193443,0.446328,0.0,0.428571,...,0.162295,0.214754,0.227869,0.033892,0.044594,0.057519,0.076906,0.083156,0.084718,0.557213
