In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Each source file is read as JSON Lines (one record per line), hence lines=True.
# Forward slashes are used as the separator: they work on every OS, whereas a
# backslash inside a normal string literal starts an escape sequence ("\e" is
# an invalid one) and is not a path separator outside Windows.
ebay_mtb = pd.read_json('data/ebay_mtb.json', lines=True)
ebay_road = pd.read_json('data/ebay_road.json', lines=True)
exch_mtb = pd.read_json('data/exch_mtb.json', lines=True)
exch_road = pd.read_json('data/exch_road.json', lines=True)

In [None]:
# Stack the ebay_* and exch_* frames row-wise for each bike category and
# renumber the rows 0..n-1 so the combined frames have a clean index.
road = pd.concat([ebay_road, exch_road], ignore_index=True)
mtb = pd.concat([ebay_mtb, exch_mtb], ignore_index=True)

In [None]:
from sklearn.preprocessing import QuantileTransformer
def preprocess_years(df: pd.DataFrame, year_col_name = 'Production_year'):
    """Add a ``year_num`` column with the quantile-transformed production year.

    The non-null values of ``df[year_col_name]`` are mapped to a uniform
    distribution on [0, 1] with scikit-learn's ``QuantileTransformer`` and
    written to ``df['year_num']``. Rows whose year is missing get NaN.

    Args:
        df: Frame to annotate. It is modified in place (a ``year_num`` column
            is created or overwritten).
        year_col_name: Name of the column that holds the year values.

    Returns:
        None. The result is stored in ``df['year_num']``.
    """
    # QuantileTransformer rejects n_quantiles larger than its ``subsample``
    # setting (10000 by default), so the quantile count is capped there.
    max_quantiles = 10000

    # Start from an all-NaN float column; rows without a year keep this value.
    df['year_num'] = float('nan')

    # A boolean mask (rather than index labels) keeps the selection and the
    # write-back aligned even if the frame has duplicate index labels.
    has_year = df[year_col_name].notna()
    if not has_year.any():
        # Nothing to fit on: the transformer cannot be fitted on zero samples.
        return

    # The transformer expects a 2-D array of shape (n_samples, n_features).
    years = df.loc[has_year, year_col_name].to_numpy().reshape(-1, 1)
    qt = QuantileTransformer(
        n_quantiles=min(len(years), max_quantiles),
        output_distribution='uniform',
    )
    df.loc[has_year, 'year_num'] = qt.fit_transform(years).flatten()

# Annotate both combined frames in place with the transformed year column.
preprocess_years(mtb, year_col_name='Production_year')
preprocess_years(road, year_col_name='Production_year')

In [None]:
# DataFrame.drop returns a new frame; nothing is assigned here, so `mtb` and
# `road` themselves keep their 'Production_year' column (the histogram cell
# further down still reads it). Only the last expression is shown as output.
mtb.drop(['Production_year'], axis='columns')
road.drop(['Production_year'], axis='columns')

In [None]:
# 2x2 grid of histograms: one row per frame (mtb on top, road below), with the
# raw 'Production_year' on the left and the transformed 'year_num' on the right.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))

mtb['Production_year'].plot.hist(bins=50, ax=axs[0, 0], title='MTB')
mtb['year_num'].plot.hist(bins=50, ax=axs[0, 1], title='MTB')

road['Production_year'].plot.hist(bins=50, ax=axs[1, 0], title='Road')
road['year_num'].plot.hist(bins=50, ax=axs[1, 1], title='Road')

# Prevent titles and tick labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

In [121]:
# Display the combined `mtb` frame as the cell's rich output.
mtb

Unnamed: 0,ID,Price,Production_year,isNew,suspension_num,material_num,groupset_num,e_shift_num,wheel_num,brake_num,year_num
0,1396,650.0,2007,0,0.5,0.30,0.4,0,1.00,0.5,0.082290
1,2636,700.0,2018,1,0.5,0.30,0.4,0,1.00,1.0,0.451252
2,2842,1200.0,2019,1,1.0,0.50,0.9,0,0.66,1.0,0.712880
3,2848,595.0,2018,0,0.5,0.50,0.9,0,1.00,1.0,0.451252
4,2849,500.0,2002,0,1.0,0.50,0.6,0,0.33,1.0,0.068873
...,...,...,...,...,...,...,...,...,...,...,...
1114,159390918,540.0,2019,1,0.5,0.45,0.3,0,1.00,1.0,0.712880
1115,136769724,4300.0,2019,1,0.5,0.45,0.7,0,1.00,1.0,0.712880
1116,146496026,1799.0,2018,1,0.5,0.45,0.6,0,0.66,1.0,0.451252
1117,165760938,1050.0,2020,1,0.5,0.45,0.5,0,0.66,1.0,1.000000


In [122]:
# Display the combined `road` frame as the cell's rich output.
road

Unnamed: 0,ID,Price,Production_year,isNew,material_num,groupset_num,e_shift_num,brake_num,year_num
0,4,929.35,2018,0,0.50,0.9,0,0.5,0.488597
1,7,3950.00,2019,0,0.90,0.8,1,1.0,0.720310
2,10,469.85,2000,0,0.50,0.6,0,0.5,0.058520
3,13,1199.00,2019,1,0.50,0.8,0,0.5,0.720310
4,14,910.00,2018,0,0.50,1.0,0,0.5,0.488597
...,...,...,...,...,...,...,...,...,...
2320,113790485,3999.99,2016,1,0.75,0.8,0,0.5,0.212780
2321,177850247,8000.00,2020,1,0.90,0.8,1,1.0,1.000000
2322,146255978,2200.00,2019,1,0.90,0.7,0,0.5,0.720310
2323,102758886,2099.99,2016,1,0.90,0.7,0,0.5,0.212780
