In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import seaborn as sns

In [2]:
# NOTE(review): absolute, machine-specific path. The relative paths used in the
# cells below (e.g. 'data/') resolve against this directory, so adjust it before
# running the notebook on another machine.
%cd /home/stepan/kaggle/cars_from_auction

/home/stepan/kaggle/cars_from_auction


In [3]:
%ls

[0m[01;34mdata[0m/  [01;34mmodels[0m/  [01;34mprepare[0m/  [01;34mresult[0m/  [01;34mreview[0m/  [01;34mtest[0m/  [01;34mtrain[0m/


In [4]:
%ls data/

[0m[01;34mprocessed[0m/  sample_submission.csv  test.csv  train.csv  zipcodes.csv


In [5]:
# Directory holding the raw CSV files, relative to the current working directory.
data_path = 'data/'
# Name of the CSV to process; the same name is reused when the transformed copy
# is written to data_path + 'processed/transformed_' + data_file.
data_file = 'test.csv'

In [6]:
# Load the raw CSV; index_col=0 turns the first column of the file into the row index.
data = pd.read_csv(data_path + data_file, index_col=0)

In [7]:
# Display the raw frame as loaded, before any cleaning.
# NOTE(review): this renders the whole frame (truncated by pandas); consider
# data.head() to keep the saved output small.
data

Unnamed: 0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price
60314,1.6,small car,2013,manual,136,swift,40000,gasoline,suzuki,0.0,30449,490.0
12566,,coupé,2004,auto,333,6er,150000,gasoline,bmw,0.0,45307,670.0
17760,,station wagon,2006,auto,170,e_klasse,150000,diesel,mercedes_benz,0.0,59494,460.0
8876,,limousine,99,manual,101,astra,150000,gasoline,opel,,25524,
80392,,limousine,1975,manual,54,andere,150000,diesel,mercedes_benz,0.0,70794,1110.0
...,...,...,...,...,...,...,...,...,...,...,...,...
93878,1.4,limousine,1999,manual,86,corolla,150000,gasoline,toyota,0.0,44339,110.0
99783,,station wagon,2002,auto,184,3er,150000,diesel,bmw,0.0,47574,240.0
57399,,small car,2005,manual,52,fox,100000,gasoline,volkswagen,0.0,50389,60.0
97106,,bus,2001,manual,151,transporter,150000,diesel,volkswagen,0.0,12209,930.0


In [8]:
data.describe()

Unnamed: 0,engine_capacity,registration_year,power,mileage,damage,zipcode,insurance_price
count,20086.0,50000.0,50000.0,50000.0,41773.0,50000.0,42594.0
mean,1.869466,1838.6336,120.90786,125278.3,0.092931,51196.608,420.93323
std,0.820776,546.886892,187.784151,39509.609835,0.290339,25750.460527,727.179269
min,0.0,0.0,0.0,5000.0,0.0,1067.0,10.0
25%,1.4,1998.0,75.0,100000.0,0.0,30926.0,100.0
50%,1.8,2003.0,110.0,150000.0,0.0,50170.0,230.0
75%,2.0,2008.0,150.0,150000.0,0.0,71720.75,510.0
max,9.7,2016.0,16312.0,150000.0,1.0,99998.0,51990.0


# Transform

In [9]:
def transform_year(d):
    """Expand abbreviated two-digit registration years to four digits.

    Values 0..20 are read as 2000..2020 and values 21..99 as 1921..1999.
    Every other value is returned unchanged. That includes negative values
    and implausible ones between 100 and 1899: adding a century to those
    would only fabricate a year (1000 would become 2900).

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with a numeric ``registration_year`` column.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``d`` itself is left untouched.
    """
    d = d.copy()
    year = d['registration_year']

    # Build both masks before writing anything, so each update is decided on
    # the original values rather than on already shifted ones.
    abbreviated = (year >= 0) & (year < 100)
    this_century = abbreviated & (year <= 20)
    last_century = abbreviated & (year > 20)

    d.loc[this_century, 'registration_year'] += 2000
    d.loc[last_century, 'registration_year'] += 1900
    return d

In [10]:
def transform_power(d):
    """Mark implausible engine power readings as missing.

    Values below 25 or greater than or equal to 500 are replaced with NaN;
    values in the half-open range [25, 500) are kept as they are.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with a numeric ``power`` column. No other column is required.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``d`` itself is left untouched.
    """
    d = d.copy()
    power = d['power']

    implausible = (power < 25) | (power >= 500)
    # Series.mask builds a new Series and upcasts when needed, so an integer
    # column can receive NaN without an in-place incompatible-dtype assignment.
    d['power'] = power.mask(implausible)
    return d

In [11]:
def transform_capacity(d):
    """Return a copy of ``d`` with out-of-range engine capacities set to NaN.

    A capacity is kept only when it lies in the closed interval [0.8, 3.3];
    anything larger than 3.3 or smaller than 0.8 becomes NaN. The input frame
    is not modified.
    """
    cleaned = d.copy()
    capacity = cleaned['engine_capacity']

    too_large = capacity > 3.3
    too_small = capacity < 0.8
    cleaned.loc[too_large | too_small, 'engine_capacity'] = np.nan
    return cleaned

In [12]:
def transform_price(d):
    """Replace price columns with their natural logarithm.

    ``insurance_price`` is always transformed (a missing column raises
    ``KeyError``); ``price`` is transformed only when the frame has it.
    Non-positive prices have no finite logarithm and are turned into NaN
    instead of ``-inf``/NaN with a runtime warning.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with a numeric ``insurance_price`` column and, optionally, a
        numeric ``price`` column.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``d`` itself is left untouched.
    """
    d = d.copy()

    columns = ['insurance_price']
    if 'price' in d:
        columns.append('price')

    for column in columns:
        values = d[column]
        # Assign the whole column instead of writing through .loc[:, column]:
        # the log is float-valued, and an in-place write into an integer
        # column is an incompatible-dtype setitem.
        d[column] = np.log(values.where(values > 0))
    return d

## Apply

In [13]:
# Cleaning steps, listed in the order they are applied. Each one takes a
# DataFrame and returns a modified copy.
transformers = [transform_year, transform_power, transform_capacity, transform_price]

In [14]:
# Feed the raw frame through every transformer in list order. Each transformer
# works on its own copy, so `data` keeps the values that were loaded.
new_data = data
for t in transformers:
    new_data = new_data.pipe(t)

In [15]:
new_data.describe()

Unnamed: 0,engine_capacity,registration_year,power,mileage,damage,zipcode,insurance_price
count,19497.0,50000.0,45497.0,50000.0,41773.0,50000.0,42594.0
mean,1.778325,2003.7336,128.206277,125278.3,0.092931,51196.608,5.479055
std,0.477522,7.299557,59.607414,39509.609835,0.290339,25750.460527,1.042473
min,0.8,1945.0,25.0,5000.0,0.0,1067.0,2.302585
25%,1.4,2000.0,85.0,100000.0,0.0,30926.0,4.60517
50%,1.8,2004.0,116.0,150000.0,0.0,50170.0,5.438079
75%,2.0,2008.0,155.0,150000.0,0.0,71720.75,6.234411
max,3.3,2016.0,495.0,150000.0,1.0,99998.0,10.858807


In [16]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 60314 to 478
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   engine_capacity    19497 non-null  float64
 1   type               43834 non-null  object 
 2   registration_year  50000 non-null  int64  
 3   gearbox            48021 non-null  object 
 4   power              45497 non-null  float64
 5   model              47671 non-null  object 
 6   mileage            50000 non-null  int64  
 7   fuel               46408 non-null  object 
 8   brand              50000 non-null  object 
 9   damage             41773 non-null  float64
 10  zipcode            50000 non-null  int64  
 11  insurance_price    42594 non-null  float64
dtypes: float64(4), int64(3), object(5)
memory usage: 5.0+ MB


In [17]:
# Save the transformed frame next to the raw data, under
# data_path + 'processed/', with 'transformed_' prefixed to the original file name.
# to_csv does not create missing directories, so 'processed/' must already exist.
new_data.to_csv(data_path + 'processed/transformed_' + data_file)