## Import bibliotek i podstawowa konfiguracja

In [None]:
import pandas as pd
import numpy as np

import sys
import re

from sklearn.metrics import mean_absolute_error as mae

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

import xgbfir

import matplotlib.pyplot as plt
import seaborn as sns
import brewer2mpl

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from functools import partial

%matplotlib inline

# Fix the NumPy RNG seed so repeated runs are reproducible.
np.random.seed(2018)

# Show up to 200 rows / columns when a DataFrame is displayed.  The fully
# qualified option names are used because the bare 'max_rows' / 'max_columns'
# patterns are ambiguous (and raise OptionError) on pandas versions that also
# define the 'styler.render.max_*' options.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

## Mapa miast i województw

In [None]:
# Build a lookup from lower-cased city name to the name of its province.
voivodeships = pd.read_csv('./default/provinces.csv')
cities = pd.read_csv('./default/places.csv').merge(voivodeships, on='province_id')
cities['city_name'] = [name.lower() for name in cities['city_name']]

city_to_voivodeship_map = {
    city: province
    for city, province in zip(cities['city_name'], cities['province_name'])
}

## Dane

In [None]:
# Load the raw train / test frames once and keep the originals untouched;
# all later processing works on copies so cells can be re-run safely.
original_train = pd.read_hdf('./input/train.car_price.h5')
train = original_train.copy()

original_test = pd.read_hdf('./input/test.car_price.h5')
test = original_test.copy()

![](../images/dummy_benchmark.png)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the training frame.
train.info()

## Funkcje pomocnicze do serializacji i normalizacji danych

In [None]:
def num_filter(val):
    """Return only the ASCII digits of ``val``, joined in their original order."""
    return ''.join(re.findall('[0-9]', val))

def float_filter(val):
    """Strip everything except digits and decimal separators from ``val``.

    Commas are converted to dots so the result uses '.' as the decimal
    separator.  The result is still a string; the caller converts it.
    """
    # Raw string: the previous pattern relied on the invalid escape
    # sequences '\,' and '\.' inside a normal string literal.
    return re.sub(r'[^0-9,.]', '', val).replace(',', '.')

def str_filter(val):
    """Lower-case ``val`` and remove leading / trailing whitespace."""
    lowered = val.lower()
    return lowered.strip()

def extract_city(address):
    """Extract a normalised (lower-case, stripped) city name from an address.

    The city is taken from the part of ``address`` before the first comma.
    Returns ``None`` when ``address`` is ``None``.
    """
    if address is None:
        return None

    city = address.split(',')[0]

    if ' - ' in city:
        # Keep what follows the last ' - ' and drop its first 7 characters
        # (presumably a "NN-NNN " postal code prefix -- TODO confirm).
        city = city.split(' - ')[-1][7:]

    return city.lower().strip()

def norm_date(value):
    """Convert a '[day] month-name year' string into 'day/month/year' or 'month/year'.

    Month names may be Polish or English (case-insensitive).  ``None`` is
    returned unchanged.  The day is only kept when the input consists of
    exactly three space-separated tokens.
    """
    if value is None:
        return value

    polish_months = (
        'styczeń', 'luty', 'marzec', 'kwiecień', 'maj', 'czerwiec',
        'lipiec', 'sierpień', 'wrzesień', 'październik', 'listopad', 'grudzień',
    )
    english_months = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    month_numbers = {}
    for names in (polish_months, english_months):
        for number, name in enumerate(names, start=1):
            month_numbers[name] = number

    parts = value.split(' ')
    month_name = parts[-2].lower()
    year = parts[-1]
    month_number = month_numbers[month_name]

    if len(parts) != 3:
        return '{0}/{1}'.format(month_number, year)

    return '{0}/{1}/{2}'.format(parts[0], month_number, year)

def merge_labels(df, label0, label1, label, transform=(lambda x: x), unknown_value=None, del_feat=True):
    """Merge two alternative source columns into a single column, in place.

    For every row the value of ``label0`` is used when it is not ``None``,
    otherwise the value of ``label1``; the chosen value is passed through
    ``transform``.  Rows where both are ``None`` get ``unknown_value``
    (which is NOT transformed).

    Args:
        df: DataFrame to modify.
        label0: Preferred source column.
        label1: Fallback source column.
        label: Name of the resulting column.
        transform: Callable applied to the selected non-``None`` value.
        unknown_value: Value stored when both sources are ``None``.
        del_feat: When true, both source columns are deleted afterwards.

    Returns:
        The same (mutated) ``df``.
    """
    merged = []
    for first, second in zip(df[label0], df[label1]):
        # Identity checks: comparing against None with != is unreliable for
        # objects that overload comparison operators.
        if first is not None:
            merged.append(transform(first))
        elif second is not None:
            merged.append(transform(second))
        else:
            merged.append(unknown_value)
    df[label] = merged

    if del_feat:
        del df[label0]
        del df[label1]

    return df

## Funkcje pomocnicze do kategoryzacji

In [None]:
def categorize_feature(df, feat, feat_cat, indexers, del_feat=True, average_func=np.median, unknown_cat_name='unknown', unknown_cat_val=-1):
    """Encode the categorical column ``feat`` as the numeric column ``feat_cat``.

    When ``indexers`` already holds a mapping for ``feat`` that mapping is
    applied (test mode).  Otherwise a new mapping is learned from ``df``
    (train mode, needs a 'price_value' column) and stored in
    ``indexers[feat]``.  ``df`` and ``indexers`` are modified in place.
    """
    if feat in indexers:
        return categorize_feature_for_test(df, feat, feat_cat, indexers[feat], del_feat=del_feat, unknown_cat_name=unknown_cat_name, unknown_cat_val=unknown_cat_val)

    categorize_feature_for_train(df, feat, feat_cat, indexers, del_feat=del_feat, average_func=average_func, unknown_cat_name=unknown_cat_name)
    
def categorize_feature_for_test(df, feat, feat_cat, categories_map, del_feat=True, unknown_cat_name='unknown', unknown_cat_val=-1):
    """Apply a previously learned ``categories_map`` to column ``feat``.

    Values missing from the map receive the code of ``unknown_cat_name``
    when the map has one, otherwise ``unknown_cat_val``.  The result is
    written to ``df[feat_cat]``; ``feat`` is deleted when ``del_feat``.
    """
    if unknown_cat_name in categories_map:
        fallback = categories_map[unknown_cat_name]
    else:
        fallback = unknown_cat_val

    df[feat_cat] = [categories_map.get(value, fallback) for value in df[feat]]

    if del_feat:
        del df[feat]
    
def categorize_feature_for_train(df, feat, feat_cat, indexers, del_feat=True, average_func=np.median, unknown_cat_name='unknown'):
    """Learn a price-ordered encoding for ``feat`` and apply it to ``df``.

    Null values are first replaced by ``unknown_cat_name``.  Every category
    is then ranked by ``average_func`` of its 'price_value' (the unknown
    category uses the average over all rows) and the rank becomes its code.
    The mapping is stored in ``indexers[feat]``; ``df`` is modified in place.
    """
    df.loc[df[feat].isnull(), feat] = unknown_cat_name

    overall_average = average_func(df['price_value'])

    def category_average(category):
        if category == unknown_cat_name:
            return overall_average
        return average_func(df.loc[df[feat] == category, 'price_value'])

    ranked = sorted(
        ((category, category_average(category)) for category in set(df[feat])),
        key=lambda pair: pair[1],
    )
    categories_map = {category: rank for rank, (category, _) in enumerate(ranked)}

    for category, rank in categories_map.items():
        df.loc[df[feat] == category, feat_cat] = rank

    indexers[feat] = categories_map

    if del_feat:
        del df[feat]

In [None]:
# Frequency of each value in the derived 'invoice_vat' column.
# NOTE(review): `prepared_train` is only assigned in the later data-preparation
# cell, so this cell fails when the notebook is run top to bottom.
prepared_train['invoice_vat'].value_counts()

## Funkcja do przygotowania danych

In [None]:
def _encode_with_rare_buckets(df, feat, feat_cat, indexers, min_count, bucket_names, compute_thresholds, protect_unknown):
    """Price-rank encode ``feat``, pooling rare categories into price buckets.

    Train mode (``feat`` not in ``indexers``): categories with fewer than
    ``min_count`` rows are relabelled (in ``df[feat]``) to one of
    ``bucket_names`` according to where their median 'price_value' falls
    relative to the ascending ``compute_thresholds(prices)``.  All remaining
    labels are ranked by median price ('unknown' uses the overall median) and
    the rank is written to ``df[feat_cat]``.  The learned state is stored as
    ``indexers[feat] = (categories, rare_list_0, rare_list_1, ...)``.

    Test mode: the stored state is applied; a value that is neither a known
    category nor a known rare value gets the code of 'unknown'.
    """
    if feat in indexers:
        categories, *rare_groups = indexers[feat]
        # Sets make the per-row membership test O(1) instead of O(len(list)).
        rare_lookup = [(name, set(members)) for name, members in zip(bucket_names, rare_groups)]

        def encode(value):
            if value in categories:
                return categories[value]
            for name, members in rare_lookup:
                if value in members:
                    return categories[name]
            return categories['unknown']

        df[feat_cat] = [encode(value) for value in df[feat]]
        return

    prices = df['price_value']
    overall_median = np.median(prices)
    thresholds = compute_thresholds(prices)
    rare_groups = [[] for _ in bucket_names]

    stats = {}
    for value in set(df[feat]):
        rows = df[df[feat] == value]
        stats[value] = (len(rows), np.median(rows['price_value']))

    for value, (count, median) in stats.items():
        if count >= min_count or (protect_unknown and value == 'unknown'):
            continue
        # First threshold the median is below; the last bucket otherwise.
        bucket = next((i for i, limit in enumerate(thresholds) if median < limit), len(thresholds))
        rare_groups[bucket].append(value)
        df.loc[df[feat] == value, feat] = bucket_names[bucket]

    ranked = []
    for value in set(df[feat]):
        if value == 'unknown':
            median = overall_median
        else:
            median = np.median(df.loc[df[feat] == value, 'price_value'])
        ranked.append((value, median))
    ranked.sort(key=lambda pair: pair[1])
    categories = {value: rank for rank, (value, _) in enumerate(ranked)}

    indexers[feat] = (categories, *rare_groups)
    df[feat_cat] = [categories.get(value, -1) for value in df[feat]]


def prepare_data(df, train_indexers=None):
    """Turn a raw listings frame into a model-ready frame (modifies ``df`` in place).

    Args:
        df: Raw DataFrame.  In train mode it must contain 'price_value'.
        train_indexers: ``None`` to learn the categorical encoders from ``df``
            (train mode), or the dict returned by an earlier train-mode call
            to re-apply the same encoders (test mode).

    Returns:
        ``(df, indexers)`` -- the mutated frame and the dict of encoders.
    """
    indexers = train_indexers if train_indexers is not None else {}

    feats_useless = ['price_details', 'param_vin', 'breadcrumb']
    feats_redundant = []

    def is_yes(x):
        return x == 'Tak'

    def merge_yes(label0, label1, target):
        merge_labels(df, label0, label1, target, transform=is_yes, unknown_value=False)

    def yes_flag(source, target):
        df[target] = df[source].map(is_yes)
        feats_redundant.append(source)

    # param_bezwypadkowy {'Tak' (66723), None (39771)}, param_no-accident {'Tak' (48), None (106446)} -> accident_free
    merge_yes('param_bezwypadkowy', 'param_no-accident', 'accident_free')

    # param_uszkodzony {'Tak' (4250), None (102244)}, param_damaged {'Tak' (4), None (106490)} -> damaged
    merge_yes('param_uszkodzony', 'param_damaged', 'damaged')

    # param_faktura-vat {'Tak' (26389), None (80105)} -> invoice_vat
    yes_flag('param_faktura-vat', 'invoice_vat')

    # param_perłowy {'Tak' (11493), None (95001)}, param_pearl {'Tak' (3), None (106491)} -> pearl_car_paint
    merge_yes('param_perłowy', 'param_pearl', 'pearl_car_paint')

    # param_homologacja-ciężarowa {'Tak' (764), None (105730)} -> truck
    yes_flag('param_homologacja-ciężarowa', 'truck')

    # param_service-record {'Tak' (40), None (106454)}, param_serwisowany-w-aso {'Tak' (52314), None (54180)} -> service_record
    merge_yes('param_service-record', 'param_serwisowany-w-aso', 'service_record')

    # param_metallic {'Tak' (40), None (106454)}, param_metalik {'Tak' (72520), 'metallic' (20) None (33954)} -> metallic_car_paint
    merge_labels(df, 'param_metallic', 'param_metalik', 'metallic_car_paint', transform=lambda x: x == 'Tak' or x == 'metallic', unknown_value=False)

    # param_leasing-concession {'Tak' (11), None (106483)} -> dropped, no feature is derived from it
    feats_redundant.append('param_leasing-concession')

    # param_financing-option {'Tak' (29), None (106465)}, param_możliwość-finansowania {'Tak' (35540), None (70954)} -> financing_option
    merge_yes('param_financing-option', 'param_możliwość-finansowania', 'financing_option')

    # param_original-owner {'Tak' (39), None (106455)} -> original_owner
    yes_flag('param_original-owner', 'original_owner')

    # param_vat-marża {'Tak' (39), None (106455)} -> vat_margin
    yes_flag('param_vat-marża', 'vat_margin')

    # param_kategoria {'Osobowe' (106385), None (109)} -> passenger_car
    df['passenger_car'] = df['param_kategoria'] == 'Osobowe'
    feats_redundant.append('param_kategoria')

    # param_leasing {'Tak' (11454), None (95040)} -> leasing
    yes_flag('param_leasing', 'leasing')

    # param_zarejestrowany-jako-zabytek {'Tak' (80), None (106414)} -> antique
    yes_flag('param_zarejestrowany-jako-zabytek', 'antique')

    # param_pierwszy-właściciel {'Tak' (47115), None (59379)} -> first_owner
    yes_flag('param_pierwszy-właściciel', 'first_owner')

    # param_vat-discount {'Tak' (21), None (106473)} -> vat_discount
    yes_flag('param_vat-discount', 'vat_discount')

    # param_particle-filter {'Tak' (7), None (106487)}, param_filtr-cząstek-stałych {'Tak' (6274), None (100220)} -> particle_filter
    merge_yes('param_particle-filter', 'param_filtr-cząstek-stałych', 'particle_filter')

    # param_zarejestrowany-w-polsce {'Tak' (49132), None (57362)}, param_registered-in-poland {'Tak' (39), None (106455)} -> registered_in_poland
    # (previously written to 'particle_filter', which silently overwrote that feature)
    merge_yes('param_zarejestrowany-w-polsce', 'param_registered-in-poland', 'registered_in_poland')

    # param_kierownica-po-prawej-(anglik) {'Tak' (793), None (105701)} -> right_hand_steering_wheel
    yes_flag('param_kierownica-po-prawej-(anglik)', 'right_hand_steering_wheel')

    # param_vat-free {'Tak' (25), None (106469)} -> vat_free
    yes_flag('param_vat-free', 'vat_free')

    # param_acrylic {'Tak' (4), None (106490)}, param_akryl-(niemetalizowany) {'Tak' (4172), 'metallic' (4) None (102318)} -> acrylic_car_paint
    merge_labels(df, 'param_akryl-(niemetalizowany)', 'param_acrylic', 'acrylic_car_paint', transform=lambda x: x == 'Tak' or x == 'acrylic', unknown_value=False)

    # param_tuning {'Tak' (583), None (105911)} -> tuning
    yes_flag('param_tuning', 'tuning')

    # param_matowy {'Tak' (319), None (106175)} -> mat_car_paint
    yes_flag('param_matowy', 'mat_car_paint')

    def to_int(x):
        return np.int64(num_filter(x))

    def lower(x):
        return x.lower()

    merge_labels(df, 'param_rok-produkcji', 'param_year', 'production_year_num', transform=to_int, unknown_value=-1)

    merge_labels(df, 'param_moc', 'param_engine-power', 'engine_power_num', transform=to_int, unknown_value=-1)

    merge_labels(df, 'param_napęd', 'param_transmission', 'transmission')
    categorize_feature(df, 'transmission', 'transmission_cat', indexers=indexers)

    merge_labels(df, 'param_skrzynia-biegów', 'param_gearbox', 'gearbox')
    categorize_feature(df, 'gearbox', 'gearbox_cat', indexers=indexers, average_func=np.mean)

    merge_labels(df, 'param_typ', 'param_body-type', 'body_type', transform=lower)
    categorize_feature(df, 'body_type', 'body_type_cat', indexers=indexers)

    merge_labels(df, 'param_marka-pojazdu', 'param_make', 'vehicle_brand', unknown_value='unknown')
    df.loc[df['vehicle_brand'] == 'Inny', 'vehicle_brand'] = 'unknown'

    merge_labels(df, 'param_model-pojazdu', 'param_model', 'vehicle_model', unknown_value='unknown')

    brand_buckets = ('Other Very Cheap', 'Other Cheap', 'Other Expensive', 'Other Very Expensive')

    # Brand + model combination; must be built before 'vehicle_brand' is
    # relabelled by its own encoder below.
    df['vehicle_brand_model'] = [
        brand + ' ' + model
        if brand is not None and brand != 'unknown' and model is not None and model != 'unknown'
        else 'unknown'
        for brand, model in zip(df['vehicle_brand'], df['vehicle_model'])
    ]
    _encode_with_rare_buckets(
        df, 'vehicle_brand_model', 'vehicle_brand_model_cat', indexers,
        min_count=5,
        bucket_names=brand_buckets,
        compute_thresholds=lambda prices: [np.percentile(prices, q) for q in (15, 50, 80)],
        protect_unknown=True,
    )
    feats_redundant.append('vehicle_brand_model')

    _encode_with_rare_buckets(
        df, 'vehicle_brand', 'vehicle_brand_cat', indexers,
        min_count=500,
        bucket_names=brand_buckets,
        compute_thresholds=lambda prices: [np.percentile(prices, q) for q in (30, 60, 90)],
        protect_unknown=True,
    )
    feats_redundant.append('vehicle_brand')

    categorize_feature(df, 'vehicle_model', 'vehicle_model_cat', indexers=indexers)

    merge_labels(df, 'param_version', 'param_wersja', 'version', transform=lower)
    categorize_feature(df, 'version', 'version_cat', indexers=indexers)

    merge_labels(df, 'param_kod-silnika', 'param_engine-code', 'engine_code', transform=lower)
    categorize_feature(df, 'engine_code', 'engine_code_cat', indexers=indexers)

    merge_labels(df, 'param_pojemność-skokowa', 'param_engine-capacity', 'engine_capacity', unknown_value='unknown')
    categorize_feature(df, 'engine_capacity', 'engine_capacity_cat', indexers=indexers)

    merge_labels(df, 'param_rodzaj-paliwa', 'param_fuel-type', 'param_fuel_type')
    categorize_feature(df, 'param_fuel_type', 'param_fuel_type_cat', indexers=indexers)

    # Country of origin: the two source columns are kept until the final
    # clean-up, so merge_labels (which deletes them immediately) is not used.
    s_feat0 = 'param_kraj-pochodzenia'
    s_feat1 = 'param_country-of-origin'
    df['country_of_origin'] = [
        a if a is not None else b if b is not None else 'unknown'
        for a, b in zip(df[s_feat0], df[s_feat1])
    ]
    _encode_with_rare_buckets(
        df, 'country_of_origin', 'country_of_origin_cat', indexers,
        min_count=500,
        bucket_names=('Other Cheap', 'Other Expensive'),
        compute_thresholds=lambda prices: [np.median(prices)],
        protect_unknown=False,
    )
    feats_redundant.append(s_feat0)
    feats_redundant.append(s_feat1)

    merge_labels(df, 'param_przebieg', 'param_mileage', 'mileage_num', transform=to_int, unknown_value=-1)
    # When mileage is missing and "engine power" exceeds 800, the power value
    # is used as the mileage and then cleared from the power column.
    df['mileage_num'] = [a if a > -1 else b if b > 800 else -1 for a, b in zip(df['mileage_num'], df['engine_power_num'])]
    df['engine_power_num'] = df['engine_power_num'].map(lambda x: -1 if x > 800 else x)
    # Bin the mileage itself (the engine power column was binned here before).
    df['mileage'] = pd.cut(df['mileage_num'], bins=[0, 5, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 1000000000], include_lowest=True, labels=['< 5k km', '5k-50k km', '50k-100k km', '100k-150k km', '150k-200k km', '200k-250k km', '250k-300k km', '300k-350k km', '> 350k km'])
    # Categorical codes follow the bin order (-1 = outside all bins), so train
    # and test get identical codes; pd.factorize numbered bins by order of
    # appearance, which differed between frames.  Cast to int64 so the column
    # is still picked up by the int/float/bool feature selection.
    df['mileage_cat'] = df['mileage'].cat.codes.astype(np.int64)

    max_mileage = 500001
    mileage_median = np.median(df['mileage_num'])
    df['mileage_num_reversed'] = df['mileage_num'].map(lambda x: mileage_median if x < 0 else max_mileage - x if x < max_mileage else 0)
    feats_redundant.append('mileage')

    merge_labels(df, 'param_kolor', 'param_color', 'color', transform=lower)
    categorize_feature(df, 'color', 'color_cat', indexers=indexers)

    merge_labels(df, 'param_emisja-co2', 'param_co2-emissions', 'co2_emissions', transform=lower)
    categorize_feature(df, 'co2_emissions', 'co2_emissions_cat', indexers=indexers)

    categorize_feature(df, 'seller_name', 'seller_name_cat', indexers=indexers)

    # seller_type ('Osoba prywatna' [41804], 'Dealer' [106385], None [109]), param_oferta-od ('Osoby prywatnej' [41808], 'Firmy' [64577], None [109]) -> private_seller
    df['private_seller'] = [a == 'Osoba prywatna' or b == 'Osoby prywatnej' for a, b in zip(df.seller_type, df['param_oferta-od'])]
    df['private_seller'] = df['private_seller'].astype(bool)
    df['dealer_seller'] = [a == 'Dealer' or b == 'Firmy' for a, b in zip(df.seller_type, df['param_oferta-od'])]
    df['dealer_seller'] = df['dealer_seller'].astype(bool)
    feats_redundant.extend(['seller_type', 'param_oferta-od'])

    df['car_state'] = [1 if x == 'Nowe' else 0 if x == 'Używane' else -1 for x in df['param_stan']]
    feats_redundant.append('param_stan')

    merge_labels(df, 'param_liczba-miejsc', 'param_nr-of-seats', 'seats_count', transform=lambda x: np.int8(x), unknown_value=-1)

    merge_labels(df, 'param_liczba-drzwi', 'param_door-count', 'doors_count', transform=lambda x: np.int8(x), unknown_value=-1)

    merge_labels(df, 'param_pierwsza-rejestracja', 'param_first-registration', 'param_first_registration_date', transform=norm_date)
    # norm_date emits 'day/month/year'; without dayfirst the day and month are
    # swapped whenever the day is <= 12.
    # NOTE(review): norm_date also emits 'month/year'; pandas versions that
    # require a single format per call may reject the mix -- verify.
    df['param_first_registration_date'] = pd.to_datetime(df['param_first_registration_date'], dayfirst=True)

    df['param_first_registration_year'] = df['param_first_registration_date'].dt.year.fillna(-1)
    df['param_first_registration_month'] = df['param_first_registration_date'].dt.month.fillna(-1)

    merge_labels(df, 'param_miesięczna-rata', 'param_monthly-payment-value', 'monthly_payment_num', transform=lambda x: float(float_filter(x)), unknown_value=-1)

    df['seller_city'] = df['seller_address'].map(extract_city)
    categorize_feature(df, 'seller_city', 'seller_city_cat', indexers=indexers, del_feat=False)

    regions_str = 'dolnośląskie, kujawsko-pomorskie, lubelskie, lubuskie, łódzkie, małopolskie, mazowieckie, opolskie, podkarpackie, podlaskie, pomorskie, śląskie, świętokrzyskie, warmińsko-mazurskie, wielkopolskie, zachodniopomorskie'
    # Longest names first: 'opolskie' is a substring of 'wielkopolskie',
    # 'pomorskie' of 'zachodniopomorskie', and so on.
    regions = sorted(regions_str.split(', '), key=len, reverse=True)

    def find_region(address, city):
        # Prefer a region named in the address, then fall back to the city map.
        if address is not None:
            for region in regions:
                if region in address:
                    return region
        if city is not None and city in city_to_voivodeship_map:
            return city_to_voivodeship_map[city]
        return None

    df['region'] = [find_region(address, city) for address, city in zip(df['seller_address'], df['seller_city'])]
    categorize_feature(df, 'region', 'region_cat', indexers=indexers)
    feats_redundant.append('seller_city')

    df['initial_payment_num'] = [np.int64(num_filter(a)) if a is not None else -1 for a in df['param_opłata-początkowa']]
    feats_redundant.append('param_opłata-początkowa')

    df['buyout_num'] = [np.int64(num_filter(a)) if a is not None else -1 for a in df['param_wartość-wykupu']]
    feats_redundant.append('param_wartość-wykupu')

    df['remaining_installment_num'] = [np.int64(num_filter(a)) if a is not None else -1 for a in df['param_liczba-pozostałych-rat']]
    feats_redundant.append('param_liczba-pozostałych-rat')

    categorize_feature(df, 'price_currency', 'price_currency_cat', indexers=indexers)

    if 'price_value' in df:
        df['price_value_log'] = np.log(df['price_value'])

    for feat in feats_useless + feats_redundant:
        if feat in df.columns:
            del df[feat]

    return df, indexers

## Funkcje do nauki modelu

In [None]:
def get_feats(df):
    """Return the names of the model feature columns of ``df`` as an array.

    Features are all int / float / bool columns except the identifier
    ('car_id') and the targets ('price_value', 'price_value_log').
    """
    # The builtin types replace np.int / np.float / np.bool, which were plain
    # aliases of them and have been removed from NumPy.
    feats = df.select_dtypes(include=[int, float, bool]).columns

    return feats[(feats != 'car_id') & (feats != 'price_value') & (feats != 'price_value_log')].values

def get_X(df):
    """Return the feature columns of ``df`` (see ``get_feats``) as a 2-D array."""
    feature_names = get_feats(df)
    feature_frame = df[feature_names]
    return feature_frame.values

def get_y(df, target_var='price_value'):
    """Return column ``target_var`` of ``df`` as an array.

    The value is read from the ``df`` argument; the previous implementation
    ignored it and always read the global ``train`` frame.
    """
    return df[target_var].values

def get_models():
    """Return the default ``(name, estimator)`` pairs to evaluate.

    Decision trees and random forests at four depths, followed by one
    XGBoost regressor.  Fresh estimator instances are built on every call.
    """
    depths = (5, 10, 15, 20)

    models = [('dt-{0}md'.format(depth), DecisionTreeRegressor(max_depth=depth)) for depth in depths]
    models.extend(('random_forest-{0}md'.format(depth), RandomForestRegressor(max_depth=depth)) for depth in depths)
    models.append(('xgboost-5md', XGBRegressor(max_depth=5)))

    return models

def mae_log_eval(y_log_pred, dtrain):
    """Evaluation callback: MAE on the original scale for log-scale targets.

    Both the labels from ``dtrain.get_label()`` and ``y_log_pred`` are
    exponentiated before the error is computed.  Returns ``('mae', value)``.
    """
    offset = 10  # subtracted from both sides

    actual = np.float64(np.exp(dtrain.get_label())) - offset
    predicted = np.float64(np.exp(y_log_pred)) - offset

    return 'mae', mae(actual, predicted)

def mae_eval(y_pred, dtrain):
    """Evaluation callback: MAE between ``dtrain.get_label()`` and ``y_pred``.

    Returns ``('mae', value)``.
    """
    offset = 10  # subtracted from both sides
    actual = dtrain.get_label()

    return 'mae', mae(actual - offset, y_pred - offset)

def run_cv(model, X, y, train, folds=4, target_log=False,cv_type=KFold, success_metric=mae):
    """Cross-validate ``model`` and return ``(mean_score, std_score)``.

    Args:
        model: Estimator with ``fit`` / ``predict``.
        X: 2-D feature array.
        y: 1-D target array on the original (non-log) scale.
        train: Unused; kept so existing callers keep working.
        folds: Number of splits passed to ``cv_type``.
        target_log: Fit on ``log(y)``; predictions are exponentiated and
            clipped to a minimum of 400 before scoring.
        cv_type: Splitter class accepting ``n_splits``.
        success_metric: ``metric(y_true, y_pred)`` computed per fold.
    """
    cv = cv_type(n_splits=folds)
    eval_metric = mae_log_eval if target_log else mae_eval
    # eval_set / eval_metric / verbose are xgboost-specific fit arguments;
    # passing them to scikit-learn estimators raises TypeError.
    # NOTE(review): recent xgboost releases may no longer accept eval_metric
    # in fit() -- verify against the installed version.
    uses_xgb_fit_args = isinstance(model, XGBRegressor)

    scores = []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if target_log:
            y_train_fit, y_test_fit = np.log(y_train), np.log(y_test)
        else:
            y_train_fit, y_test_fit = y_train, y_test

        if uses_xgb_fit_args:
            eval_set = [(X_train, y_train_fit), (X_test, y_test_fit)]
            model.fit(X_train, y_train_fit, eval_set=eval_set, eval_metric=eval_metric, verbose=False)
        else:
            model.fit(X_train, y_train_fit)

        y_pred = model.predict(X_test)

        if target_log:
            y_pred = np.exp(y_pred)
            y_pred[y_pred < 400] = 400

        scores.append(success_metric(y_test, y_pred))

    return np.mean(scores), np.std(scores)


def plot_learning_curve(model, title, X, y, ylim=None, cv=None, train_sizes=np.linspace(.1, 1.0, 5), target_log=False):
    """Draw the learning curve of ``model`` (MAE vs. number of training samples).

    With ``target_log`` the model is fitted on ``log(y)`` and the error is
    computed after exponentiating (predictions are clipped to at least 400).
    Returns the ``matplotlib.pyplot`` module so the caller can ``show()`` it.
    """
    plt.figure(figsize=(12,8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)

    plt.xlabel("Training examples")
    plt.ylabel("Score")

    if target_log:
        y = np.log(y)

    def mae_scorer(estimator, features, target):
        predicted = estimator.predict(features)

        if target_log:
            target = np.exp(target)
            predicted = np.exp(predicted)
            predicted[predicted < 400] = 400

        return mae(target, predicted)

    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes, scoring=mae_scorer) # n_jobs > 1 not working

    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training score"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1), "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, color, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")

    return plt


def run(train, models=None, plot_lc=False, folds=3, ylim=(0, 1), target_log=False):
    """Cross-validate each model on ``train`` and print its score.

    Args:
        train: Prepared training frame (features plus 'price_value').
        models: Iterable of ``(name, estimator)``; ``None`` means
            ``get_models()``, built at call time so estimator instances are
            not shared between calls.
        plot_lc: Also plot a learning curve for every model.
        folds: Number of cross-validation folds.
        ylim: Currently unused; kept for interface compatibility.
        target_log: Train on the log of the target.
    """
    if models is None:
        models = get_models()

    X, y = get_X(train), get_y(train)

    for model_name, model in models:
        score_mean, score_std = run_cv(model, X, y, train, folds=folds, target_log=target_log)
        print("[{0}]: {1} +/-{2}".format(model_name, score_mean, score_std))
        sys.stdout.flush()

        if not plot_lc:
            continue
        plot_learning_curve(model, model_name, X, y, cv=folds, target_log=target_log).show()


## Przygotowanie danych

In [None]:
# Start again from an untouched copy of the raw training data.
train = original_train.copy()

# drop duplicates
# Rows count as duplicates when they agree on every column except the four
# excluded below.
feats = train.columns
feats = feats[(feats != 'breadcrumb') & (feats != 'price_details') & (feats != 'car_id') & (feats != 'created_at')].values
train = train.drop_duplicates(feats)

# Build the model-ready training frame and the category encoders
# (`indexers`) that are reused for the test set.
%time prepared_train, indexers = prepare_data(train)

In [None]:
# Prepare the test set with the encoders learned on the training set.
test = original_test.copy()

# The second return value (the indexers dict) is not needed here.
prepared_test, a = prepare_data(test, indexers)

## Nauka modelu

**Model 1**

Lokalny wynik: 5437.732148466895 +/-151.16660205998767

Kaggle: 5524.91166 / 5614.16511

In [None]:
# Work on a copy so the prepared training frame itself stays unchanged.
train = prepared_train.copy()

def fair_obj(y_true, y_pred):
    """Custom objective: return ``(grad, hess)`` for the residual ``y_pred - y_true``.

    Uses the module-level ``fair_constant`` ``c``:
    ``grad = c * r / (|r| + c)`` and ``hess = c**2 / (|r| + c)**2``.
    """
    residual = y_pred - y_true
    denominator = abs(residual) + fair_constant

    gradient = fair_constant * residual / denominator
    hessian = (fair_constant * fair_constant) / (denominator * denominator)

    return gradient, hessian

# Hyper-parameters of model 1; `fair_obj` is passed as the custom objective.
xgb_params = {
    'objective': fair_obj,
    'n_jobs': 4,
    'max_depth': 13, 
    'n_estimators': 280,
    'learning_rate': 0.11417783260309153,
    'random_state': 6700,
    'colsample_bytree': 0.8303395340380215,
    'min_child_weight': 8,
    'reg_alpha': 1.418978766944595,
    'reg_lambda': 0.9766322222818731,
    'subsample': 0.9300215313325608
}
# Read by fair_obj as a global at call time, so it may be defined after
# xgb_params.
fair_constant = 4698
folds = 10
target_log = True

# The model name encodes the fair constant and the number of folds; it is
# reused later as part of the output file name.
model = ('xgboost-{0}fc-{1}f'.format(fair_constant, folds), XGBRegressor(**xgb_params))
(model_name, xgb_rmodel) = model

%time run(train, models=[model], plot_lc=False, target_log=target_log, folds=folds)

**Model 2**

Lokalny wynik: 5417.678453461945 +/-114.3701416632329

Kaggle: 5496.60020 / 5588.13150

In [None]:
# Work on a copy so the prepared training frame itself stays unchanged.
train = prepared_train.copy()

def fair_obj(y_true, y_pred):
    """Custom objective: return ``(grad, hess)`` for the residual ``y_pred - y_true``.

    Uses the module-level ``fair_constant`` ``c``:
    ``grad = c * r / (|r| + c)`` and ``hess = c**2 / (|r| + c)**2``.
    """
    residual = y_pred - y_true
    denominator = abs(residual) + fair_constant

    gradient = fair_constant * residual / denominator
    hessian = (fair_constant * fair_constant) / (denominator * denominator)

    return gradient, hessian

# Hyper-parameters of model 2; `fair_obj` is passed as the custom objective.
xgb_params = {
    'objective': fair_obj,
    'n_jobs': 4,
    'max_depth': 15, 
    'n_estimators': 300,
    'learning_rate': 0.05462990289471652,
    'random_state': 9700,
    'colsample_bytree': 0.8140341130120244,
    'min_child_weight': 1,
    'reg_alpha': 1.0835749586415848,
    'reg_lambda': 0.985332472621205,
    'subsample': 0.9836466473497267
}
# Read by fair_obj as a global at call time, so it may be defined after
# xgb_params.
fair_constant = 4847
folds = 9
target_log = True

# The model name encodes the fair constant and the number of folds; it is
# reused later as part of the output file name.
model = ('xgboost-{0}fc-{1}f'.format(fair_constant, folds), XGBRegressor(**xgb_params))
(model_name, xgb_rmodel) = model

%time run(train, models=[model], plot_lc=False, target_log=target_log, folds=folds)

#     %time xgbfir.saveXgbFI(xgb_rmodel.get_booster(), feature_names=get_feats(train), OutputXlsxFile='bost_fi.xlsx')

## Przewidywanie ceny na danych testowych

In [None]:
# Predict prices for the prepared test set with the most recently trained model.
# NOTE(review): run_cv() fits the model once per fold and nothing refits it on
# the full training set, so `xgb_rmodel` here appears to be the fit from the
# last fold only -- confirm this is intended.
test = prepared_test.copy()
X_test = get_X(test)
y_pred = xgb_rmodel.predict(X_test)

# Undo the log transform of the target and clip predictions to at least 400.
if target_log:
    y_pred = np.exp(y_pred)
    y_pred[y_pred < 400] = 400

test['price_value'] = y_pred

# Write the submission file, named after the model.
%time test[['car_id', 'price_value']].to_csv('./output/model_' + model_name + '.csv', index=False)

## Sprawdzanie ważności cech

In [None]:
# Export feature importance / interaction statistics of the trained booster
# to an Excel workbook.
%time xgbfir.saveXgbFI(xgb_rmodel.get_booster(), feature_names=get_feats(train), OutputXlsxFile='bost_fi.xlsx')

In [None]:
# Sheet 'Interaction Depth 0' of the exported workbook.
pd.read_excel('bost_fi.xlsx', 'Interaction Depth 0')

In [None]:
# Sheet 'Interaction Depth 1' of the exported workbook.
pd.read_excel('bost_fi.xlsx', 'Interaction Depth 1')

In [None]:
# Sheet 'Interaction Depth 2' of the exported workbook.
pd.read_excel('bost_fi.xlsx', 'Interaction Depth 2')

# Połączenie wyników z modeli

Docelowo funkcja trenująca powinna wspierać łączenie modeli, a najlepsza proporcja powinna być wyszukiwana automatycznie, np. za pomocą hyperopt.

0.7 model_xgboost-4698fc-10f + 0.3 model_xgboost-4847fc-9f:

Kaggle: 5413.38492 / 5510.88303

In [None]:
# Blend the two submissions as a weighted average of their predicted prices.
res0 = pd.read_csv('./output/model_xgboost-4698fc-10f.csv').rename(columns={'price_value': 'price_value_0'})
res1 = pd.read_csv('./output/model_xgboost-4847fc-9f.csv').rename(columns={'price_value': 'price_value_1'})

# Align both prediction sets on the listing identifier.
merged_res = res0.merge(res1, on='car_id')

# Weights of model 0 and model 1 respectively.
a = 0.7
b = 0.3
merged_res['price_value'] = a * merged_res['price_value_0'] + b * merged_res['price_value_1']

output_path = './output/merged_model{0}.csv'.format(a)
merged_res[['car_id', 'price_value']].to_csv(output_path, index=False)