In [1]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

# Silence every warning so library chatter does not clutter cell output.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')
# Lift pandas' display limits so previews show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Single random seed reused by the cross-validation splitter and the LightGBM model.
seed = 2020

In [3]:
# Load the training set, the test set and the sample submission.
# All three raw files are space-separated, hence sep=' '.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path, sep=' ')
    for csv_path in (
        './raw_data/used_car_train_20200313.csv',
        './raw_data/used_car_testA_20200313.csv',
        './raw_data/used_car_sample_submit.csv',
    )
)

In [4]:
# Stack train and test into one frame so the feature transforms below are
# applied to both. ignore_index is not set, so each part keeps its own row
# labels and index values repeat across the two parts.
df_feature = pd.concat([df_train, df_test], sort=False)

In [5]:
# Preview the first rows of the combined train + test frame.
df_feature.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850.0,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600.0,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222.0,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400.0,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200.0,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [6]:
# `notRepairedDamage` contains the non-numeric placeholder '-'. Encode it as
# its own value (2) and then cast the whole column to float.
df_feature['notRepairedDamage'] = (
    df_feature['notRepairedDamage'].replace('-', 2).astype('float')
)

In [7]:
# Preview the last rows of the combined frame after the dtype conversion.
df_feature.tail()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
49995,199995,20903,19960503,4.0,4,4.0,0.0,0.0,116,15.0,0.0,3219,0,0,20160320,,45.621391,5.958453,-0.918571,0.774826,-2.021739,0.284664,0.130044,0.049833,0.028807,0.004616,-5.978511,1.303174,-1.207191,-1.98124,-0.357695
49996,199996,708,19991011,0.0,0,0.0,0.0,0.0,75,15.0,0.0,1857,0,0,20160329,,43.935162,4.476841,-0.84171,1.328253,-1.292675,0.268101,0.108095,0.066039,0.025468,0.025971,-3.913825,1.759524,-2.075658,-1.154847,0.169073
49997,199997,6693,20040412,49.0,1,0.0,1.0,1.0,224,15.0,0.0,3452,0,0,20160305,,46.537137,4.170806,0.388595,-0.704689,-1.48071,0.269432,0.105724,0.117652,0.057479,0.015669,-4.639065,0.654713,1.137756,-1.390531,0.25442
49998,199998,96900,20020008,27.0,1,0.0,0.0,1.0,334,15.0,0.0,1998,0,0,20160404,,46.771359,-3.296814,0.243566,-1.277411,-0.404881,0.261152,0.00049,0.137366,0.086216,0.051383,1.833504,-2.828687,2.46563,-0.911682,-2.057353
49999,199999,193384,20041109,166.0,6,1.0,,1.0,68,9.0,0.0,3276,0,0,20160322,,43.73101,-3.121867,0.027348,-0.808914,2.116551,0.22873,0.0003,0.103534,0.080625,0.124264,2.914571,-1.13527,0.547628,2.094057,-1.55215


# Feature engineering

In [8]:
# Remove `seller` and `offerType` from the feature frame, in place.
# NOTE(review): presumably dropped because they are (near-)constant — every
# previewed row is 0; verify against the full data.
df_feature.drop(columns=['seller', 'offerType'], inplace=True)

In [9]:
# Work with the target in log space: price -> log(1 + price).
# Rows without a price (NaN) stay NaN under log1p.
df_feature['price'] = np.log1p(df_feature['price'])

In [10]:
# Frequency feature: number of rows (train and test together) that share the
# same `name`, counted via the `SaleID` column.
sales_per_name = df_feature.groupby('name')['SaleID'].transform('count')
df_feature['name_count'] = sales_per_name

In [11]:
def date_parse(x):
    """Convert a ``YYYYMMDD`` value into a ``datetime``.

    Parameters
    ----------
    x : int or str
        Date encoded as eight digits, e.g. ``20040402``. Anything whose
        ``str()`` form starts with those eight digits is accepted.

    Returns
    -------
    datetime.datetime
        Midnight of the encoded day. A zero month or a zero day is treated
        as the first month / first day instead of raising.

    Raises
    ------
    ValueError
        If the digits cannot be parsed as integers, or if they still do not
        form a valid calendar date (e.g. month 13, or day 31 in a 30-day
        month).
    """
    text = str(x)
    year = int(text[:4])
    # The raw data contains dates such as 20020008 (month "00"). Clamp zero
    # components to 1 so datetime() accepts them; the same is done for the
    # day, which datetime() would otherwise reject as well.
    month = max(int(text[4:6]), 1)
    day = max(int(text[6:8]), 1)

    return datetime(year, month, day)


# Replace the integer-coded YYYYMMDD columns with real datetimes (row by row
# through date_parse) so that date arithmetic and the .dt accessor work.
df_feature['regDate'] = df_feature['regDate'].apply(date_parse)
df_feature['creatDate'] = df_feature['creatDate'].apply(date_parse)
# Calendar year of `regDate` as a plain integer feature.
df_feature['regDate_year'] = df_feature['regDate'].dt.year

In [12]:
# Car age = time between `regDate` and `creatDate`.
age_delta = df_feature['creatDate'] - df_feature['regDate']
df_feature['car_age_day'] = age_delta.dt.days
# Approximate age in years (365-day years, leap days ignored), rounded to
# one decimal place.
df_feature['car_age_year'] = (df_feature['car_age_day'] / 365).round(1)

In [13]:
# Simple group statistics
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per group and left-join the result onto ``df_merge``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the statistics are computed from.
    df_merge : pandas.DataFrame
        Frame that receives the statistics as new columns.
    group_by : list of str
        Key columns; used both for the groupby and as the merge keys.
    agg : dict
        Maps a column name to a list of aggregation method names,
        e.g. ``{'price': ['mean']}``.

    Returns
    -------
    pandas.DataFrame
        Result of ``df_merge.merge(...)`` with one extra column per
        (column, method) pair, named ``<keys>_<column>_<method>`` where the
        keys are joined with ``_``. Keys absent from ``df`` yield NaN.
        Being a merge result, it carries a fresh default index.
    """
    key_prefix = '_'.join(group_by)
    # One flat column name per (column, method) pair, in the same order in
    # which the aggregation emits its columns.
    flat_names = [
        '{}_{}_{}'.format(key_prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]

    summary = df.groupby(group_by).agg(agg)
    summary.columns = flat_names
    merged = df_merge.merge(summary.reset_index(), on=group_by, how='left')

    # Release the intermediate aggregate right away.
    del summary
    gc.collect()
    return merged

In [14]:
def statis_feat(df_know, df_unknow):
    """Attach mean-price statistics computed on ``df_know`` to ``df_unknow``.

    For each of the keys ``model``, ``regionCode`` and ``name`` (in that
    order), the mean of ``price`` per key value is computed from ``df_know``
    and left-joined onto ``df_unknow`` as ``<key>_price_mean``.

    Parameters
    ----------
    df_know : pandas.DataFrame
        Rows whose ``price`` is used to compute the statistics.
    df_unknow : pandas.DataFrame
        Rows that receive the statistics.

    Returns
    -------
    pandas.DataFrame
        ``df_unknow`` with the three ``*_price_mean`` columns appended.
    """
    for key in ('model', 'regionCode', 'name'):
        df_unknow = stat(df_know, df_unknow, [key], {'price': ['mean']})

    return df_unknow

In [15]:
# 5-fold cross statistics: each labelled row gets price statistics computed
# only from the other four folds, so a row's own price never feeds its own
# features.
has_price = df_feature['price'].notnull()
df_train = df_feature[has_price].reset_index(drop=True)
df_test = df_feature[~has_price]

kf = KFold(n_splits=5, random_state=seed, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train):
    # Statistics come from the four "train" folds and are attached to the
    # held-out fold.
    fold_frames.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index]))
    gc.collect()

# The resulting rows are ordered fold by fold, not in the original order.
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows get statistics computed from every labelled row.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

# Free the intermediates; only df_feature is used from here on.
del fold_frames
del df_stas_feat
del df_train
del df_test
gc.collect()

0

In [16]:
# Preview the frame with the engineered columns and price statistics added.
df_feature.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,name_count,regDate_year,car_age_day,car_age_year,model_price_mean,regionCode_price_mean,name_price_mean
0,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,2016-03-12,7.783641,45.687478,4.492574,-0.050616,0.8836,-2.228079,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699,2,1996,7125,19.5,9.063339,8.013673,8.682877
1,7,165346,1999-07-06,26.0,14,1.0,0.0,0.0,101,15.0,0.0,4000,2016-03-26,6.908755,42.255586,-3.167771,-0.676693,1.942673,0.524206,0.239506,0.0,0.122943,0.039839,0.082413,3.693829,-0.245014,-2.19281,0.236728,0.195567,1,1999,6108,16.7,7.566696,8.281427,
2,12,120103,2001-03-07,48.0,14,1.0,0.0,0.0,58,6.0,0.0,2753,2016-03-21,7.378384,42.309224,-3.082286,-0.604813,0.843333,0.388727,0.240775,0.000116,0.104573,0.053303,0.07425,3.477291,-0.46145,-1.442835,0.659255,1.19935,1,2001,5493,15.0,7.092135,8.66178,
3,16,10036,2011-09-11,105.0,1,0.0,1.0,1.0,239,12.5,0.0,419,2016-03-06,9.259226,48.30777,2.366464,1.160401,-1.641052,0.940788,0.251404,0.082237,0.15008,0.082606,0.088695,-3.625918,-0.621775,3.086576,0.165461,-2.192635,18,2011,1638,4.5,9.912501,9.27388,9.242547
4,23,8949,1994-04-01,78.0,7,5.0,0.0,0.0,105,15.0,1.0,1266,2016-03-17,6.39693,43.740185,3.408253,-1.850466,2.593211,0.749961,0.263572,0.093292,0.016425,0.013495,0.094,-2.891659,1.104114,-3.580304,0.157992,-1.133201,12,1994,8021,22.0,7.744373,8.087276,6.680958


# model

In [17]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every remaining object-dtype column (values are first cast
# to str) so that the feature matrix is fully numeric.
# Iterate over the column labels rather than the DataFrame itself: tqdm takes
# its total from len(iterable), which for a DataFrame is the row count, while
# the loop only ever visits column names.
object_columns = df_feature.select_dtypes('object').columns
for f in tqdm(object_columns):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/200000 [00:00<?, ?it/s]


In [18]:
# Split back into the unlabelled (test) and labelled (train) parts. Each part
# is an independent copy so later column assignments do not touch df_feature.
is_unlabelled = df_feature['price'].isnull()
df_test = df_feature[is_unlabelled].copy()
df_train = df_feature[~is_unlabelled].copy()

In [19]:
ycol = 'price'
# Everything except the target, the row id and the two raw datetime columns
# is used as a model input.
excluded_columns = [ycol, 'SaleID', 'regDate', 'creatDate']
feature_names = [c for c in df_train.columns if c not in excluded_columns]

# n_estimators is deliberately huge: the actual number of trees is decided by
# early stopping on the validation fold.
model = lgb.LGBMRegressor(num_leaves=64,
                          max_depth=10,
                          learning_rate=0.1,
                          n_estimators=10000000,
                          subsample=0.8,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=seed,
                          metric=None
                          )


n_folds = 5
oof = []  # one frame per fold: SaleID, true target, out-of-fold prediction
# .copy() so the 'price' column is written to an independent frame instead of
# a slice of df_test. Start from 0.0 so the accumulator is float from the
# beginning.
prediction = df_test[['SaleID']].copy()
prediction['price'] = 0.0
df_importance_list = []

# Unshuffled K-fold: the folds are contiguous blocks in df_train's current row
# order. random_state is intentionally not passed — it has no effect without
# shuffling, and scikit-learn >= 0.24 raises ValueError for that combination.
kfold = KFold(n_splits=n_folds, shuffle=False)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # fit() returns the estimator itself, so lgb_model is the refitted `model`.
    # NOTE(review): the `verbose` and `early_stopping_rounds` fit arguments
    # belong to the older LightGBM scikit-learn API; newer releases expect
    # callbacks instead — confirm against the installed version.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='mae',
                          early_stopping_rounds=50)

    # Out-of-fold predictions (still in log space) for the final CV score.
    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Test prediction is the plain average over the fold models.
    pred_test = lgb_model.predict(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction['price'] += pred_test / n_folds

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[500]	train's l1: 0.0906478	valid's l1: 0.120959
[1000]	train's l1: 0.0715953	valid's l1: 0.117581
[1500]	train's l1: 0.0591564	valid's l1: 0.116015
[2000]	train's l1: 0.0501136	valid's l1: 0.114961
[2500]	train's l1: 0.0432541	valid's l1: 0.114241
[3000]	train's l1: 0.0378658	valid's l1: 0.113719
[3500]	train's l1: 0.033574	valid's l1: 0.113276
[4000]	train's l1: 0.0301141	valid's l1: 0.112923
[4500]	train's l1: 0.0272713	valid's l1: 0.11264
[5000]	train's l1: 0.0248275	valid's l1: 0.112463
[5500]	train's l1: 0.0227752	valid's l1: 0.112264
[6000]	train's l1: 0.0210319	valid's l1: 0.112128
Early stopping, best iteration is:
[6082]	train's l1: 0.0207681	valid's l1: 0.11211


Training until validation scores don't improve for 50 rounds
[500]	train's l1: 0.0908584	valid's l1: 0.120676
[1000]	train's l1: 0.0715355	valid's l1: 0.116882
[1500]	train's l1: 0.0591691	valid's l1: 0.11509
[2000]	train's l1: 0.0501004	valid's l1: 0.11

In [20]:
# Average each feature's importance over the folds and rank the features from
# most to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,regionCode_price_mean,28482.2
1,regionCode,27488.0
2,name_price_mean,20725.4
3,v_3,15975.8
4,v_14,15801.8
5,car_age_day,15298.2
6,v_1,15170.2
7,v_11,15132.0
8,v_0,14905.8
9,v_8,14896.8


In [21]:
# Combine the per-fold validation predictions into one frame.
df_oof = pd.concat(oof)
# Apply expm1 (the inverse of log1p) to the target and the predictions so the
# error below is expressed on the expm1 scale rather than in log space.
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['pred'] = np.expm1(df_oof['pred'])
# Cross-validated mean absolute error; also used to name the submission file.
mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])
print('mae:', mae)

mae: 483.59994168724864


In [22]:
# Apply expm1 (the inverse of log1p) to the averaged test predictions.
# NOTE: this overwrites `prediction` in place, so running this cell a second
# time without re-running the training cell would apply expm1 twice.
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)
# to_csv does not create missing directories; make sure the output folder
# exists before writing.
os.makedirs('sub', exist_ok=True)
# The file is named after the cross-validated MAE.
sub.to_csv('sub/{}.csv'.format(mae), index=False, encoding='utf-8')