In [1]:
# Notebook setup: enable the autoreload extension (mode 2) and render
# matplotlib figures inline using the 'retina' figure format.
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling.
sns.set()

In [2]:
# Let pandas display up to 500 rows and 500 columns before truncating.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)

In [3]:
# List the CSV files present in the _data directory.
!ls _data/*.csv

_data/Machine_Appendix.csv		_data/TrainAndValid.csv
_data/median_benchmark.csv		_data/Train.csv
_data/random_forest_benchmark_test.csv	_data/Valid.csv
_data/Test.csv				_data/ValidSolution.csv


In [4]:
# Read the machine appendix and the train / validation / test splits.
appendix, train, valid, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '_data/Machine_Appendix.csv',
        '_data/Train.csv',
        '_data/Valid.csv',
        '_data/Test.csv',
    )
)

In [5]:
# Number of missing values per column of the training frame.
# Vectorized equivalent of summing the null mask column by column in Python.
train.isnull().sum()

SalesID                          0
SalePrice                        0
MachineID                        0
ModelID                          0
datasource                       0
auctioneerID                 20136
YearMade                         0
MachineHoursCurrentMeter    258360
UsageBand                   331486
saledate                         0
fiModelDesc                      0
fiBaseModel                      0
fiSecondaryDesc             137191
fiModelSeries               344217
fiModelDescriptor           329206
ProductSize                 210775
fiProductClassDesc               0
state                            0
ProductGroup                     0
ProductGroupDesc                 0
Drive_System                296764
Enclosure                      325
Forks                       209048
Pad_Type                    321991
Ride_Control                252519
Stick                       321991
Transmission                217895
Turbocharged                321991
Blade_Extension     

## Data Cleaning

In [6]:
# Role of each column: identifier, regression target, date-like and numeric
# features. Every remaining training column is treated as categorical.
id_column, target_column = 'SalesID', 'SalePrice'
date_columns = ['YearMade', 'saledate']
num_columns = ['MachineHoursCurrentMeter']
cat_columns = [
    name for name in train.columns
    if name not in (id_column, target_column, *date_columns, *num_columns)
]

In [9]:
from datetime import datetime
def parse_dates(df, column, string_format='%d-%m-%Y'):
    """Expand a date-string column into day/month/year/weekday columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date strings. It is left unmodified.
    column : str
        Name of the column to parse.
    string_format : str
        ``strptime``-style format describing the strings in ``column``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the numeric columns
        ``{column}-day``, ``{column}-month``, ``{column}-year`` and
        ``{column}-weekday`` (Monday == 0) appended, in that order.

    Raises
    ------
    ValueError
        If a value in ``column`` does not match ``string_format``.
    """
    # Parse the whole column once instead of calling strptime four times per row.
    parsed = pd.to_datetime(df[column], format=string_format)
    date_parts = {
        f'{column}-day': parsed.dt.day,
        f'{column}-month': parsed.dt.month,
        f'{column}-year': parsed.dt.year,
        f'{column}-weekday': parsed.dt.weekday,
    }
    # Drop first, then assign: the caller's frame is never mutated.
    return df.drop(column, axis=1).assign(**date_parts)

In [10]:
# Replace the raw sale-date string with its day/month/year/weekday parts
# in every split.
train, valid, test = (
    parse_dates(frame, date_columns[1], '%m/%d/%Y %H:%M')
    for frame in (train, valid, test)
)

In [11]:
# Fill missing meter readings with 0 and missing manufacture years with 1995.
# The filled column is assigned back explicitly: calling
# `frame[col].fillna(..., inplace=True)` operates on a selected Series, which
# pandas flags as chained assignment and which does not update the frame
# under Copy-on-Write.
for frame in (train, valid, test):
    frame[num_columns[0]] = frame[num_columns[0]].fillna(0)
    frame[date_columns[0]] = frame[date_columns[0]].fillna(1995)

In [12]:
# Mark missing categorical values with the sentinel string '-999'.
# Assign the result back instead of using `inplace=True` on a selected
# column, which is chained assignment and is not guaranteed to update the
# underlying frame.
for col in cat_columns:
    for frame in (train, valid, test):
        frame[col] = frame[col].fillna('-999')

## Data Preprocessing

In [13]:
# Cast every categorical column to string in all three splits so each
# column holds a single, uniform type.
for frame in (train, valid, test):
    for col in cat_columns:
        frame[col] = frame[col].astype('str')

In [14]:
from sklearn.preprocessing import LabelEncoder
# Fit one encoder per categorical column on the values from all three splits,
# so every label seen in train, valid or test is known to the encoder.
# Only the column being fitted is concatenated; concatenating the three full
# frames for every column repeated the same expensive work each iteration.
label_encoders = {
    col: LabelEncoder().fit(pd.concat((train[col], valid[col], test[col])))
    for col in cat_columns
}
# Replace the string labels with their integer codes in each split.
for col in cat_columns:
    train[col] = label_encoders[col].transform(train[col])
    valid[col] = label_encoders[col].transform(valid[col])
    test[col] = label_encoders[col].transform(test[col])

## Model

In [15]:
from lightgbm import LGBMRegressor

In [34]:
# Fit the gradient-boosted regressor on all feature columns (identifier and
# target removed), using the natural log of the sale price as the target.
lgbm = LGBMRegressor(max_depth=30, n_estimators=400)
lgbm.fit(
    train.drop(columns=[id_column, target_column]),
    np.log(train[target_column]),
)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=30, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=400,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [38]:
from sklearn.metrics import mean_squared_error
import math

def rmse(y_true, y_pred):
    """Return the root of the mean squared error between the two inputs."""
    squared_error = mean_squared_error(y_true, y_pred)
    return math.sqrt(squared_error)

def print_score(m, X_train, y_train, X_valid, y_valid):
    """Print a list of scores for a fitted model ``m``.

    The list holds, in order: RMSE on the training data, RMSE on the
    validation data, ``m.score`` on the training data, ``m.score`` on the
    validation data and, when the model exposes it, ``m.oob_score_``.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    res = [rmse(m.predict(features), target) for features, target in splits]
    res.extend(m.score(features, target) for features, target in splits)
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [36]:
# Load the validation targets and report train / validation scores on the
# log-price scale.
# NOTE(review): rows of ValidSolution.csv are assumed to be in the same order
# as Valid.csv; confirm before trusting the validation numbers.
y = pd.read_csv('_data/ValidSolution.csv', low_memory=False)
print_score(
    lgbm,
    X_train=train.drop([id_column, target_column], axis=1),
    y_train=np.log(train[target_column]),
    X_valid=valid.drop(id_column, axis=1),
    y_valid=np.log(y[target_column]),
)

[0.21889488396755447, 0.24874458186109583, 0.9004071890470065, 0.8852872445024529]


In [26]:
# Build the submission file: one predicted sale price per test SalesID.
# The model was fitted on log(SalePrice), so its raw predictions are on the
# log scale; exponentiate them so the SalePrice column holds actual prices.
submission = pd.DataFrame({
    id_column: test[id_column],
    target_column: np.exp(lgbm.predict(test.drop(id_column, axis=1))),
})
submission.to_csv('_data/LGBM.csv', index=False)