In [1]:
import pandas as pd
import numpy as np
import json
# データ可視化ライブラリ
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns
import re
from tqdm import tqdm_notebook as tqdm

import lightgbm as lgb
print('lightgbm version:', lgb.__version__)

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import sklearn
print('sklearn version:', sklearn.__version__)
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from catboost import Pool

import warnings
warnings.filterwarnings('ignore')


lightgbm version: 2.3.0
sklearn version: 0.22.1


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('../data/raw/train2.csv')
test = pd.read_csv('../data/raw/test2.csv')

In [3]:
drop_columns = ['OpenFloor', 'Workshop', 'D', 'S', 'Studio', 'Maisonette', 'K', 'R']
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)
# highのtrainに幅を持たせるのは多分正解
train_high = train.query('MunicipalityCode < 13150 and Area > 400')
train_low = train.query('(MunicipalityCode < 13150 and Area < 600) or MunicipalityCode > 13150')
train_high_y = train_high['y']
train_high = train_high.drop('y', axis=1)

train_low_y = train_low['y']
train_low = train_low.drop('y', axis=1)
test = test.reset_index()

# 800のほうがいいかもしれない
test_high = test.query('MunicipalityCode < 13150 and Area >= 800')
test_low = test.query('(MunicipalityCode < 13150 and Area < 800) or MunicipalityCode > 13150')

test_high_data = test_high.drop('index', axis=1)
test_low_data = test_low.drop('index', axis=1)

In [9]:
### lowの予測
scores = []
y_low_pred = np.zeros(test_low_data.shape[0])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_low):
    tr_x, va_x = train_low.iloc[tr_idx], train_low.iloc[va_idx]
    tr_y, va_y = train_low_y.iloc[tr_idx], train_low_y.iloc[va_idx]

    train_data = lgb.Dataset(tr_x, tr_y)
    valid_data = lgb.Dataset(va_x, va_y)
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'learning_rate': 0.001,
        'max_depth': -1,
        'num_leaves': 255,
        'max_bin': 255,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'nthread': -1,
        'bagging_freq': 1,
        'verbose': -1,
        'seed': 1
#         'seed': random.randint(1, 100),
    }
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data],
                      num_boost_round=5000, early_stopping_rounds=200,
                      verbose_eval=200)

    y_val_pred = model.predict(va_x)
    val_score = np.sqrt(mean_squared_error(va_y, y_val_pred))
    y_low_pred  += model.predict(test_low_data, num_iteration=model.best_iteration)
    scores.append(val_score)

print('RMSE:', np.mean(scores))


Training until validation scores don't improve for 200 rounds
[200]	training's mape: 1.36973	valid_1's mape: 1.37146
[400]	training's mape: 1.18717	valid_1's mape: 1.18822
[600]	training's mape: 1.03816	valid_1's mape: 1.03882
[800]	training's mape: 0.915294	valid_1's mape: 0.915963
[1000]	training's mape: 0.812775	valid_1's mape: 0.813514
[1200]	training's mape: 0.727594	valid_1's mape: 0.728375
[1400]	training's mape: 0.657637	valid_1's mape: 0.658692
[1600]	training's mape: 0.599839	valid_1's mape: 0.601204
[1800]	training's mape: 0.552669	valid_1's mape: 0.554522
[2000]	training's mape: 0.513079	valid_1's mape: 0.515318
[2200]	training's mape: 0.480165	valid_1's mape: 0.482772
[2400]	training's mape: 0.452587	valid_1's mape: 0.455636
[2600]	training's mape: 0.429632	valid_1's mape: 0.433081
[2800]	training's mape: 0.410186	valid_1's mape: 0.414076
[3000]	training's mape: 0.393531	valid_1's mape: 0.397877
[3200]	training's mape: 0.379358	valid_1's mape: 0.384285
[3400]	training's ma

In [10]:
### highの予測
scores = []
y_high_pred = np.zeros(test_high_data.shape[0])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_high):
    tr_x, va_x = train_high.iloc[tr_idx], train_high.iloc[va_idx]
    tr_y, va_y = train_high_y.iloc[tr_idx], train_high_y.iloc[va_idx]

    train_data = lgb.Dataset(tr_x, tr_y)
    valid_data = lgb.Dataset(va_x, va_y)
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'learning_rate': 0.001,
        'max_depth': -1,
        'num_leaves': 200,
        'max_bin': 255,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'nthread': -1,
        'bagging_freq': 1,
        'verbose': -1,
        'seed': 1
#         'seed': random.randint(1, 100),
    }
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data],
                      num_boost_round=5000, early_stopping_rounds=100,
                      verbose_eval=200)

    y_val_pred = model.predict(va_x)
    val_score = np.sqrt(mean_squared_error(va_y, y_val_pred*1.03))
    y_high_pred  += model.predict(test_high_data, num_iteration=model.best_iteration)
    scores.append(val_score)

print('RMSE:', np.mean(scores))

Training until validation scores don't improve for 100 rounds
[200]	training's mape: 2.16954	valid_1's mape: 2.16155
[400]	training's mape: 1.84335	valid_1's mape: 1.84664
[600]	training's mape: 1.57593	valid_1's mape: 1.59102
[800]	training's mape: 1.35524	valid_1's mape: 1.38181
[1000]	training's mape: 1.17287	valid_1's mape: 1.20972
[1200]	training's mape: 1.02296	valid_1's mape: 1.06975
[1400]	training's mape: 0.897566	valid_1's mape: 0.954012
[1600]	training's mape: 0.79304	valid_1's mape: 0.859085
[1800]	training's mape: 0.706254	valid_1's mape: 0.782441
[2000]	training's mape: 0.633693	valid_1's mape: 0.719364
[2200]	training's mape: 0.573692	valid_1's mape: 0.668593
[2400]	training's mape: 0.52488	valid_1's mape: 0.629375
[2600]	training's mape: 0.484368	valid_1's mape: 0.598592
[2800]	training's mape: 0.450882	valid_1's mape: 0.575244
[3000]	training's mape: 0.422996	valid_1's mape: 0.556623
[3200]	training's mape: 0.399782	valid_1's mape: 0.543404
[3400]	training's mape: 0.37

In [31]:
### highの予測
scores = []
y_high_pred = np.zeros(test_high_data.shape[0])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_high):
    tr_x, va_x = train_high.iloc[tr_idx], train_high.iloc[va_idx]
    tr_y, va_y = train_high_y.iloc[tr_idx], train_high_y.iloc[va_idx]

    train_data = lgb.Dataset(tr_x, tr_y)
    valid_data = lgb.Dataset(va_x, va_y)
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'learning_rate': 0.001,
        'max_depth': -1,
        'num_leaves': 200,
        'max_bin': 255,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'nthread': -1,
        'bagging_freq': 1,
        'verbose': -1,
        'seed': 1
#         'seed': random.randint(1, 100),
    }
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data],
                      num_boost_round=5000, early_stopping_rounds=100,
                      verbose_eval=200)

    y_val_pred = model.predict(va_x)
    val_score = np.sqrt(mean_squared_error(va_y, y_val_pred*1.03))
    y_high_pred  += model.predict(test_high_data, num_iteration=model.best_iteration)
    scores.append(val_score)

print('RMSE:', np.mean(scores))

Training until validation scores don't improve for 100 rounds
[200]	training's mape: 2.16954	valid_1's mape: 2.16155
[400]	training's mape: 1.84335	valid_1's mape: 1.84664
[600]	training's mape: 1.57593	valid_1's mape: 1.59102
[800]	training's mape: 1.35524	valid_1's mape: 1.38181
[1000]	training's mape: 1.17287	valid_1's mape: 1.20972
[1200]	training's mape: 1.02296	valid_1's mape: 1.06975
[1400]	training's mape: 0.897566	valid_1's mape: 0.954012
[1600]	training's mape: 0.79304	valid_1's mape: 0.859085
[1800]	training's mape: 0.706254	valid_1's mape: 0.782441
[2000]	training's mape: 0.633693	valid_1's mape: 0.719364
[2200]	training's mape: 0.573692	valid_1's mape: 0.668593
[2400]	training's mape: 0.52488	valid_1's mape: 0.629375
[2600]	training's mape: 0.484368	valid_1's mape: 0.598592
[2800]	training's mape: 0.450882	valid_1's mape: 0.575244
[3000]	training's mape: 0.422996	valid_1's mape: 0.556623
[3200]	training's mape: 0.399782	valid_1's mape: 0.543404
[3400]	training's mape: 0.37

In [32]:
test_high['pred'] = y_high_pred*1.03
test_low['pred'] = y_low_pred
test = pd.concat((test_high, test_low))
test = test.sort_values('index')
y_pred  = test['pred']
y_pred = np.round(y_pred, 2)

In [36]:
submit = pd.read_csv('../data/raw/test_data.csv')
sub = pd.DataFrame({'id': submit['id'], 'y': y_pred})
sub.query('y < 0')

In [39]:
for i in sub[sub['y'] < 0]['id']:
    sub['y'][i-1] = 0 

In [40]:
sub['y'].describe()

count    34844.000000
mean       255.487695
std        720.185326
min          0.000000
25%         95.087500
50%        147.230000
75%        219.152500
max      54295.040000
Name: y, dtype: float64

In [30]:
sub.to_csv('sub_lgbm_4.csv', index=False)

## 超高額物件を抑えたほうがいいかもしれない

In [43]:
pd.DataFrame(y_high_pred).describe()

Unnamed: 0,0
count,223.0
mean,4201.273412
std,5921.200854
min,563.647235
25%,1443.199078
50%,2397.055699
75%,4531.237126
max,52713.632942


In [46]:
 train.query('MunicipalityCode < 13150 and Area > 700')['y'].describe()

count     2938.000000
mean      1288.696392
std       2738.872598
min          7.000000
25%        292.500000
50%        520.000000
75%       1200.000000
max      61000.000000
Name: y, dtype: float64

In [48]:
 # もっと高額の方は幅を持たせるべき！！！
train.query('MunicipalityCode < 13150 and Area > 2~300')['y'].describe() #でいい!!

SyntaxError: Python keyword not valid identifier in numexpr query (<unknown>, line 1)