## 概要
適切に動作するノートブック

In [1]:
import pandas as pd
import numpy as np
import json
# データ可視化ライブラリ
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns
import re
from tqdm import tqdm_notebook as tqdm

import lightgbm as lgb
print('lightgbm version:', lgb.__version__)

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import sklearn
print('sklearn version:', sklearn.__version__)

import warnings
warnings.filterwarnings('ignore')

import MeCab
tagger = MeCab.Tagger()

lightgbm version: 2.3.0
sklearn version: 0.22.1


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('../data/raw/train_data.csv')
test = pd.read_csv('../data/raw/test_data.csv')

In [3]:
# ↑のjsonファイルをnames.jsonで保存してある前提
with open("columns.json", "r", encoding="utf-8") as f:
     d = json.load(f)
        
train = train.rename(columns=d)
test = test.rename(columns=d)        

In [4]:
pub = pd.read_csv('../data/raw/published_land_price.csv')

In [5]:
pair = {"所在地コード":"市区町村コード","建蔽率":"建ぺい率（％）","容積率":"容積率（％）","駅名":"最寄駅：名称", 
        "地積":"面積（㎡）","市区町村名":"市区町村名",'前面道路の幅員':'前面道路：幅員（ｍ）', 
        "前面道路の方位区分":"前面道路：方位","前面道路区分":"前面道路：種類","形状区分":"土地の形状",
        "用途区分":"都市計画"
         }

In [6]:
pub = pub.rename(columns=pair)

In [7]:
pub = pub.rename(columns=d)

In [8]:
pub.columns[:100]

Index(['id', '経度', '緯度', 'MunicipalityCode', 'Use', '連番', '年次', '前年所在地コード',
       '前年用途', '前年連番', 'Municipality', '住居表示', '行政', 'Area', '利用の現況', '利用状況表示',
       '建物構造', '施設', 'LandShape', '間口（比率）', '奥行（比率）', '階層（地上）', '階層（地下）',
       'Classification', 'Direction', 'Breadth', '前面道路の駅前区分', '前面道路の舗装状況',
       '側道区分', '側道方位区分', '交通施設との近接区分', '周辺の土地の利用の現況', 'NearestStation', '駅距離',
       'CityPlanning', '防火区分', '都市計画区分', '森林区分', '公園区分', 'CoverageRatio',
       'FloorAreaRatio', '共通地点区分', '選定年次ビット', 'Ｓ５８価格', 'Ｓ５９価格', 'Ｓ６０価格',
       'Ｓ６１価格', 'Ｓ６２価格', 'Ｓ６３価格', 'Ｈ１価格', 'Ｈ２価格', 'Ｈ３価格', 'Ｈ４価格', 'Ｈ５価格',
       'Ｈ６価格', 'Ｈ７価格', 'Ｈ８価格', 'Ｈ９価格', 'Ｈ１０価格', 'Ｈ１１価格', 'Ｈ１２価格', 'Ｈ１３価格',
       'Ｈ１４価格', 'Ｈ１５価格', 'Ｈ１６価格', 'Ｈ１７価格', 'Ｈ１８価格', 'Ｈ１９価格', 'Ｈ２０価格', 'Ｈ２１価格',
       'Ｈ２２価格', 'Ｈ２３価格', 'Ｈ２４価格', 'Ｈ２５価格', 'Ｈ２６価格', 'Ｈ２７価格', 'Ｈ２８価格', 'Ｈ２９価格',
       'Ｈ３０価格', 'Ｈ３１価格', '属性移動Ｓ５９', '属性移動Ｓ６０', '属性移動Ｓ６１', '属性移動Ｓ６２', '属性移動Ｓ６３',
       '属性移動Ｈ１', '属性移動Ｈ２', '属性移動Ｈ３', '属性移動Ｈ４', '属性移動Ｈ５', '属性移動Ｈ６', '属性移動Ｈ７',
     

## 不要なカラムを落とす 

In [9]:
delete_columns = ['id', 'Prefecture', 'Municipality', 'DistrictName']
train = train.drop(delete_columns, axis=1)
test = test.drop(delete_columns, axis=1)
train = train.drop_duplicates()

In [10]:
train = train.drop(index=[48111, 76360])
y = train['y']
train = train.drop('y', axis=1)
y[320210] = 4500.0

In [11]:
data = pd.concat([train, test])

## stationの整形

In [12]:
Near = data['NearestStation'].str.split('(', expand=True)
data = data.drop('NearestStation', axis=1)
data['NearestStation'] = Near[0]

In [13]:
new_columns = ['latitude', 'longitude','h31_price']

data['NearestStation+MunicipalityCode'] = data['NearestStation'] + '+' + data['MunicipalityCode'].astype(str)

for i in new_columns:
    data[i] =data['NearestStation+MunicipalityCode']

def get_mean(search, cat):
        leng = search.shape[0]
        sum = 0
        for i in range(leng):
            if search.iloc[i][cat] != 0:
                sum += search.iloc[i][cat]
        return sum/leng

In [14]:
new_columns = ['latitude', 'longitude','h31_price', 'h30_price', 'h29_price', 'h28_price', 'h27_price', 'SerialNumber']

data['NearestStation+MunicipalityCode'] = data['NearestStation'] + '+' + data['MunicipalityCode'].astype(str)

for i in new_columns:
    data[i] =data['NearestStation+MunicipalityCode']

def get_mean(search, cat):
        leng = search.shape[0]
        sum = 0
        for i in range(leng):
            if search.iloc[i][cat] != 0:
                sum += search.iloc[i][cat]
        return sum/leng

for i in tqdm(data['NearestStation+MunicipalityCode'].unique()):
    try:
            ns, mc = i.split('+')
            search = pub[pub['MunicipalityCode'] == int(mc)][pub['NearestStation'] == ns]
            try:
                data['latitude'] = data['latitude'].replace(i, get_mean(search, '緯度'))
                data['longitude'] = data['longitude'].replace(i, get_mean(search, '経度'))
                data['h31_price'] = data['h31_price'].replace(i, get_mean(search, 'Ｈ３１価格'))
                data['h30_price'] = data['h30_price'].replace(i, get_mean(search, 'Ｈ３０価格'))
                data['h29_price'] = data['h29_price'].replace(i, get_mean(search, 'Ｈ２９価格'))
                data['h28_price'] = data['h28_price'].replace(i, get_mean(search, 'Ｈ２８価格'))
                data['h27_price'] = data['h27_price'].replace(i, get_mean(search, 'Ｈ２７価格'))
                
                data['SerialNumber'] = data['SerialNumber'].replace(i, get_mean(search, '連番'))
            except:
                try:
                    search = pub[pub['NearestStation'] == ns]
                    data['latitude'] = data['latitude'].replace(i, get_mean(search, '緯度'))
                    data['longitude'] = data['longitude'].replace(i, get_mean(search, '経度'))
                    data['h31_price'] = data['h31_price'].replace(i, get_mean(search, 'Ｈ３１価格'))
                    data['h30_price'] = data['h30_price'].replace(i, get_mean(search, 'Ｈ３０価格'))
                    data['h29_price'] = data['h29_price'].replace(i, get_mean(search, 'Ｈ２９価格'))
                    data['h28_price'] = data['h28_price'].replace(i, get_mean(search, 'Ｈ２８価格'))
                    data['h27_price'] = data['h27_price'].replace(i, get_mean(search, 'Ｈ２７価格'))                    
                    data['SerialNumber'] = data['SerialNumber'].replace(i, get_mean(search, '連番'))                    
                except:
                    data['latitude'] = data['latitude'].replace(i, np.nan)
                    data['longitude'] = data['longitude'].replace(i, np.nan)
                    data['h31_price'] = data['h31_price'].replace(i, np.nan) 
                    data['h30_price'] = data['h30_price'].replace(i, np.nan)
                    data['h29_price'] = data['h29_price'].replace(i, np.nan)
                    data['h28_price'] = data['h28_price'].replace(i, np.nan)
                    data['h27_price'] = data['h27_price'].replace(i, np.nan)                    
                    data['SerialNumber'] = data['SerialNumber'].replace(i, np.nan)                    
    except:
            continue

HBox(children=(FloatProgress(value=0.0, max=1138.0), HTML(value='')))




In [15]:
data = data.drop('NearestStation+MunicipalityCode', axis=1)

In [16]:
data['h31_price'] = data['h31_price'].replace(0, np.nan) 
data['h30_price'] = data['h30_price'].replace(0, np.nan)
data['h29_price'] = data['h29_price'].replace(0, np.nan)
data['h28_price'] = data['h28_price'].replace(0, np.nan)
data['h27_price'] = data['h27_price'].replace(0, np.nan) 

In [17]:
price_df = pd.DataFrame({'h31_price': data['h31_price'].values, 'h30_price': data['h30_price'].values, 
                         'h29_price': data['h29_price'].values, 'h28_price': data['h28_price'].values, 
                              'h27_price' : data['h27_price'].values})

In [18]:
price_df['h31_27_mean'] = price_df.mean(axis='columns')

In [19]:
price_df['31-28'] = price_df['h31_price'] - price_df['h28_price']

In [20]:
price_df.columns

Index(['h31_price', 'h30_price', 'h29_price', 'h28_price', 'h27_price',
       'h31_27_mean', '31-28'],
      dtype='object')

In [21]:
price_df = price_df.drop(['h30_price', 'h29_price', 'h28_price', 'h27_price'], axis=1)

In [22]:
data['h31_27_mean'] = price_df['h31_27_mean']

In [23]:
data['31-28'] = price_df['31-28']

## 数値データの整形

In [24]:
def calc_mean_median(df, df_type):
    dumy = df
    dumy = dumy.dropna()
    dumy = dumy.astype(df_type)
    df = df.fillna(dumy.mean())
    df = df.astype(df_type)
    return df

### 最寄駅：距離（分） 

In [25]:
data['TimeToNearestStation'] = data['TimeToNearestStation'].replace('2H?', '120').replace('30分?60分', '45'). \
        replace('1H30?2H', '105').replace('1H?1H30', '75')

In [26]:
data['TimeToNearestStation'] = calc_mean_median(data['TimeToNearestStation'], np.int64)

### 面積

In [27]:
dumy_area = data['Area'].replace('2000㎡以上', np.nan).replace('5000㎡以上', np.nan)
dumy_area = dumy_area.dropna()
dumy_area = dumy_area.astype(np.int64)
# 2000以上5000以下の平均値を取得
area_mean_2000_5000 = np.mean([i for i in dumy_area if i > 2000 and i < 5000])
# 5000以上はないので5500で置換
# 欠損値を埋めるための平均と中央値
dim_mean = dumy_area.mean()
dim_median = dumy_area.median()

In [28]:
data['Area'] = data['Area'].replace('2000㎡以上',  area_mean_2000_5000).replace('5000㎡以上', 5500)
data['Area'] =  data['Area'].fillna(dim_mean)  
data['Area'] = data['Area'].astype(np.int64)

### 間口

In [29]:
data['Frontage'] = data['Frontage'].replace('50.0m以上', '60.0')

In [30]:
data['Frontage'] =   calc_mean_median(data['Frontage'], np.float32)

### 延床面積（㎡）

In [31]:
data['TotalFloorArea'] = data['TotalFloorArea'].replace('2000㎡以上', '2500').replace( '10m^2未満', '5')

In [32]:
data['TotalFloorArea'] =  calc_mean_median(data['TotalFloorArea'], np.int64)

### 築年数

In [33]:
tiku_columns = ['昭和59年', '平成15年', '平成24年', '昭和61年', '平成11年', '昭和60年', '平成19年',
       '平成10年', '昭和57年', '昭和45年', '昭和47年', '昭和43年', '昭和64年', '平成25年',
       '平成16年', '平成9年', '平成5年', '昭和51年', '平成13年', '昭和52年', '昭和55年',
       '昭和54年', '平成2年', '平成21年', '平成22年', '平成18年', '平成14年', '平成6年',
       '平成17年', '昭和62年', '平成12年', '昭和38年', '昭和41年', '昭和48年', '平成20年',
       '昭和53年', '昭和49年', '平成26年', '昭和58年', '昭和56年', '昭和35年', '昭和50年',
       '昭和37年', '平成23年', '平成7年', '平成3年', '昭和63年', '戦前', '平成29年', '昭和36年',
       '平成28年', '昭和39年', '平成4年', '平成27年', '昭和42年', '昭和31年', '平成30年',
       '昭和44年', '昭和29年', '平成8年', '昭和27年', '昭和46年', '昭和30年', '昭和33年',
       '昭和32年', '昭和40年', '昭和26年', '昭和34年', '昭和25年', '昭和28年', '昭和22年',
       '昭和24年', '昭和23年', '昭和21年', '平成31年']

for i in tiku_columns:
    if '平成' in i:
        year = int(i.replace('年', '').replace('平成', '')) +2000-12
        data['BuildingYear'] = data['BuildingYear'].replace(i, year)
    if '昭和' in i:
        year = int(i.replace('年', '').replace('昭和', '')) + 1925
        data['BuildingYear'] = data['BuildingYear'].replace(i, year)
        

In [34]:
data['BuildingYear'].unique()

array([1984, 2003, 2012, 1986, 1999, 1985, 2007, 1998, nan, 1982, 1970,
       1972, 1968, 1989, 2013, 2004, 1997, 1993, 1976, 2001, 1977, 1980,
       1979, 1990, 2009, 2010, 2006, 2002, 1994, 2005, 1987, 2000, 1963,
       1966, 1973, 2008, 1978, 1974, 2014, 1983, 1981, 1960, 1975, 1962,
       2011, 1995, 1991, 1988, '戦前', 2017, 1961, 2016, 1964, 1992, 2015,
       1967, 1956, 2018, 1969, 1954, 1996, 1952, 1971, 1955, 1958, 1957,
       1965, 1951, 1959, 1950, 1953, 1947, 1949, 1948, 1946, 2019],
      dtype=object)

In [35]:
data['BuildingYear'] = data['BuildingYear'].replace('戦前', 1960)

In [36]:
data['BuildingYear'] =  calc_mean_median(data['BuildingYear'], np.int64)

In [37]:
data['BuildingYear'] = data['BuildingYear'] -1900

## 前面道路：幅員（ｍ）

In [38]:
data['Breadth']=  calc_mean_median(data['Breadth'], np.float64)

###  建ぺい率（％) 容積率（％） 

In [39]:
data['CoverageRatio']=  calc_mean_median(data['CoverageRatio'], np.float64)
data['FloorAreaRatio']=  calc_mean_median(data['FloorAreaRatio'], np.float64)

### 取引時点

In [40]:
torihiki_columns = data['Period'].unique()

In [41]:
for i, value in enumerate(sorted(torihiki_columns)):
    data['Period'] =  data['Period'].replace(value, i)

## カテゴリーデータの整形

### 最寄駅：名称 - labelエンコード

### 間取り

In [42]:
data['L'] = data['FloorPlan'].map(lambda x: 1 if 'Ｌ' in str(x) else 0)
data['D'] = data['FloorPlan'].map(lambda x: 1 if 'Ｄ' in str(x) else 0)
data['K'] = data['FloorPlan'].map(lambda x: 1 if 'Ｋ' in str(x) else 0)
data['S'] = data['FloorPlan'].map(lambda x: 1 if 'Ｓ' in str(x) else 0)
data['R'] = data['FloorPlan'].map(lambda x: 1 if 'Ｒ' in str(x) else 0)
data['Maisonette'] = data['FloorPlan'].map(lambda x: 1 if 'メゾネット' in str(x) else 0)
data['OpenFloor'] = data['FloorPlan'].map(lambda x: 1 if 'オープンフロア' in str(x) else 0)
data['Studio'] = data['FloorPlan'].map(lambda x: 1 if 'スタジオ' in str(x) else 0)

In [43]:
use_dummy =data['Use'].fillna('missing')
use_dummy=  use_dummy.str.replace('共同住宅', '共同')
for i in use_dummy.unique():
    new_use = ''
    if '住宅' in i:
        new_use += '1'
    else:
        new_use += '0'
    new_use += ','
    if '事務所' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '店舗' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if 'その他' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '倉庫' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '駐車場' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '工場' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '共同' in i:
        new_use += '1'
    else:
        new_use += '0'        
    new_use += ','
    if '作業場' in i:
        new_use += '1'
    else:
        new_use += '0'        
    use_dummy = use_dummy.replace(i, new_use)

In [44]:
data_use = use_dummy.str.split(',', expand=True)
use_columns={0: 'Housing', 1: 'office', 2: 'store', 3: 'other_use', 4: 'Warehouse', 5: 'parking', 6: 'plant', 7: 'shareHouse', 8: 'Workshop'}
data_use = data_use.rename(columns=use_columns)
for i in data_use.columns:
    data_use[i] = data_use[i].astype(np.int64)
for i in data_use.columns:
    data[i] = data_use[i]    

## カウント数

In [45]:
categorical_features = ['Type', 'Region', 'NearestStation', 'FloorPlan','LandShape', 'Structure', 'Use', 'Purpose', 'Direction', \
         'Classification', 'CityPlanning', 'Renovation', 'Remarks']

In [46]:
for c in categorical_features:
    data[c] =  data[c].fillna('missing') 

In [47]:
from sklearn.preprocessing import LabelEncoder

for c in categorical_features:
    le = LabelEncoder()
    le.fit(data[c])
    data[c] = le.transform(data[c])

In [48]:
data['longitude'] = data['longitude'] - 500000

In [49]:
data['latitude'] = data['latitude'] -120000

In [50]:
price_cols = ['h31_price','h30_price','h29_price','h28_price', 'h27_price', 'h31_27_mean']   
for i in price_cols:
    data[i] = data[i] /1000

In [51]:
data = data.drop(['h29_price','h28_price', 'h27_price'], axis =1)

In [52]:
train = data[:len(train)]
test = data[len(train):]

In [53]:
print(len(train))
print(len(y))

352855
352855


In [54]:
train['y'] = y

In [55]:
drop_columns = ['OpenFloor', 'Workshop', 'D', 'S', 'Studio', 'Maisonette', 'K', 'R']
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)
# highのtrainに幅を持たせるのは多分正解
train_high = train.query('MunicipalityCode < 13150')
train_low = train.query('(MunicipalityCode < 13150 and Area < 600) or MunicipalityCode > 13150')
train_high_y = train_high['y']
train_high = train_high.drop('y', axis=1)

train_low_y = train_low['y']
train_low = train_low.drop('y', axis=1)

test = test.reset_index()

# 800のほうがいいかもしれない
test_high = test.query('MunicipalityCode < 13150 and Area >= 2200')
test_low = test.query('(MunicipalityCode < 13150 and Area < 2200) or MunicipalityCode > 13150')

test_high_data = test_high.drop('index', axis=1)
test_low_data = test_low.drop('index', axis=1)

## 学習

In [57]:
### lowの予測
scores = []
y_low_pred = np.zeros(test_low_data.shape[0])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_low):
    tr_x, va_x = train_low.iloc[tr_idx], train_low.iloc[va_idx]
    tr_y, va_y = train_low_y.iloc[tr_idx], train_low_y.iloc[va_idx]

    train_data = lgb.Dataset(tr_x, tr_y)
    valid_data = lgb.Dataset(va_x, va_y)
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'learning_rate': 0.001,
        'max_depth': -1,
        'num_leaves': 255,
        'max_bin': 255,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'nthread': -1,
        'bagging_freq': 1,
        'verbose': -1,
        'seed': 1
#         'seed': random.randint(1, 100),
    }
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data],
                      num_boost_round=5000, early_stopping_rounds=200,
                      verbose_eval=200)

    y_val_pred = model.predict(va_x)
    print(len(va_x), len(y_val_pred), len(va_y))
    val_score = np.sqrt(mean_squared_error(va_y, y_val_pred))
    y_low_pred  += model.predict(test_low_data, num_iteration=model.best_iteration)
    scores.append(val_score)

print('RMSE:', np.mean(scores))

Training until validation scores don't improve for 200 rounds
[200]	training's mape: 1.3694	valid_1's mape: 1.37107
[400]	training's mape: 1.18655	valid_1's mape: 1.18746
[600]	training's mape: 1.03768	valid_1's mape: 1.03803
[800]	training's mape: 0.914824	valid_1's mape: 0.915126
[1000]	training's mape: 0.812328	valid_1's mape: 0.812672
[1200]	training's mape: 0.727157	valid_1's mape: 0.727548
[1400]	training's mape: 0.657173	valid_1's mape: 0.657917
[1600]	training's mape: 0.599412	valid_1's mape: 0.600499
[1800]	training's mape: 0.552306	valid_1's mape: 0.554076
[2000]	training's mape: 0.512842	valid_1's mape: 0.515097
[2200]	training's mape: 0.480137	valid_1's mape: 0.482862
[2400]	training's mape: 0.452723	valid_1's mape: 0.455925
[2600]	training's mape: 0.429824	valid_1's mape: 0.433518
[2800]	training's mape: 0.410399	valid_1's mape: 0.414655
[3000]	training's mape: 0.393928	valid_1's mape: 0.398724
[3200]	training's mape: 0.379836	valid_1's mape: 0.385247
[3400]	training's map

In [58]:
### highの予測
scores = []
y_high_pred = np.zeros(test_high_data.shape[0])
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_high):
    tr_x, va_x = train_high.iloc[tr_idx], train_high.iloc[va_idx]
    tr_y, va_y = train_high_y.iloc[tr_idx], train_high_y.iloc[va_idx]

    train_data = lgb.Dataset(tr_x, tr_y)
    valid_data = lgb.Dataset(va_x, va_y)
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'learning_rate': 0.001,
        'max_depth': -1,
        'num_leaves': 200,
        'max_bin': 255,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'nthread': -1,
        'bagging_freq': 1,
        'verbose': -1,
        'seed': 1
#         'seed': random.randint(1, 100),
    }
    model = lgb.train(params, train_data, valid_sets=[train_data, valid_data],
                      num_boost_round=5000, early_stopping_rounds=100,
                      verbose_eval=200)

    y_val_pred = model.predict(va_x)
    val_score = np.sqrt(mean_squared_error(va_y, y_val_pred*1.03))
    y_high_pred  += model.predict(test_high_data, num_iteration=model.best_iteration)
    scores.append(val_score)

print('RMSE:', np.mean(scores))

Training until validation scores don't improve for 100 rounds
[200]	training's mape: 1.77796	valid_1's mape: 1.76744
[400]	training's mape: 1.51	valid_1's mape: 1.5034
[600]	training's mape: 1.29204	valid_1's mape: 1.28918
[800]	training's mape: 1.11387	valid_1's mape: 1.11435
[1000]	training's mape: 0.966369	valid_1's mape: 0.969659
[1200]	training's mape: 0.844387	valid_1's mape: 0.850026
[1400]	training's mape: 0.745001	valid_1's mape: 0.752547
[1600]	training's mape: 0.663262	valid_1's mape: 0.672312
[1800]	training's mape: 0.597205	valid_1's mape: 0.607546
[2000]	training's mape: 0.542478	valid_1's mape: 0.553873
[2200]	training's mape: 0.497541	valid_1's mape: 0.509854
[2400]	training's mape: 0.460608	valid_1's mape: 0.473583
[2600]	training's mape: 0.430516	valid_1's mape: 0.444136
[2800]	training's mape: 0.405347	valid_1's mape: 0.419433
[3000]	training's mape: 0.384744	valid_1's mape: 0.399441
[3200]	training's mape: 0.367741	valid_1's mape: 0.382915
[3400]	training's mape: 0.

In [59]:
pd.DataFrame(y_low_pred).describe()

Unnamed: 0,0
count,34816.0
mean,240.702939
std,474.624123
min,-45.58407
25%,94.800827
50%,146.922922
75%,218.370785
max,15168.050016


In [60]:
pd.DataFrame(y_high_pred).describe()

Unnamed: 0,0
count,28.0
mean,10019.169594
std,9218.684459
min,2811.18722
25%,4877.817614
50%,6872.930366
75%,10584.470095
max,39720.328352


In [62]:
test_high['pred'] = y_high_pred
test_low['pred'] = y_low_pred
test = pd.concat((test_high, test_low))
test = test.sort_values('index')
y_pred  = test['pred']
y_pred = np.round(y_pred, 2)

In [63]:
submit = pd.read_csv('../data/raw/test_data.csv')
sub = pd.DataFrame({'id': submit['id'], 'y': y_pred})
sub.query('y < 0')

Unnamed: 0,id,y
34776,34777,-45.58
34806,34807,-42.94
34817,34818,-41.92
34818,34819,-15.77
34820,34821,-29.33


In [64]:
for i in sub[sub['y'] < 0]['id']:
    sub['y'][i-1] = 0 

In [66]:
sub.to_csv('sub_lgbm_6.csv', index=False)

In [65]:
sub['y'].describe()

count    34844.000000
mean       248.565731
std        606.396426
min          0.000000
25%         94.830000
50%        147.010000
75%        218.760000
max      39720.330000
Name: y, dtype: float64