In [1]:
import pandas as pd
import numpy as np
from copy import copy

### データの読み込み

In [2]:
train = pd.read_csv('tutorial_preprocessed_train.csv', parse_dates=['date'])
weather = pd.read_csv('tutorial_preprocessed_weather.csv')
test = pd.read_csv('tutorial_preprocessed_test.csv', parse_dates=['date'])

train['month'] = train.date.dt.month
test['month'] = test.date.dt.month

display(train.head())
display(test.head())

Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month
0,だいこん,2005-11-07,201445.0,735.0,千葉,2005,45,11
1,だいこん,2005-11-08,189660.0,840.0,千葉_全国_青森,2005,45,11
2,だいこん,2005-11-10,218166.0,735.0,千葉_全国_青森,2005,45,11
3,だいこん,2005-11-11,182624.0,682.5,千葉_青森,2005,45,11
4,だいこん,2005-11-12,220691.0,682.5,千葉_青森,2005,45,11


Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month
0,だいこん,2022-11-01,,,千葉_全国_横浜,2022,44,11
1,だいこん,2022-11-02,,,千葉_全国_横浜,2022,44,11
2,だいこん,2022-11-04,,,千葉_全国_横浜,2022,44,11
3,だいこん,2022-11-05,,,千葉_全国_横浜,2022,44,11
4,だいこん,2022-11-07,,,千葉_全国_横浜,2022,45,11


### 1, 2, 3, 6, 9, 12か月前の気象情報を特徴量として追加

In [3]:
for i in [0, 1, 2, 3, 6, 9, 12]:
    cweather = copy(weather)
    cweather.month += 1
    cweather.loc[lambda x: x.month==13] = 1
    cweather.columns = [f'{c}_lag{i}' for c in cweather.columns]
    cweather = cweather.rename(columns={f'year_lag{i}': 'year', f'month_lag{i}': 'month', f'area_lag{i}': 'area'})
    train = train.merge(cweather, how='left', on=['year', 'month', 'area'])
    test = test.merge(cweather, how='left', on=['year', 'month', 'area'])
train.columns

Index(['kind', 'date', 'amount', 'mode_price', 'area', 'year', 'weekno',
       'month', 'mean_mean_temp_lag0', 'max_mean_temp_lag0',
       ...
       'min_min_temp_lag12', 'mean_sum_rain_lag12', 'max_sum_rain_lag12',
       'min_sum_rain_lag12', 'mean_sun_time_lag12', 'max_sun_time_lag12',
       'min_sun_time_lag12', 'mean_mean_humid_lag12', 'max_mean_humid_lag12',
       'min_mean_humid_lag12'],
      dtype='object', length=134)

### 過去30日間、過去365日間の平均卸値を特徴量として追加

In [4]:
mode_price_30days_ago = train[['kind', 'date', 'mode_price']].rename(columns={'mode_price': 'mode_price_30days_ago'})
mode_price_30days_ago.date += pd.Timedelta(days=30)
mode_price_30days_ago['year'] = mode_price_30days_ago.date.dt.year
mode_price_30days_ago['month'] = mode_price_30days_ago.date.dt.month
mode_price_30days_ago = mode_price_30days_ago.groupby(['kind', 'year', 'month']).mean().reset_index()
train = train.merge(mode_price_30days_ago, how='left', on=['kind', 'year', 'month'])
test = test.merge(mode_price_30days_ago, how='left', on=['kind', 'year', 'month'])

mode_price_365days_ago = train[['kind', 'date', 'mode_price']].rename(columns={'mode_price': 'mode_price_365days_ago'})
mode_price_365days_ago.date += pd.Timedelta(days=365)
mode_price_365days_ago.date.max(), mode_price_365days_ago.columns
mode_price_365days_ago['year'] = mode_price_365days_ago.date.dt.year
mode_price_365days_ago['month'] = mode_price_365days_ago.date.dt.month
mode_price_365days_ago = mode_price_365days_ago.groupby(['kind', 'year', 'month']).mean().reset_index()
train = train.merge(mode_price_365days_ago, how='left', on=['kind', 'year', 'month'])
test = test.merge(mode_price_365days_ago, how='left', on=['kind', 'year', 'month'])

display(train.head())
display(test.head())

  mode_price_30days_ago = mode_price_30days_ago.groupby(['kind', 'year', 'month']).mean().reset_index()
  mode_price_365days_ago = mode_price_365days_ago.groupby(['kind', 'year', 'month']).mean().reset_index()


Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,...,max_sum_rain_lag12,min_sum_rain_lag12,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago
0,だいこん,2005-11-07,201445.0,735.0,千葉,2005,45,11,18.83871,24.7,...,27.5,0.0,3.577419,11.1,0.0,79.064516,92.0,60.0,,
1,だいこん,2005-11-08,189660.0,840.0,千葉_全国_青森,2005,45,11,17.103797,22.54375,...,28.286458,0.0,4.408501,10.761458,0.0,74.557796,90.895833,57.635417,,
2,だいこん,2005-11-10,218166.0,735.0,千葉_全国_青森,2005,45,11,17.103797,22.54375,...,28.286458,0.0,4.408501,10.761458,0.0,74.557796,90.895833,57.635417,,
3,だいこん,2005-11-11,182624.0,682.5,千葉_青森,2005,45,11,16.454839,21.85,...,27.75,0.0,4.316129,10.9,0.0,76.322581,92.0,60.0,,
4,だいこん,2005-11-12,220691.0,682.5,千葉_青森,2005,45,11,16.454839,21.85,...,27.75,0.0,4.316129,10.9,0.0,76.322581,92.0,60.0,,


Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,...,max_sum_rain_lag12,min_sum_rain_lag12,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago
0,だいこん,2022-11-01,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,49.880208,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0
1,だいこん,2022-11-02,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,49.880208,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0
2,だいこん,2022-11-04,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,49.880208,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0
3,だいこん,2022-11-05,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,49.880208,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0
4,だいこん,2022-11-07,,,千葉_全国_横浜,2022,45,11,17.937923,24.811458,...,49.880208,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0


### 野菜毎にデータ数が同じになるように cross_validation

In [5]:
# 訓練データを2015年6月1日以降に限定
train = train[lambda x: x.date >= '2015-06-01']
#train = train[lambda x: x.date >= '2016-05-01']

from sklearn.model_selection import KFold

n_splits = 5
kf = KFold(n_splits=n_splits)

train['fold'] = -1
train_fold = []
for kind in test.kind.unique():
    train_kind = train[lambda x: x.kind==kind].copy()
    for i, (_, test_index) in enumerate(kf.split(train_kind)):
        train_kind.iloc[test_index, -1] = i
        train_fold.append(train_kind)
train_fold = pd.concat(train_fold, axis=0)

display(train_fold.groupby(['kind', 'fold']).count())
display(train_fold.groupby('fold').count())
display(train_fold.groupby(['kind', 'fold'])['date'].apply(lambda x: [x.min(), x.max()]))

Unnamed: 0_level_0,Unnamed: 1_level_0,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,min_mean_temp_lag0,...,max_sum_rain_lag12,min_sum_rain_lag12,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago
kind,fold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
かぼちゃ,0,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635,...,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635
かぼちゃ,1,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635,...,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635
かぼちゃ,2,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635,...,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635
かぼちゃ,3,1635,1635,1635,1635,1635,1635,1635,1635,1635,1635,...,1635,1635,1635,1635,1635,1635,1635,1635,1525,1635
かぼちゃ,4,1630,1630,1630,1630,1630,1630,1630,1630,1630,1630,...,1630,1630,1630,1630,1630,1630,1630,1630,1630,1630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
レタス,0,1920,1920,1920,1920,1920,1920,1920,1820,1820,1820,...,1820,1820,1820,1820,1820,1820,1820,1820,1920,1920
レタス,1,1920,1920,1920,1920,1920,1920,1920,1725,1725,1725,...,1725,1725,1725,1725,1725,1725,1725,1725,1920,1920
レタス,2,1915,1915,1915,1915,1915,1915,1915,1820,1820,1820,...,1820,1820,1820,1820,1820,1820,1820,1820,1915,1915
レタス,3,1915,1915,1915,1915,1915,1915,1915,1725,1725,1725,...,1725,1725,1725,1725,1725,1725,1725,1725,1915,1915


Unnamed: 0_level_0,kind,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,...,max_sum_rain_lag12,min_sum_rain_lag12,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27115,27115,27115,27115,27115,27115,27115,27115,25715,25715,...,25715,25715,25715,25715,25715,25715,25715,25715,26710,25995
1,27115,27115,27115,27115,27115,27115,27115,27115,24385,24385,...,24385,24385,24385,24385,24385,24385,24385,24385,26935,27115
2,27060,27060,27060,27060,27060,27060,27060,27060,25730,25730,...,25730,25730,25730,25730,25730,25730,25730,25730,26690,27060
3,27060,27060,27060,27060,27060,27060,27060,27060,24400,24400,...,24400,24400,24400,24400,24400,24400,24400,24400,26775,27060
4,27055,27055,27055,27055,27055,27055,27055,27055,25655,25655,...,25655,25655,25655,25655,25655,25655,25655,25655,26680,27055


kind  fold
かぼちゃ  0       [2015-06-01 00:00:00, 2016-10-15 00:00:00]
      1       [2016-10-17 00:00:00, 2018-05-08 00:00:00]
      2       [2018-05-10 00:00:00, 2019-10-08 00:00:00]
      3       [2019-10-10 00:00:00, 2021-05-21 00:00:00]
      4       [2021-05-22 00:00:00, 2022-10-31 00:00:00]
                                 ...                    
レタス   0       [2015-06-01 00:00:00, 2016-11-07 00:00:00]
      1       [2016-11-08 00:00:00, 2018-04-26 00:00:00]
      2       [2018-04-27 00:00:00, 2019-10-23 00:00:00]
      3       [2019-10-24 00:00:00, 2021-04-27 00:00:00]
      4       [2021-04-28 00:00:00, 2022-10-31 00:00:00]
Name: date, Length: 75, dtype: object

### kindをラベルエンコーディング

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_fold['kind_le'] = le.fit_transform(train_fold.kind)
test['kind_le'] = le.transform(test.kind)
display(train_fold.head())
display(test.head())

Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,...,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago,fold,kind_le
2585,だいこん,2015-06-01,175714.0,864.0,千葉_青森,2015,23,6,18.05,22.9,...,7.93871,13.15,0.0,65.516129,86.5,42.5,1141.714286,826.071429,0,4
2586,だいこん,2015-06-02,167112.0,756.0,千葉_青森,2015,23,6,18.05,22.9,...,7.93871,13.15,0.0,65.516129,86.5,42.5,1141.714286,826.071429,0,4
2587,だいこん,2015-06-04,202752.0,891.0,千葉_青森,2015,23,6,18.05,22.9,...,7.93871,13.15,0.0,65.516129,86.5,42.5,1141.714286,826.071429,0,4
2588,だいこん,2015-06-05,142763.0,648.0,千葉_全国_青森,2015,23,6,18.66045,23.346875,...,7.853044,13.155208,0.007292,64.958333,86.947917,42.270833,1141.714286,826.071429,0,4
2589,だいこん,2015-06-06,174205.0,810.0,千葉_青森,2015,23,6,18.05,22.9,...,7.93871,13.15,0.0,65.516129,86.5,42.5,1141.714286,826.071429,0,4


Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,month,mean_mean_temp_lag0,max_mean_temp_lag0,...,min_sum_rain_lag12,mean_sun_time_lag12,max_sun_time_lag12,min_sun_time_lag12,mean_mean_humid_lag12,max_mean_humid_lag12,min_mean_humid_lag12,mode_price_30days_ago,mode_price_365days_ago,kind_le
0,だいこん,2022-11-01,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0,4
1,だいこん,2022-11-02,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0,4
2,だいこん,2022-11-04,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0,4
3,だいこん,2022-11-05,,,千葉_全国_横浜,2022,44,11,17.937923,24.811458,...,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0,4
4,だいこん,2022-11-07,,,千葉_全国_横浜,2022,45,11,17.937923,24.811458,...,0.0,4.802148,10.76875,0.0,71.733724,91.34375,53.177083,984.857143,648.0,4


### 予測

In [7]:
import lightgbm as lgb

seed = 1234

params = {
    'boosting_type': 'gbdt',
    'metric': 'None',
    'objective': 'regression',
    'seed': seed,
    'learning_rate': 0.007,
}

import numpy as np

def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

In [None]:
target = 'mode_price'
pred_train = np.zeros(len(train_fold))
pred_test = []
for fold in range(n_splits):
    print(f'---- {fold} ----')
    drop_cols = ['kind', 'area', 'date', 'amount']#, 'area_le']
    train_X = train_fold.drop(drop_cols, axis=1).loc[lambda x: x.fold!=fold].drop('fold', axis=1)
    train_y = train_X.pop(target)
    valid_X = train_fold.drop(drop_cols, axis=1).loc[lambda x: x.fold==fold].drop('fold', axis=1)
    valid_y = valid_X.pop(target)
    train_w = 1 / np.square(train_y)
    valid_w = 1 / np.square(valid_y)
    feature_name = train_X.columns.tolist()
    train = lgb.Dataset(
        data=train_X,
        label=train_y,
        feature_name=feature_name,
        weight = train_w,
    )
    valid = lgb.Dataset(
        data=valid_X,
        label=valid_y,
        feature_name=feature_name,
        weight = valid_w,
    )
    model = lgb.train(
        params,
        train,
        valid_sets = [valid],
        num_boost_round = 1000,
        verbose_eval = 100,
        early_stopping_rounds = 100,
        feval = feval_rmspe,
	)
    pred_train[train_fold.fold==fold] = model.predict(valid_X)
    pred_test.append(model.predict(test[feature_name]))

---- 0 ----




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29740
[LightGBM] [Info] Number of data points in the train set: 108290, number of used features: 132
[LightGBM] [Info] Start training from score 109.760768
Training until validation scores don't improve for 100 rounds
[100]	valid_0's RMSPE: 0.488466
[200]	valid_0's RMSPE: 0.343305
[300]	valid_0's RMSPE: 0.29024
[400]	valid_0's RMSPE: 0.270924
[500]	valid_0's RMSPE: 0.264942
[600]	valid_0's RMSPE: 0.262365
[700]	valid_0's RMSPE: 0.261832


### CV確認

In [None]:
train_fold['pred'] = pred_train
print(rmspe(train_fold.mode_price, train_fold.pred))

for k in train_fold.kind.unique():
    print(k, len(test[lambda x: x.kind==k]), rmspe(train_fold.loc[lambda x: x.kind==k, 'mode_price'], train_fold.loc[lambda x: x.kind==k, 'pred']))
    
for m in train_fold.month.unique():
    print(m, len(test[lambda x: x.month==m]), rmspe(train_fold.loc[lambda x: x.month==m, 'mode_price'], train_fold.loc[lambda x: x.month==m, 'pred'])) 