In [15]:
from pathlib import Path
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
sns.set(font="IPAexGothic")
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [16]:
# Load the preprocessed train/test tables and report their sizes.
train_df = pd.read_csv('tutorial_preprocessed_train.csv')
print(train_df.shape)
test_df = pd.read_csv('tutorial_preprocessed_test.csv')
print(test_df.shape)

# `date` is handled as an integer of the form YYYYMMDD (the year was already
# derived with `// 10000`), so the month is taken with integer arithmetic as
# well instead of a per-row string slice.
for frame in (train_df, test_df):
    frame['year'] = frame['date'] // 10000
    frame['month'] = frame['date'] // 100 % 100

# Remove vegetables from the training data that do not appear in the test data.
kinds = test_df['kind'].unique()
train_df = train_df[train_df['kind'].isin(kinds)]
print(train_df.shape)

# NOTE(review): the original cell read `all_df` without defining it in any
# visible cell, so a fresh-kernel run failed with NameError. It is rebuilt here
# from the filtered training rows followed by the test rows, restricted to the
# seven columns that the dummy rows below are laid out for — confirm this
# matches the frame that was originally used.
ALL_COLUMNS = ['kind', 'date', 'amount', 'mode_price', 'area', 'year', 'month']
all_df = pd.concat([train_df, test_df], ignore_index=True)[ALL_COLUMNS]

# Every (kind, year, month) combination that already has at least one row.
have_data_combs = [list(i) for i in all_df[['kind', 'year', 'month']].drop_duplicates().values]
# Set of tuples so each membership test below is a hash lookup, not a list scan.
seen_combs = {tuple(comb) for comb in have_data_combs}

# Period that must be fully covered: 2005-01 through 2022-10 inclusive.
FIRST_YEAR = 2005
LAST_YEAR, LAST_MONTH = 2022, 10

# One filler row per missing (kind, year, month): amount and mode_price are 0,
# area is '全国', and the day part of `date` is 99, which is not a real calendar
# day (presumably so filler rows can be told apart later — confirm).
dum_data = [
    [kind, year * 10000 + month * 100 + 99, 0, 0, '全国', year, month]
    for kind in kinds
    for year in range(FIRST_YEAR, LAST_YEAR + 1)
    for month in range(1, 13)
    if (year, month) <= (LAST_YEAR, LAST_MONTH)
    and (kind, year, month) not in seen_combs
]

dum_df = pd.DataFrame(dum_data, columns=all_df.columns)
all_df = pd.concat([all_df, dum_df]).reset_index(drop=True)
all_df.head()

(77751, 7)
(315, 7)
(61214, 8)


Unnamed: 0,kind,date,amount,mode_price,area,year,month
0,だいこん,20051107,201445.0,735.0,千葉,2005,11
1,だいこん,20051108,189660.0,840.0,千葉_全国_青森,2005,11
2,だいこん,20051110,218166.0,735.0,千葉_全国_青森,2005,11
3,だいこん,20051111,182624.0,682.5,千葉_青森,2005,11
4,だいこん,20051112,220691.0,682.5,千葉_青森,2005,11


In [17]:
# Load the preprocessed weather table, print its (rows, columns) size and
# preview the first rows.
wea_df = pd.read_csv(filepath_or_buffer='tutorial_preprocessed_weather.csv')
print(wea_df.shape)
wea_df.head(n=5)

(46872, 21)


Unnamed: 0,year,month,mean_mean_temp,max_mean_temp,min_mean_temp,mean_max_temp,max_max_temp,min_max_temp,mean_min_temp,max_min_temp,...,mean_sum_rain,max_sum_rain,min_sum_rain,mean_sun_time,max_sun_time,min_sun_time,mean_mean_humid,max_mean_humid,min_mean_humid,area
0,2004.0,11.0,14.596,19.3,9.6,18.34,22.2,12.4,11.228,17.4,...,5.84,87.0,0.0,5.084,9.5,0.0,71.76,93.0,43.0,千葉
1,2004.0,12.0,9.641935,19.0,2.2,13.564516,24.3,3.8,6.13871,11.5,...,2.435484,26.0,0.0,5.509677,9.4,0.0,58.612903,86.0,37.0,千葉
2,2005.0,1.0,6.277419,13.1,3.1,10.46129,17.2,5.2,2.522581,6.4,...,3.33871,66.5,0.0,6.612903,9.8,0.0,54.83871,89.0,32.0,千葉
3,2005.0,2.0,5.960714,12.0,3.3,9.864286,15.4,5.1,2.060714,4.6,...,1.910714,11.5,0.0,5.225,9.7,0.0,55.714286,93.0,31.0,千葉
4,2005.0,3.0,8.8,13.7,1.6,13.122581,18.4,3.6,4.596774,10.0,...,2.612903,22.5,0.0,5.529032,11.3,0.0,59.129032,91.0,31.0,千葉


ラグ特徴量を作ります。以下では1,2,3,6,9,12ヶ月前の天候を特徴量としています。

In [18]:
def add_weather_feat(all_df, nshift, weather_df=None):
    """Attach the weather observed `nshift` months earlier to every row.

    For each row of `all_df`, the (year, month) pair is moved back by `nshift`
    months and the matching weather row (same shifted year/month and same
    `area`) is left-joined onto it. Every weather column other than
    `year`, `month` and `area` is suffixed with `_{nshift}prev`.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Must contain `year`, `month` and `area` columns. It is not modified.
    nshift : int
        Number of months to look back. Any non-negative value works,
        including lags longer than 12 months.
    weather_df : pandas.DataFrame, optional
        Weather table with `year`, `month`, `area` plus feature columns.
        Defaults to the notebook-level `wea_df`.

    Returns
    -------
    pandas.DataFrame
        The rows of `all_df` (left join, fresh 0..n-1 index) followed by the
        lagged weather columns. Rows without a matching weather record get
        NaN in the new columns.
    """
    if weather_df is None:
        # Fall back to the table loaded earlier in the notebook.
        weather_df = wea_df

    key_cols = ('year', 'month', 'area')
    renames = {
        col: f'{col}_{nshift}prev'
        for col in weather_df.columns
        if col not in key_cols
    }
    renames.update({'year': 'merge_year', 'month': 'merge_month'})
    lagged_weather = weather_df.rename(columns=renames)

    # Work on a single "months since year 0" axis so that the wrap-around is
    # correct for any lag, not only for lags of at most 12 months.
    month_index = all_df['year'] * 12 + (all_df['month'] - 1) - nshift

    # `assign` keeps the keys aligned with the rows of `all_df` whatever its
    # index looks like (a positional concat would misalign a filtered frame).
    keyed = all_df.assign(
        merge_year=month_index // 12,
        merge_month=month_index % 12 + 1,
    )

    merged = pd.merge(
        keyed,
        lagged_weather,
        on=['merge_year', 'merge_month', 'area'],
        how='left',
    )
    return merged.drop(columns=['merge_year', 'merge_month'])


# Start from a copy of the combined table and append one set of lagged weather
# columns per look-back distance (in months).
mer_df = all_df.copy()

for nshift in (1, 2, 3, 6, 9, 12):
    mer_df = add_weather_feat(all_df=mer_df, nshift=nshift)

print(mer_df.shape)
mer_df.head(n=5)

(61959, 115)


Unnamed: 0,kind,date,amount,mode_price,area,year,month,mean_mean_temp_1prev,max_mean_temp_1prev,min_mean_temp_1prev,...,min_min_temp_12prev,mean_sum_rain_12prev,max_sum_rain_12prev,min_sum_rain_12prev,mean_sun_time_12prev,max_sun_time_12prev,min_sun_time_12prev,mean_mean_humid_12prev,max_mean_humid_12prev,min_mean_humid_12prev
0,だいこん,20051107,201445.0,735.0,千葉,2005,11,18.83871,24.7,13.4,...,6.4,5.84,87.0,0.0,5.084,9.5,0.0,71.76,93.0,43.0
1,だいこん,20051108,189660.0,840.0,千葉_全国_青森,2005,11,17.103797,22.54375,11.726042,...,3.5625,3.898958,47.526042,0.0,4.323542,9.292708,0.0,69.999167,89.989583,48.197917
2,だいこん,20051110,218166.0,735.0,千葉_全国_青森,2005,11,17.103797,22.54375,11.726042,...,3.5625,3.898958,47.526042,0.0,4.323542,9.292708,0.0,69.999167,89.989583,48.197917
3,だいこん,20051111,182624.0,682.5,千葉_青森,2005,11,16.454839,21.85,11.15,...,3.2,4.12,50.0,0.0,3.762,9.2,0.0,71.06,91.0,47.5
4,だいこん,20051112,220691.0,682.5,千葉_青森,2005,11,16.454839,21.85,11.15,...,3.2,4.12,50.0,0.0,3.762,9.2,0.0,71.06,91.0,47.5
