In [68]:
# ライブラリーの読み込み
import numpy as np
import pandas as pd
import pmdarima as pm
from pmdarima import utils
from pmdarima import arima
from pmdarima import model_selection
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from matplotlib import pyplot as plt

# Plot style and default figure size
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [12, 9]

# Load the training and test sets.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Drop vegetables that appear only in the training data.
# .copy() detaches the filtered rows from the original frame so the column
# assignments below do not raise SettingWithCopyWarning.
kinds = test_df['kind'].unique()
train_df = train_df[train_df['kind'].isin(kinds)].copy()

# Calendar features derived from the YYYYMMDD `date` column.
train_df['date_dt'] = pd.to_datetime(train_df['date'], format='%Y%m%d')
train_df['month'] = train_df['date_dt'].dt.month
train_df['day'] = train_df['date_dt'].dt.day

# Half-of-month flag: 0 for days 1-15, 1 for day 16 onward.
train_df['early_and_late'] = (train_df['day'] > 15).astype('int64')
train_df

Unnamed: 0,kind,date,amount,mode_price,area,year,weekno,date_dt,month,day,early_and_late
0,だいこん,20051107,201445,735.0,千葉,2005,45,2005-11-07,11,7,0
1,だいこん,20051108,189660,840.0,千葉_各地_青森,2005,45,2005-11-08,11,8,0
2,だいこん,20051110,218166,735.0,千葉_各地_青森,2005,45,2005-11-10,11,10,0
3,だいこん,20051111,182624,682.5,千葉_青森,2005,45,2005-11-11,11,11,0
4,だいこん,20051112,220691,682.5,千葉_青森,2005,45,2005-11-12,11,12,0
...,...,...,...,...,...,...,...,...,...,...,...
77531,ミニトマト,20221025,22558,194.0,各地,2022,43,2022-10-25,10,25,1
77532,ミニトマト,20221027,52938,184.0,各地,2022,43,2022-10-27,10,27,1
77533,ミニトマト,20221028,28493,194.0,各地,2022,43,2022-10-28,10,28,1
77534,ミニトマト,20221029,43183,184.0,各地,2022,43,2022-10-29,10,29,1


In [71]:
# Descriptive statistics of mode_price for every
# (kind, year, month, half-of-month) group.
info = (
    train_df
    .groupby(['kind', 'year', 'month', 'early_and_late'])['mode_price']
    .describe()
)
info


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
kind,year,month,early_and_late,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
かぼちゃ,2005,11,0,7.0,1027.500000,128.086885,840.0,971.25,1050.0,1050.000,1260.0
かぼちゃ,2005,11,1,12.0,1190.000000,81.742389,1050.0,1155.00,1207.5,1260.000,1260.0
かぼちゃ,2005,12,0,12.0,1281.875000,206.945110,945.0,1128.75,1312.5,1391.250,1575.0
かぼちゃ,2005,12,1,12.0,2021.250000,352.180706,1680.0,1758.75,1890.0,2296.875,2677.5
かぼちゃ,2006,3,0,11.0,3154.772727,662.189035,1575.0,2887.50,3412.5,3412.500,4200.0
...,...,...,...,...,...,...,...,...,...,...,...
レタス,2022,8,1,11.0,1124.181818,136.221744,810.0,1134.00,1188.0,1188.000,1242.0
レタス,2022,9,0,11.0,1453.090909,317.136392,1080.0,1161.00,1404.0,1728.000,1836.0
レタス,2022,9,1,10.0,1652.400000,264.789728,1296.0,1458.00,1620.0,1836.000,2052.0
レタス,2022,10,0,11.0,1453.090909,228.407292,1080.0,1296.00,1512.0,1620.000,1728.0


In [72]:
# Write the grouped summary table `info` to a CSV file in the working directory.
info.to_csv('early_and_late.csv')

In [47]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error (RMSPE) as a fraction.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values. Each error is divided by the matching observed
        value, so a zero here yields ``inf``/``nan`` in the result.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        The RMSPE (e.g. ``0.1`` means 10 %).

    Notes
    -----
    Both inputs are converted to NumPy float arrays first, so values are
    paired by position. This lets plain lists/tuples be passed and keeps
    pandas Series with different index labels from being aligned by label
    (which would otherwise produce NaN).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Split into training and test data. The test set is everything from
# 2022-10-01 onward (roughly one month of observations); the training set is
# everything up to 2022-09-30.

# Keep only daikon radish ("だいこん") records from 2016 onward.
train_df = train_df.query('20160101 <= date')
train_df = train_df.query('kind == "だいこん"').reset_index(drop=True)

df_train = train_df.query('date <= 20220930').reset_index(drop=True)['mode_price']
df_test = train_df.query('20221001 <= date').reset_index(drop=True)['mode_price']

# Monthly means of the numeric columns. numeric_only=True is passed explicitly:
# relying on the default emits a FutureWarning on pandas 1.5 and raises on the
# string columns from pandas 2.0.
# NOTE: this rebinds train_df to the aggregated frame (the `kind` column is
# dropped), so the load/prepare cell must be re-run before re-running this one.
train_df = train_df.groupby(['year', 'month']).mean(numeric_only=True)

print(df_train.shape)
print(df_test.shape)

(1735,)
(22,)


  train_df = train_df.groupby(['year','month']).mean()


In [48]:
# Display the held-out test series (mode_price values).
df_test

0     1296.0
1     1188.0
2     1188.0
3     1296.0
4     1296.0
5     1188.0
6     1188.0
7     1188.0
8     1080.0
9     1080.0
10    1188.0
11    1080.0
12     972.0
13     864.0
14     864.0
15     972.0
16     756.0
17     702.0
18     648.0
19     648.0
20     648.0
21     648.0
Name: mode_price, dtype: float64

In [45]:
# NOTE(review): this cell repeats the previous `df_test` display. Its saved
# output (a single row) carries execution count 45, i.e. it was produced before
# the split cell (In [47]) that yields 22 rows, so it appears to be stale.
df_test

0    999.0
Name: mode_price, dtype: float64

In [42]:
# Build the model (auto-ARIMA): search seasonal ARIMA orders with a seasonal
# period of 52, logging every candidate (trace=True) and capping each fit at
# 10 optimizer iterations.
# n_jobs is intentionally not passed: auto_arima uses the stepwise search by
# default, which cannot run in parallel, so n_jobs=-1 was ignored and only
# produced a warning.
arima_model = pm.auto_arima(
    df_train,
    seasonal=True,
    m=52,
    trace=True,
    maxiter=10,
)



Performing stepwise search to minimize aic
 ARIMA(2,0,2)(1,0,1)[52] intercept   : AIC=1087.746, Time=1.97 sec
 ARIMA(0,0,0)(0,0,0)[52] intercept   : AIC=1133.461, Time=0.01 sec
 ARIMA(1,0,0)(1,0,0)[52] intercept   : AIC=1086.378, Time=0.66 sec
 ARIMA(0,0,1)(0,0,1)[52] intercept   : AIC=1098.342, Time=0.54 sec
 ARIMA(0,0,0)(0,0,0)[52]             : AIC=1361.875, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[52] intercept   : AIC=1084.646, Time=0.05 sec
 ARIMA(1,0,0)(0,0,1)[52] intercept   : AIC=1086.453, Time=0.91 sec
 ARIMA(1,0,0)(1,0,1)[52] intercept   : AIC=1088.381, Time=1.16 sec
 ARIMA(2,0,0)(0,0,0)[52] intercept   : AIC=1080.703, Time=0.07 sec
 ARIMA(2,0,0)(1,0,0)[52] intercept   : AIC=1082.672, Time=0.84 sec
 ARIMA(2,0,0)(0,0,1)[52] intercept   : AIC=1082.669, Time=0.56 sec
 ARIMA(2,0,0)(1,0,1)[52] intercept   : AIC=1084.666, Time=0.87 sec
 ARIMA(3,0,0)(0,0,0)[52] intercept   : AIC=1081.813, Time=0.05 sec
 ARIMA(2,0,1)(0,0,0)[52] intercept   : AIC=1084.699, Time=0.07 sec
 ARIMA(1,0,1)(0,0,0

In [44]:
# Forecast
## In-sample predictions over the training period
train_pred = arima_model.predict_in_sample()
## Out-of-sample forecast (with confidence intervals) over the test period
test_pred, test_pred_ci = arima_model.predict(
    n_periods=df_test.shape[0],
    return_conf_int=True
)
# Evaluate accuracy on the test data.
# np.asarray accepts both a pandas Series and a plain ndarray, so this works
# whichever of the two predict() returns, and the comparison is positional.
print(rmspe(df_test.values, np.asarray(test_pred)))

0.2717455573372251
