## knock 090 時系列データを分割する

In [27]:
import polars as pl
pl.Config.set_tbl_cols(-1)  # Keep table columns from being elided when a frame is displayed
import polars.selectors as cs  # Column-selector presets

### データを読み込む

In [28]:
# Read the receipt detail CSV into a polars DataFrame and preview its first rows.
df_receipt = pl.read_csv(
    "../docker/work/data/receipt.csv",
)
display(df_receipt.head(5))

sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount
i64,i64,str,i64,i64,str,str,i64,i64
20181103,1541203200,"""S14006""",112,1,"""CS006214000001…","""P070305012""",1,158
20181118,1542499200,"""S13008""",1132,2,"""CS008415000097…","""P070701017""",1,81
20170712,1499817600,"""S14028""",1102,1,"""CS028414000014…","""P060101005""",1,170
20190205,1549324800,"""S14042""",1132,1,"""ZZ000000000000…","""P050301001""",1,25
20180821,1534809600,"""S14025""",1102,2,"""CS025415000050…","""P060102007""",1,90


### ノック

In [29]:
# List the distinct sales year-months (YYYYMM = sales_ymd // 100) in ascending order.
# Original note: the data runs from January 2017 through October 2019 — TODO confirm
# against the full (untruncated) output.
(df_receipt["sales_ymd"] // 100).unique().sort()

sales_ymd
i64
201701
201702
201703
201704
201705
201706
201707
201708
201709
201710


In [30]:
# Rolling-window split of the monthly sales series: within each 18-month window the
# first 12 months are flagged as training data (0) and the last 6 as test data (1).
flg_series = pl.Series("test_flg", [0] * 12 + [1] * 6)

df_ts = (
    df_receipt
    # Add a sales year-month column (YYYYMM) derived from sales_ymd (YYYYMMDD).
    .with_columns(
        (pl.col("sales_ymd") // 100).alias("sales_ym")
    )
    # Total sales amount per month.
    # The key is passed positionally: recent polars releases treat keyword
    # arguments of group_by as *named* key expressions, so `by="sales_ym"` would
    # yield a key column called "by" and break the sort below.
    .group_by("sales_ym")
    .agg(pl.sum("amount").alias("sum_amount"))
    # group_by does not guarantee row order, so sort ascending by month.
    .sort("sales_ym")
)

# The last window below ends at row 30; fail early with a clear message instead of
# a shape mismatch when the 18-element flag column is attached to a shorter slice.
assert df_ts.height >= 30, (
    f"need at least 30 monthly rows for three 18-month windows, got {df_ts.height}"
)

# Build one 18-month dataset per window, shifting the start by 6 months each time.
# Each frame holds both the training rows and the test rows, told apart by test_flg.
df_train_1 = df_ts[0:18].with_columns(flg_series)
df_train_2 = df_ts[6:24].with_columns(flg_series)
df_train_3 = df_ts[12:30].with_columns(flg_series)

display(df_train_1.head(), df_train_2.head(), df_train_3.head())

sales_ym,sum_amount,test_flg
i64,i64,i64
201701,902056,0
201702,764413,0
201703,962945,0
201704,847566,0
201705,884010,0


sales_ym,sum_amount,test_flg
i64,i64,i64
201707,959205,0
201708,954836,0
201709,902037,0
201710,905739,0
201711,932157,0


sales_ym,sum_amount,test_flg
i64,i64,i64
201801,944509,0
201802,864128,0
201803,946588,0
201804,937099,0
201805,1004438,0


In [31]:
# Alternative solution: let scikit-learn generate the rolling train/test windows.
from sklearn.model_selection import TimeSeriesSplit

# Monthly sales totals. The key column keeps the name 'sales_ymd' even though it
# holds the year-month (YYYYMM) after the integer division.
# `group_by` is used instead of the deprecated `groupby` spelling, matching the
# manual solution earlier in this notebook.
df_ts_amount = (
    df_receipt
    .group_by(pl.col('sales_ymd') // 100)
    .agg(pl.col('amount').sum())
    .sort('sales_ymd')
)

# Three splits, each with a 6-row test window and at most 12 training rows, no gap.
tscv = TimeSeriesSplit(gap=0, max_train_size=12, n_splits=3, test_size=6)

# Attach the row number once, outside the loop, so the positional indices returned
# by the splitter can be matched with a filter.
# NOTE(review): newer polars releases prefer `with_row_index`; `with_row_count` is
# kept here for compatibility with the polars version this notebook was run on —
# confirm the target version before switching.
df_ts_indexed = df_ts_amount.with_row_count('index')

# One (train, test) pair of DataFrames per split, in chronological order.
series_list = []
for train_index, test_index in tscv.split(df_ts_amount):
    series_list.append(
        (df_ts_indexed.filter(pl.col('index').is_in(pl.lit(train_index))),
         df_ts_indexed.filter(pl.col('index').is_in(pl.lit(test_index))))
    )

# NOTE: these names overwrite the df_train_* frames built by the manual solution;
# here train and test rows live in separate frames.
df_train_1, df_test_1 = series_list[0]
df_train_2, df_test_2 = series_list[1]
df_train_3, df_test_3 = series_list[2]

  df_ts_amount = df_receipt.groupby(pl.col('sales_ymd') // 100).agg(
