## **データ整理**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from tqdm import tqdm

from myUtils import reduce_mem_usage

import gc

### **sales_train_val - Preprocess**
1. validation用の日 d_1914,...,d_1941 (28日分) を追加
2. evaluation用の日 d_1942,...,d_1969 (28日分) を追加
3. d_1,...,d_1969がカラム方向に並んでいるので, index方向に並べ替える

In [16]:
# Reshape the validation-phase sales table from wide (one column per day) to
# long (one row per id and day), tag every row with the data split it belongs
# to, and pickle the result.
FIRST_VALID_DAY = 1914                      # first placeholder day appended below
HORIZON = 28                                # number of days in each forecast window
FIRST_EVAL_DAY = FIRST_VALID_DAY + HORIZON  # 1942
ID_COLS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Load the wide table; index_col=0 moves the first CSV column onto the index so
# that the joins below align on it (reset_index() turns it back into a column).
sales_train_val = pd.read_csv(os.path.join("rawdata", "sales_train_validation.csv"), index_col=0)

# Append all-NaN placeholder columns for the validation and evaluation windows.
valid_data = pd.DataFrame(index=sales_train_val.index,
                          columns=[f"d_{d}" for d in range(FIRST_VALID_DAY, FIRST_EVAL_DAY)],
                          data=np.nan)
eval_data = pd.DataFrame(index=sales_train_val.index,
                         columns=[f"d_{d}" for d in range(FIRST_EVAL_DAY, FIRST_EVAL_DAY + HORIZON)],
                         data=np.nan)

sales_train_val = sales_train_val.join(valid_data)
sales_train_val = sales_train_val.join(eval_data)
sales_train_val = sales_train_val.reset_index()

# Release the placeholder frames before the memory-hungry melt.
del valid_data, eval_data
gc.collect()

# Turn the d_* columns into rows: one (id, d, item_cnt) record per item and day.
sales_train_val = pd.melt(sales_train_val,
                          id_vars=ID_COLS,
                          var_name='d', value_name='item_cnt')

sales_train_val = reduce_mem_usage(sales_train_val)

# Numeric day index taken from the "d_<n>" labels. Slicing off the two-character
# "d_" prefix avoids materialising the extra frame that split(expand=True) builds.
day = sales_train_val["d"].str[2:].astype("int64")

# Everything is "train" unless it falls inside one of the two placeholder windows.
sales_train_val["data_part"] = "train"
sales_train_val.loc[(FIRST_VALID_DAY <= day) & (day < FIRST_EVAL_DAY), "data_part"] = "validation"
sales_train_val.loc[(FIRST_EVAL_DAY <= day) & (day < FIRST_EVAL_DAY + HORIZON), "data_part"] = "evaluation"

display(sales_train_val)

# Persist the long-format table.
# NOTE(review): this derived file is written under rawdata/ while the other
# cells in this notebook write to mydata/ - confirm that is intended.
path = os.path.join("rawdata","sales_train_valid.pickle")
sales_train_val.to_pickle(path)

Mem. usage decreased to 3320.71 Mb (9.4% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt,data_part
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
...,...,...,...,...,...,...,...,...,...
60034805,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034806,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034807,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034808,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation


In [2]:
# Reshape the evaluation-phase sales table from wide (one column per day) to
# long (one row per id and day), tag every row with its data split, and pickle it.
FIRST_VALID_DAY = 1914                      # first day labelled "validation"
HORIZON = 28                                # number of days in each forecast window
FIRST_EVAL_DAY = FIRST_VALID_DAY + HORIZON  # 1942, first placeholder day appended below
ID_COLS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Load the wide table; index_col=0 moves the first CSV column onto the index so
# that the join below aligns on it (reset_index() turns it back into a column).
sales_train_val = pd.read_csv(os.path.join("rawdata", "sales_train_evaluation.csv"), index_col=0)

# Append all-NaN placeholder columns for the evaluation window (d_1942..d_1969).
eval_data = pd.DataFrame(index=sales_train_val.index,
                         columns=[f"d_{d}" for d in range(FIRST_EVAL_DAY, FIRST_EVAL_DAY + HORIZON)],
                         data=np.nan)

sales_train_val = sales_train_val.join(eval_data)
sales_train_val = sales_train_val.reset_index()

# Release the placeholder frame before the memory-hungry melt.
del eval_data
gc.collect()

# Turn the d_* columns into rows: one (id, d, item_cnt) record per item and day.
sales_train_val = pd.melt(sales_train_val,
                          id_vars=ID_COLS,
                          var_name='d', value_name='item_cnt')
sales_train_val = reduce_mem_usage(sales_train_val)

# Numeric day index taken from the "d_<n>" labels. Slicing off the two-character
# "d_" prefix avoids materialising the extra frame that split(expand=True) builds.
day = sales_train_val["d"].str[2:].astype("int64")

# Everything is "train" unless it falls inside the validation or evaluation window.
sales_train_val["data_part"] = "train"
sales_train_val.loc[(FIRST_VALID_DAY <= day) & (day < FIRST_EVAL_DAY), "data_part"] = "validation"
sales_train_val.loc[(FIRST_EVAL_DAY <= day) & (day < FIRST_EVAL_DAY + HORIZON), "data_part"] = "evaluation"

display(sales_train_val)

# Persist the long-format table.
path = os.path.join("mydata","sales_train_eval.pickle")
sales_train_val.to_pickle(path)

Mem. usage decreased to 3320.71 Mb (9.4% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt,data_part
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,train
...,...,...,...,...,...,...,...,...,...
60034805,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034806,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034807,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
60034808,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation


In [4]:
# Same reshaping as the full evaluation table, but only the most recent
# HISTORY_DAYS day columns are kept before melting, giving a much smaller frame.
FIRST_VALID_DAY = 1914                      # first day labelled "validation"
HORIZON = 28                                # number of days in each forecast window
FIRST_EVAL_DAY = FIRST_VALID_DAY + HORIZON  # 1942, first placeholder day appended below
HISTORY_DAYS = 56                           # observed days kept before FIRST_EVAL_DAY
ID_COLS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Load the wide table; index_col=0 moves the first CSV column onto the index so
# that the join below aligns on it (reset_index() turns it back into a column).
sales_train_val = pd.read_csv(os.path.join("rawdata", "sales_train_evaluation.csv"), index_col=0)

# Drop d_1 .. d_1885; every other column (descriptors and d_1886 onwards) stays.
# The set of dropped labels is built once instead of once per column.
dropped_days = {f"d_{d}" for d in range(1, FIRST_EVAL_DAY - HISTORY_DAYS)}
cols = [c for c in sales_train_val.columns if c not in dropped_days]
sales_train_val = sales_train_val[cols]

# Append all-NaN placeholder columns for the evaluation window (d_1942..d_1969).
eval_data = pd.DataFrame(index=sales_train_val.index,
                         columns=[f"d_{d}" for d in range(FIRST_EVAL_DAY, FIRST_EVAL_DAY + HORIZON)],
                         data=np.nan)

sales_train_val = sales_train_val.join(eval_data)
sales_train_val = sales_train_val.reset_index()

# Release the placeholder frame before the melt.
del eval_data
gc.collect()

# Turn the d_* columns into rows: one (id, d, item_cnt) record per item and day.
sales_train_val = pd.melt(sales_train_val,
                          id_vars=ID_COLS,
                          var_name='d', value_name='item_cnt')
sales_train_val = reduce_mem_usage(sales_train_val)

# Numeric day index taken from the "d_<n>" labels. Slicing off the two-character
# "d_" prefix avoids materialising the extra frame that split(expand=True) builds.
day = sales_train_val["d"].str[2:].astype("int64")

# Everything is "train" unless it falls inside the validation or evaluation window.
sales_train_val["data_part"] = "train"
sales_train_val.loc[(FIRST_VALID_DAY <= day) & (day < FIRST_EVAL_DAY), "data_part"] = "validation"
sales_train_val.loc[(FIRST_EVAL_DAY <= day) & (day < FIRST_EVAL_DAY + HORIZON), "data_part"] = "evaluation"

display(sales_train_val)

# Persist the reduced long-format table.
path = os.path.join("mydata","sales_train_eval_28.pickle")
sales_train_val.to_pickle(path)

Mem. usage decreased to 141.67 Mb (9.4% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt,data_part
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1886,1.0,train
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1886,1.0,train
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1886,0.0,train
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1886,0.0,train
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1886,1.0,train
...,...,...,...,...,...,...,...,...,...
2561155,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
2561156,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
2561157,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
2561158,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,,evaluation
