### **特徴量 ver1**
店舗の情報, 商品の情報, イベント情報, 日時情報, 価格情報

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
# Chunk size applied to matplotlib's 'agg.path.chunksize' setting.
# NOTE(review): presumably raised so very long line plots render with the Agg backend — confirm.
AGG_PATH_CHUNKSIZE = 100000
mpl.rcParams['agg.path.chunksize'] = AGG_PATH_CHUNKSIZE

#### **ベースのデータ**
ここに情報をくっつけていく

In [2]:
# Rebuild the long-format sales table from its split pickle files.
# NOTE(review): pd.read_pickle can execute arbitrary code on load — only read files you created yourself.
split_pattern = os.path.join('data',"sales_train_validation_split","sales_train_validation_split_*.pickle")

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting makes the
# concatenated row order, and everything derived from it, reproducible between runs.
split_paths = sorted(glob.glob(split_pattern))
if not split_paths:
    raise FileNotFoundError("no split files found for pattern: " + split_pattern)

data = pd.concat(map(pd.read_pickle, split_paths))
data["item_cnt"] = data["item_cnt"].astype("float64")
# Pin the expected 8-column schema (assignment fails if the column count differs).
data.columns = ["id","item_id","dept_id","cat_id","store_id","state_id","d","item_cnt"]
data.head(5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0.0
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0.0
3,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0.0
4,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0.0


#### **カレンダー情報を結合**

In [3]:
# Read the calendar table and attach its columns to every sales row via the day key "d".
calendar_path = os.path.join("rawdata","calendar.csv")
calendar = pd.read_csv(calendar_path)
print("**** calendar ****")
display(calendar.head(5))

# Left join: every row of `data` is kept, calendar columns are looked up by "d".
data = data.merge(calendar, how="left", on=["d"])
print("**** data(joined calendar-info) ****")
display(data.head(5))

**** calendar ****


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


**** data(joined calendar-info) ****


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt,date,wm_yr_wk,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0.0,2011-01-30,11101,...,2,1,2011,,,,,0,0,0
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0.0,2011-01-31,11101,...,3,1,2011,,,,,0,0,0
3,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0.0,2011-02-01,11101,...,4,2,2011,,,,,1,1,0
4,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0.0,2011-02-02,11101,...,5,2,2011,,,,,1,0,1


#### **価格の情報を結合**
- 各商品, 各店舗ごとに, 週ごとの価格が入っている. 
- 週番号11101～11324の価格が欠けているので, 週単位では結合せず, 商品, 店舗ごとに価格の最大値, 最小値, 平均, 中央値を計算して結合する

In [4]:
# Collapse the weekly price table to one row per (item, store) with summary statistics,
# then attach those statistics to every sales row.
weekly_prices = pd.read_csv(os.path.join("rawdata","sell_prices.csv"))
sell_prices = (
    weekly_prices
    .groupby(by=["item_id","store_id"])["sell_price"]
    .agg(["median","mean","max","min"])
    .reset_index()
)
# Same order as the aggregations above: keys first, then median / mean / max / min.
sell_prices.columns = ["item_id","store_id","price-median","price-mean","price-max","price-min"]
print("**** sell_prices ****")
display(sell_prices)
print()

data = data.merge(sell_prices, how="left", on=["item_id","store_id"])
print("**** data(joined price-info) ****")
display(data.head(5))

**** sell_prices ****


Unnamed: 0,item_id,store_id,price-median,price-mean,price-max,price-min
0,FOODS_1_001,CA_1,2.24,2.169362,2.24,2.00
1,FOODS_1_001,CA_2,2.24,2.169362,2.24,2.00
2,FOODS_1_001,CA_3,2.24,2.158262,2.24,1.75
3,FOODS_1_001,CA_4,2.24,2.169362,2.24,2.00
4,FOODS_1_001,TX_1,2.24,2.157624,2.24,0.99
...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516,TX_2,5.94,5.922482,5.94,3.00
30486,HOUSEHOLD_2_516,TX_3,5.94,5.922447,5.94,0.99
30487,HOUSEHOLD_2_516,WI_1,5.94,5.940000,5.94,5.94
30488,HOUSEHOLD_2_516,WI_2,5.94,5.940000,5.94,5.94



**** data(joined price-info) ****


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,item_cnt,date,wm_yr_wk,...,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,price-median,price-mean,price-max,price-min
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,...,,,,0,0,0,8.26,8.285714,9.58,8.26
1,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0.0,2011-01-30,11101,...,,,,0,0,0,8.26,8.285714,9.58,8.26
2,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_3,0.0,2011-01-31,11101,...,,,,0,0,0,8.26,8.285714,9.58,8.26
3,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0.0,2011-02-01,11101,...,,,,1,1,0,8.26,8.285714,9.58,8.26
4,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0.0,2011-02-02,11101,...,,,,1,0,1,8.26,8.285714,9.58,8.26


#### **各カラム, NaNを埋める**

In [5]:
# Percentage of missing values in each feature column
print("**** ratio of NaN-values ****")
nan_counts = data.isna().sum()
display(nan_counts / len(data) * 100)

**** ratio of NaN-values ****


id               0.000000
item_id          0.000000
dept_id          0.000000
cat_id           0.000000
store_id         0.000000
state_id         0.000000
d                0.000000
item_cnt         2.844083
date             0.000000
wm_yr_wk         0.000000
weekday          0.000000
wday             0.000000
month            0.000000
year             0.000000
event_name_1    91.772473
event_type_1    91.772473
event_name_2    99.746064
event_type_2    99.746064
snap_CA          0.000000
snap_TX          0.000000
snap_WI          0.000000
price-median     0.000000
price-mean       0.000000
price-max        0.000000
price-min        0.000000
dtype: float64

In [6]:
# Parse "date" once and rebuild year / month / wday from the parsed timestamps.
# NOTE(review): dt.weekday numbers days Monday=0..Sunday=6, so this replaces the
# calendar.csv numbering of "wday" (Saturday=1 in the preview above) — confirm this is intended.
parsed_date = pd.to_datetime(data["date"])
data["date"] = parsed_date
data["year"] = parsed_date.dt.year
data["month"] = parsed_date.dt.month
data["wday"] = parsed_date.dt.weekday

In [7]:
# Event columns: a missing value is treated as "no event on that day",
# so each column gets an explicit placeholder label instead of NaN.
event_placeholders = {
    "event_name_1": "No-events",
    "event_type_1": "No-type",
    "event_name_2": "No-events",
    "event_type_2": "No-type",
}
for event_col, placeholder in event_placeholders.items():
    data[event_col] = data[event_col].fillna(placeholder)

#### **ラベルエンコーディング**

In [8]:
# Column dtypes before label encoding (object columns are the ones to encode)
dtypes_before = data.dtypes
print(dtypes_before)

id                      object
item_id                 object
dept_id                 object
cat_id                  object
store_id                object
state_id                object
d                       object
item_cnt               float64
date            datetime64[ns]
wm_yr_wk                 int64
weekday                 object
wday                     int64
month                    int64
year                     int64
event_name_1            object
event_type_1            object
event_name_2            object
event_type_2            object
snap_CA                  int64
snap_TX                  int64
snap_WI                  int64
price-median           float64
price-mean             float64
price-max              float64
price-min              float64
dtype: object


In [9]:
from sklearn.preprocessing import LabelEncoder

# Fitted encoders, keyed by column name, kept so integer codes can be mapped back
# to the original labels later (le.inverse_transform / le.classes_).
label_encoders = {}

# Encode every object-typed column except the row identifier "id".
encode_col = data.columns[(data.dtypes == "object")&(data.columns != "id")]
for col in encode_col:
    values = data[col]
    if col == "d":
        # LabelEncoder sorts its classes, and as strings "d_10" sorts before "d_2",
        # which would scramble the time order of the codes. Encode the numeric day
        # suffix instead so codes 0..n-1 are chronological. The stored encoder for
        # "d" therefore maps day numbers (not "d_<n>" strings) to codes.
        day_number = {label: int(label[2:]) for label in values.unique()}
        values = values.map(day_number)

    le = LabelEncoder()
    data[col] = le.fit_transform(values).astype("int32")
    label_encoders[col] = le
    print(col+" encoded!")

item_id encoded!
dept_id encoded!
cat_id encoded!
store_id encoded!
state_id encoded!
d encoded!
weekday encoded!
event_name_1 encoded!
event_type_1 encoded!
event_name_2 encoded!
event_type_2 encoded!


In [10]:
# Column dtypes after label encoding (encoded columns should now be int32)
dtypes_after = data.dtypes
print(dtypes_after)

id                      object
item_id                  int32
dept_id                  int32
cat_id                   int32
store_id                 int32
state_id                 int32
d                        int32
item_cnt               float64
date            datetime64[ns]
wm_yr_wk                 int64
weekday                  int32
wday                     int64
month                    int64
year                     int64
event_name_1             int32
event_type_1             int32
event_name_2             int32
event_type_2             int32
snap_CA                  int64
snap_TX                  int64
snap_WI                  int64
price-median           float64
price-mean             float64
price-max              float64
price-min              float64
dtype: object


#### **大きすぎるので分割して保存**

In [11]:
# Write the feature table to disk as a sequence of fixed-size pickle files.
# (`out_dir` replaces the former name `dir`, which shadowed the builtin.)
out_dir = os.path.join("data","features_v1")
os.makedirs(out_dir, exist_ok=True)

# Rows per output file.
# NOTE(review): presumably 300 series x 1969 days — confirm.
batchsize = 300*1969
N = len(data)

# Ceiling division: exactly as many files as needed, so no empty trailing file is
# written when N is an exact multiple of batchsize.
n_splits = -(-N // batchsize)
for ns in range(n_splits):
    start = ns*batchsize
    stop = min(start+batchsize, N)
    # Positional slice: independent of the index labels and avoids building a large label array.
    data.iloc[start:stop].to_pickle(os.path.join(out_dir,"features_v1_split_%03d.pickle"%ns))