# 数据导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'

In [2]:
store = pd.read_csv('data/store.csv', index_col = 0)
train = pd.read_csv('data/train.csv', index_col = 0)
test = pd.read_csv('data/test.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


# 数据预处理

##  store 数据

### Bag-of-Word

对 Store 的 PromoInterval 使用 BOW 离散化：

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

doc_array = count_vector.fit_transform(store['PromoInterval'].fillna('NaN')).toarray()
count_vector.get_feature_names()

def normallize(item):
    return 'PromoInterval_'+item.capitalize()
columns = list(map(normallize,count_vector.get_feature_names()))

Bow_Matrix = pd.DataFrame(doc_array, columns=columns)
Bow_Matrix.head(5)

Unnamed: 0,PromoInterval_Apr,PromoInterval_Aug,PromoInterval_Dec,PromoInterval_Feb,PromoInterval_Jan,PromoInterval_Jul,PromoInterval_Jun,PromoInterval_Mar,PromoInterval_May,PromoInterval_Nan,PromoInterval_Nov,PromoInterval_Oct,PromoInterval_Sept
0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,1,1,0,0,0,0,0,1,0
2,1,0,0,0,1,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
store[Bow_Matrix.columns] = Bow_Matrix
store.drop('PromoInterval', axis=1, inplace=True)
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,Promo2,Promo2SinceWeek,Promo2SinceYear,CompetitionOpenTime,PromoInterval_Apr,PromoInterval_Aug,...,PromoInterval_Feb,PromoInterval_Jan,PromoInterval_Jul,PromoInterval_Jun,PromoInterval_Mar,PromoInterval_May,PromoInterval_Nan,PromoInterval_Nov,PromoInterval_Oct,PromoInterval_Sept
0,1,c,a,1270.0,0,-1.0,-1.0,88.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,a,a,570.0,1,13.0,2010.0,98.0,1,0,...,0,1,1,0,0,0,0,0,1,0
2,3,a,a,14130.0,1,14.0,2011.0,109.0,1,0,...,0,1,1,0,0,0,0,0,1,0
3,4,c,c,620.0,0,-1.0,-1.0,76.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,5,a,a,29910.0,0,-1.0,-1.0,9.0,0,0,...,0,0,0,0,0,0,1,0,0,0


## train 数据

### 数据联合及时间转换

将 train 的日期数据（date）转换为月份列和日期列

In [8]:
Date_to_Date = lambda x: x.day
Date_to_Month = lambda x: x.month
Date_to_Year = lambda x: x.year
Month_to_Season = dict(zip(np.arange(12)+1, np.repeat(np.arange(4)+1,3)))

Data = train.drop(['Sales','Customers'], axis=1)
Data = pd.merge(Data,store,how='left')
Data['Date2'] = pd.to_datetime(Data['Date']) 
Data['Day'] = Data['Date2'].apply(Date_to_Date)
Data['Month'] = Data['Date2'].apply(Date_to_Month)
Data['Year'] = Data['Date2'].apply(Date_to_Year)
Data['Season'] = Data['Month'].map(Month_to_Season)
Data.drop(['Date','Date2'], axis=1, inplace=True)
Data = Data[['Store', 'Month', 'Day', 'Year', 'DayOfWeek',
 'Season', 'Open','StateHoliday', 'SchoolHoliday','CompetitionDistance',
 'CompetitionOpenTime','StoreType', 'Assortment', 'Promo',
 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']+Bow_Matrix.columns.to_list()]
Data.head()

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,StateHoliday,SchoolHoliday,CompetitionDistance,...,PromoInterval_Feb,PromoInterval_Jan,PromoInterval_Jul,PromoInterval_Jun,PromoInterval_Mar,PromoInterval_May,PromoInterval_Nan,PromoInterval_Nov,PromoInterval_Oct,PromoInterval_Sept
0,1,7,31,2015,5,3,1,0,1,1270.0,...,0,0,0,0,0,0,1,0,0,0
1,2,7,31,2015,5,3,1,0,1,570.0,...,0,1,1,0,0,0,0,0,1,0
2,3,7,31,2015,5,3,1,0,1,14130.0,...,0,1,1,0,0,0,0,0,1,0
3,4,7,31,2015,5,3,1,0,1,620.0,...,0,0,0,0,0,0,1,0,0,0
4,5,7,31,2015,5,3,1,0,1,29910.0,...,0,0,0,0,0,0,1,0,0,0


### One-Hot

对 ['StoreType','Assortment'] 两个分类变量进行独热编码：

In [9]:
index = Data[Data['StateHoliday'] == 0].index
Data.loc[index,['StateHoliday']] = '0'

One_Hot_Matrix = pd.get_dummies(Data[['StoreType','Assortment','StateHoliday']]) 
Data[One_Hot_Matrix.columns] = One_Hot_Matrix
Data.drop(['StoreType','Assortment','StateHoliday'], axis=1, inplace=True)
Data.head()

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,SchoolHoliday,CompetitionDistance,CompetitionOpenTime,...,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,7,31,2015,5,3,1,1,1270.0,88.0,...,0,1,0,1,0,0,1,0,0,0
1,2,7,31,2015,5,3,1,1,570.0,98.0,...,0,0,0,1,0,0,1,0,0,0
2,3,7,31,2015,5,3,1,1,14130.0,109.0,...,0,0,0,1,0,0,1,0,0,0
3,4,7,31,2015,5,3,1,1,620.0,76.0,...,0,1,0,0,0,1,1,0,0,0
4,5,7,31,2015,5,3,1,1,29910.0,9.0,...,0,0,0,1,0,0,1,0,0,0


***

处理完成后的数据：

In [10]:
Data.columns

Index(['Store', 'Month', 'Day', 'Year', 'DayOfWeek', 'Season', 'Open',
       'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpenTime', 'Promo',
       'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval_Apr',
       'PromoInterval_Aug', 'PromoInterval_Dec', 'PromoInterval_Feb',
       'PromoInterval_Jan', 'PromoInterval_Jul', 'PromoInterval_Jun',
       'PromoInterval_Mar', 'PromoInterval_May', 'PromoInterval_Nan',
       'PromoInterval_Nov', 'PromoInterval_Oct', 'PromoInterval_Sept',
       'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
       'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0',
       'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c'],
      dtype='object')

## 对数据进行转换

对测试数据可以采取同样的处理过程：

In [24]:
def BOW(store=store):

    from sklearn.feature_extraction.text import CountVectorizer
    count_vector = CountVectorizer()

    doc_array = count_vector.fit_transform(store['PromoInterval'].fillna('NaN')).toarray()
    count_vector.get_feature_names()

    def normallize(item):
        return 'PromoInterval_'+item.capitalize()
    columns = list(map(normallize,count_vector.get_feature_names()))

    Bow_Matrix = pd.DataFrame(doc_array, columns=columns)

    store[Bow_Matrix.columns] = Bow_Matrix
    store.drop('PromoInterval', axis=1, inplace=True)
    return store, Bow_Matrix

In [19]:
def data_preprocess(Data, extend_data=store, BOW_key = Bow_Matrix.columns):
    
    Date_to_Date = lambda x: x.day
    Date_to_Month = lambda x: x.month
    Date_to_Year = lambda x: x.year
    Month_to_Season = dict(zip(np.arange(12)+1, np.repeat(np.arange(4)+1,3)))

    # merge data
    Data = pd.merge(Data,extend_data,how='left')
    
    # transform time data('Data') to ['Day','Month','Year','Season']
    Data['Date2'] = pd.to_datetime(Data['Date']) 
    Data['Day'] = Data['Date2'].apply(Date_to_Date)
    Data['Month'] = Data['Date2'].apply(Date_to_Month)
    Data['Year'] = Data['Date2'].apply(Date_to_Year)
    Data['Season'] = Data['Month'].map(Month_to_Season)
    Data.drop(['Date','Date2'], axis=1, inplace=True)
    Data = Data[['Store', 'Month', 'Day', 'Year', 'DayOfWeek',
     'Season', 'Open','StateHoliday', 'SchoolHoliday','CompetitionDistance',
     'CompetitionOpenTime','StoreType', 'Assortment', 'Promo',
     'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear']+BOW_key.to_list()]
    
    # One-Hot on ['StateHoliday', 'StateHoliday']
    index = Data[Data['StateHoliday'] == 0].index
    Data.loc[index,['StateHoliday']] = '0'

    One_Hot_Matrix = pd.get_dummies(Data[['StoreType','Assortment','StateHoliday']]) 
    Data[One_Hot_Matrix.columns] = One_Hot_Matrix
    Data.drop(['StoreType','Assortment','StateHoliday'], axis=1, inplace=True)
    
    return Data

In [26]:
Data = data_preprocess(train.drop(['Sales','Customers'], axis=1))
display(Data.head())


Data_test = data_preprocess(test.drop('Id', axis=1))
display(Data_test.head())

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,SchoolHoliday,CompetitionDistance,CompetitionOpenTime,...,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,7,31,2015,5,3,1,1,1270.0,88.0,...,0,1,0,1,0,0,1,0,0,0
1,2,7,31,2015,5,3,1,1,570.0,98.0,...,0,0,0,1,0,0,1,0,0,0
2,3,7,31,2015,5,3,1,1,14130.0,109.0,...,0,0,0,1,0,0,1,0,0,0
3,4,7,31,2015,5,3,1,1,620.0,76.0,...,0,1,0,0,0,1,1,0,0,0
4,5,7,31,2015,5,3,1,1,29910.0,9.0,...,0,0,0,1,0,0,1,0,0,0


Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,SchoolHoliday,CompetitionDistance,CompetitionOpenTime,...,PromoInterval_Sept,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a
0,1,9,17,2015,4,3,1,0,1270.0,88.0,...,0,0,0,1,0,1,0,0,1,0
1,3,9,17,2015,4,3,1,0,14130.0,109.0,...,0,1,0,0,0,1,0,0,1,0
2,7,9,17,2015,4,3,1,0,24000.0,33.0,...,0,1,0,0,0,0,0,1,1,0
3,8,9,17,2015,4,3,1,0,7520.0,15.0,...,0,1,0,0,0,1,0,0,1,0
4,9,9,17,2015,4,3,1,0,2030.0,185.0,...,0,1,0,0,0,0,0,1,1,0
