# 模型算法

## 数据导入

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format='retina'

In [3]:
store = pd.read_csv('data/store.csv', index_col = 0)
train = pd.read_csv('data/train.csv', index_col = 0)
test = pd.read_csv('data/test.csv', index_col = 0)

In [4]:
X = train.drop(['Sales','Customers'], axis=1)
y = train['Sales']
X.head(5)

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,1,1,0,1
1,2,5,2015-07-31,1,1,0,1
2,3,5,2015-07-31,1,1,0,1
3,4,5,2015-07-31,1,1,0,1
4,5,5,2015-07-31,1,1,0,1


In [5]:
y.head(5)

0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64

### 对 Store 使用 BOG

### 数据联合及转换

将日期数据（date）转换为月份列和日期列

In [93]:
Date_to_Date = lambda x: x.day
Date_to_Month = lambda x: x.month
Date_to_Year = lambda x: x.year
Month_to_Season = dict(zip(np.arange(12)+1, np.repeat(np.arange(4)+1,3)))

Data = X
Data = pd.merge(Data,store,how='left')
Data['Date2'] = pd.to_datetime(Data['Date']) 
Data['Day'] = Data['Date2'].apply(Date_to_Date)
Data['Month'] = Data['Date2'].apply(Date_to_Month)
Data['Year'] = Data['Date2'].apply(Date_to_Year)
Data['Season'] = Data['Month'].map(Month_to_Season)
Data.drop(['Date','Date2'], axis=1, inplace=True)
Data = Data[['Store', 'Month', 'Day', 'Year', 'DayOfWeek',
 'Season', 'Open','StateHoliday', 'SchoolHoliday','CompetitionDistance',
 'CompetitionOpenTime','StoreType', 'Assortment', 'Promo',
 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear','PromoInterval']]
Data.head()

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,StateHoliday,SchoolHoliday,CompetitionDistance,CompetitionOpenTime,StoreType,Assortment,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,7,31,2015,5,3,1,0,1,1270.0,88.0,c,a,1,0,-1.0,-1.0,
1,2,7,31,2015,5,3,1,0,1,570.0,98.0,a,a,1,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,7,31,2015,5,3,1,0,1,14130.0,109.0,a,a,1,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,7,31,2015,5,3,1,0,1,620.0,76.0,c,c,1,0,-1.0,-1.0,
4,5,7,31,2015,5,3,1,0,1,29910.0,9.0,a,a,1,0,-1.0,-1.0,


In [94]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

doc_array = count_vector.fit_transform(store['PromoInterval'].fillna('NaN')).toarray()
count_vector.get_feature_names()

def normallize(item):
    return 'PromoInterval_'+item.capitalize()
columns = list(map(normallize,count_vector.get_feature_names()))

Bow_Matrix = pd.DataFrame(doc_array, columns=columns)
Bow_Matrix.head(5)

Unnamed: 0,PromoInterval_Apr,PromoInterval_Aug,PromoInterval_Dec,PromoInterval_Feb,PromoInterval_Jan,PromoInterval_Jul,PromoInterval_Jun,PromoInterval_Mar,PromoInterval_May,PromoInterval_Nan,PromoInterval_Nov,PromoInterval_Oct,PromoInterval_Sept
0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,1,1,0,0,0,0,0,1,0
2,1,0,0,0,1,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0


### Bog-of-Word

对 PromoInterval 使用词袋处理：

In [95]:
Data[Bow_Matrix.columns] = Bow_Matrix
Data.drop('PromoInterval', axis=1, inplace=True)
Data.head()

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,StateHoliday,SchoolHoliday,CompetitionDistance,...,PromoInterval_Feb,PromoInterval_Jan,PromoInterval_Jul,PromoInterval_Jun,PromoInterval_Mar,PromoInterval_May,PromoInterval_Nan,PromoInterval_Nov,PromoInterval_Oct,PromoInterval_Sept
0,1,7,31,2015,5,3,1,0,1,1270.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,7,31,2015,5,3,1,0,1,570.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,7,31,2015,5,3,1,0,1,14130.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,7,31,2015,5,3,1,0,1,620.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5,7,31,2015,5,3,1,0,1,29910.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### One-Hot

对 ['StoreType','Assortment'] 两个分类变量进行独热编码：

In [96]:
index = Data[Data['StateHoliday'] == 0].index
Data.loc[index,['StateHoliday']] = '0'

One_Hot_Matrix = pd.get_dummies(Data[['StoreType','Assortment','StateHoliday']]) 
Data[One_Hot_Matrix.columns] = One_Hot_Matrix
Data.drop(['StoreType','Assortment','StateHoliday'], axis=1, inplace=True)
Data.head()

Unnamed: 0,Store,Month,Day,Year,DayOfWeek,Season,Open,SchoolHoliday,CompetitionDistance,CompetitionOpenTime,...,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,7,31,2015,5,3,1,1,1270.0,88.0,...,0,1,0,1,0,0,1,0,0,0
1,2,7,31,2015,5,3,1,1,570.0,98.0,...,0,0,0,1,0,0,1,0,0,0
2,3,7,31,2015,5,3,1,1,14130.0,109.0,...,0,0,0,1,0,0,1,0,0,0
3,4,7,31,2015,5,3,1,1,620.0,76.0,...,0,1,0,0,0,1,1,0,0,0
4,5,7,31,2015,5,3,1,1,29910.0,9.0,...,0,0,0,1,0,0,1,0,0,0


In [97]:
Data.columns

Index(['Store', 'Month', 'Day', 'Year', 'DayOfWeek', 'Season', 'Open',
       'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpenTime', 'Promo',
       'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval_Apr',
       'PromoInterval_Aug', 'PromoInterval_Dec', 'PromoInterval_Feb',
       'PromoInterval_Jan', 'PromoInterval_Jul', 'PromoInterval_Jun',
       'PromoInterval_Mar', 'PromoInterval_May', 'PromoInterval_Nan',
       'PromoInterval_Nov', 'PromoInterval_Oct', 'PromoInterval_Sept',
       'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
       'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0',
       'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c'],
      dtype='object')

In [98]:
Data.iloc[0]

Store                     1.0
Month                     7.0
Day                      31.0
Year                   2015.0
DayOfWeek                 5.0
Season                    3.0
Open                      1.0
SchoolHoliday             1.0
CompetitionDistance    1270.0
CompetitionOpenTime      88.0
Promo                     1.0
Promo2                    0.0
Promo2SinceWeek          -1.0
Promo2SinceYear          -1.0
PromoInterval_Apr         0.0
PromoInterval_Aug         0.0
PromoInterval_Dec         0.0
PromoInterval_Feb         0.0
PromoInterval_Jan         0.0
PromoInterval_Jul         0.0
PromoInterval_Jun         0.0
PromoInterval_Mar         0.0
PromoInterval_May         0.0
PromoInterval_Nan         1.0
PromoInterval_Nov         0.0
PromoInterval_Oct         0.0
PromoInterval_Sept        0.0
StoreType_a               0.0
StoreType_b               0.0
StoreType_c               1.0
StoreType_d               0.0
Assortment_a              1.0
Assortment_b              0.0
Assortment

In [120]:
Data.iloc[1116]

Store                     2.0
Month                     7.0
Day                      30.0
Year                   2015.0
DayOfWeek                 4.0
Season                    3.0
Open                      1.0
SchoolHoliday             1.0
CompetitionDistance     570.0
CompetitionOpenTime      98.0
Promo                     1.0
Promo2                    1.0
Promo2SinceWeek          13.0
Promo2SinceYear        2010.0
PromoInterval_Apr         NaN
PromoInterval_Aug         NaN
PromoInterval_Dec         NaN
PromoInterval_Feb         NaN
PromoInterval_Jan         NaN
PromoInterval_Jul         NaN
PromoInterval_Jun         NaN
PromoInterval_Mar         NaN
PromoInterval_May         NaN
PromoInterval_Nan         NaN
PromoInterval_Nov         NaN
PromoInterval_Oct         NaN
PromoInterval_Sept        NaN
StoreType_a               1.0
StoreType_b               0.0
StoreType_c               0.0
StoreType_d               0.0
Assortment_a              1.0
Assortment_b              0.0
Assortment

In [114]:
for index in Data.columns:
    print(index,len(Data[Data[index].isnull()]))

Store 0
Month 0
Day 0
Year 0
DayOfWeek 0
Season 0
Open 0
SchoolHoliday 0
CompetitionDistance 0
CompetitionOpenTime 0
Promo 0
Promo2 0
Promo2SinceWeek 0
Promo2SinceYear 0
PromoInterval_Apr 1016094
PromoInterval_Aug 1016094
PromoInterval_Dec 1016094
PromoInterval_Feb 1016094
PromoInterval_Jan 1016094
PromoInterval_Jul 1016094
PromoInterval_Jun 1016094
PromoInterval_Mar 1016094
PromoInterval_May 1016094
PromoInterval_Nan 1016094
PromoInterval_Nov 1016094
PromoInterval_Oct 1016094
PromoInterval_Sept 1016094
StoreType_a 0
StoreType_b 0
StoreType_c 0
StoreType_d 0
Assortment_a 0
Assortment_b 0
Assortment_c 0
StateHoliday_0 0
StateHoliday_a 0
StateHoliday_b 0
StateHoliday_c 0
