In [2]:
from sklearn.preprocessing import MinMaxScaler
import math
import pandas as pd
import seaborn as sns
import numpy as np
import datetime

%matplotlib inline
sns.set(color_codes=True)

train = pd.read_csv('data/train.csv')
store = pd.read_csv('data/store.csv')
test = pd.read_csv('data/test.csv')

train.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [3]:
def week_of_year(dt):
    return dt.isocalendar()[1]

def week_of_month(dt):
    """ Returns the week of the month for the specified date.
    """
    first_day = dt.replace(day=1)
    dom = dt.day
    adjusted_dom = dom + first_day.weekday()
    return int(math.ceil(adjusted_dom/7.0))

def day_of_year(dt):
    return dt.timetuple().tm_yday

def generate_all_features_in_one_column(x):
    year = x.split('-')[0]
    month = x.split('-')[1]
    date = x.split('-')[2]
    dt = datetime.date(int(year), int(month), int(date))
    woy = week_of_year(dt)
    wom = week_of_month(dt)
    doy = day_of_year(dt)
    return "{}-{}-{}-{}-{}-{}".format(year,month,date,woy,wom,doy)

def split_date(df):
    df['Date'] = df['Date'].apply(lambda x: generate_all_features_in_one_column(x))
    df['Year'] = df['Date'].apply(lambda x: x.split('-')[0])
    df['Month'] = df['Date'].apply(lambda x: x.split('-')[1])
    df['Day'] = df['Date'].apply(lambda x: x.split('-')[2])
    #df['WeekOfYear'] = df['Date'].apply(lambda x: x.split('-')[3])
    #df['WeekOfMonth'] = df['Date'].apply(lambda x: x.split('-')[4])
    #df['DayOfYear'] = df['Date'].apply(lambda x: x.split('-')[5])
    df = df.drop(['Date'], axis=1)

    return df

def preprocess_data(df):
    # StateHoliday，处理为不放假 0，放假 1
    df["StateHoliday"] = df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
    df = split_date(df)
    df_day_of_week  = pd.get_dummies(df['DayOfWeek'], prefix='Day')
    df = df.join(df_day_of_week)
    df = df.drop(['DayOfWeek'], axis=1)
    return df


# 训练集-预处理
train = preprocess_data(train)
# 查看训练集合并后的情况
train.head()

Unnamed: 0,Store,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7
0,1,5263,555,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
1,2,6064,625,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
2,3,8314,821,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
3,4,13995,1498,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
4,5,4822,559,1,1,0,1,2015,7,31,0,0,0,0,1,0,0


In [4]:
# 通过drop duplicates发现Open有NaN的值
print(test['Open'].drop_duplicates())
print("===================================================================")
# 打印出Open为空值的那几行数据
index = test['Open'].index[test['Open'].apply(np.isnan)]
print(test.loc[index])
# 分析后发现，那几天都为工作日并且没有放假，所以全部设置为1
test['Open'][index] = 1
print("===================================================================")
# 打印出修改后的数据
print(test.loc[index])

0      1.0
479    NaN
543    0.0
Name: Open, dtype: float64
          Id  Store  Open  Promo  StateHoliday  SchoolHoliday  Year Month Day  \
479      480    622   NaN      1             0              0  2015    09  17   
1335    1336    622   NaN      1             0              0  2015    09  16   
2191    2192    622   NaN      1             0              0  2015    09  15   
3047    3048    622   NaN      1             0              0  2015    09  14   
4759    4760    622   NaN      0             0              0  2015    09  12   
5615    5616    622   NaN      0             0              0  2015    09  11   
6471    6472    622   NaN      0             0              0  2015    09  10   
7327    7328    622   NaN      0             0              0  2015    09  09   
8183    8184    622   NaN      0             0              0  2015    09  08   
9039    9040    622   NaN      0             0              0  2015    09  07   
10751  10752    622   NaN      0             0   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
# 训练集-预处理
test = preprocess_data(test)
# 查看训练集合并后的情况
test.head()

Unnamed: 0,Id,Store,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7
0,1,1,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
1,2,3,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
2,3,7,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
3,4,8,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
4,5,9,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0


In [5]:
#先选取一家店来做训练
train_store1 = train[train['Store'] == 1]
# 提取标签
train_store1_labels = train_store1['Sales']
# 从训练集中移除标签
train_store1 = train_store1.drop(['Store','Sales','Customers'], axis=1)
#拆分训练集和验证集,这里选择最后2个月的数据为验证集（2015年6月1号至2015年7月31号） TBC

In [6]:
# 导入 train_test_split
from sklearn.model_selection import train_test_split

# 将数据切分成训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(train_store1, train_store1_labels, test_size = 0.2, random_state = 0)

# 显示切分的结果
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 753 samples.
Testing set has 189 samples.


In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

params = {'n_estimators': 500,\
          'max_depth': 5,\
          'min_samples_split': 2,\
          'learning_rate': 0.1,\
          'loss': 'ls',\
          'verbose': 2}

# 初始化回归模型
est = GradientBoostingRegressor(**params)

est.fit(X_train, y_train)

mse = mean_squared_error(y_test, est.predict(X_test))

print("MSE: %.4f" % mse)

      Iter       Train Loss   Remaining Time 
         1     3414449.1231            0.50s
         2     2827654.9121            0.50s
         3     2352982.7802            0.50s
         4     1966298.6878            0.50s
         5     1650531.2302            0.59s
         6     1395018.5505            0.58s
         7     1185782.2733            0.56s
         8     1016245.4699            0.55s
         9      878790.8429            0.55s
        10      763986.4903            0.54s
        11      671222.2550            0.53s
        12      595731.1995            0.53s
        13      531711.2842            0.56s
        14      475759.4074            0.56s
        15      431535.3916            0.55s
        16      394574.0336            0.54s
        17      361033.1027            0.54s
        18      335052.9375            0.54s
        19      311542.0004            0.53s
        20      291666.7009            0.53s
        21      275068.6536            0.52s
        2

       249       14100.6003            0.41s
       250       14012.7628            0.41s
       251       13917.3380            0.41s
       252       13828.6162            0.41s
       253       13722.0215            0.41s
       254       13590.4203            0.41s
       255       13504.4544            0.41s
       256       13430.6724            0.41s
       257       13311.9431            0.41s
       258       13210.6241            0.40s
       259       13183.7344            0.40s
       260       13155.2745            0.40s
       261       13132.0774            0.40s
       262       13069.5578            0.40s
       263       12992.7602            0.40s
       264       12926.5125            0.40s
       265       12850.8143            0.40s
       266       12784.3128            0.40s
       267       12767.9903            0.40s
       268       12748.5860            0.39s
       269       12690.2281            0.39s
       270       12593.2500            0.39s
       271

       500        2369.8695            0.00s
MSE: 219265.6977
