In [1]:
from sklearn.preprocessing import MinMaxScaler
import math
import pandas as pd
import seaborn as sns
import numpy as np
import datetime

%matplotlib inline
sns.set(color_codes=True)

train = pd.read_csv('data/train.csv')
store = pd.read_csv('data/store.csv')
test = pd.read_csv('data/test.csv')

train.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [2]:
def week_of_year(dt):
    return dt.isocalendar()[1]

def week_of_month(dt):
    """ Returns the week of the month for the specified date.
    """
    first_day = dt.replace(day=1)
    dom = dt.day
    adjusted_dom = dom + first_day.weekday()
    return int(math.ceil(adjusted_dom/7.0))

def day_of_year(dt):
    return dt.timetuple().tm_yday

def generate_all_features_in_one_column(x):
    year = x.split('-')[0]
    month = x.split('-')[1]
    date = x.split('-')[2]
    dt = datetime.date(int(year), int(month), int(date))
    woy = week_of_year(dt)
    wom = week_of_month(dt)
    doy = day_of_year(dt)
    return "{}-{}-{}-{}-{}-{}".format(year,month,date,woy,wom,doy)

def split_date(df):
    df['Date'] = df['Date'].apply(lambda x: generate_all_features_in_one_column(x))
    df['Year'] = df['Date'].apply(lambda x: x.split('-')[0])
    df['Month'] = df['Date'].apply(lambda x: x.split('-')[1])
    df['Day'] = df['Date'].apply(lambda x: x.split('-')[2])
    #df['WeekOfYear'] = df['Date'].apply(lambda x: x.split('-')[3])
    #df['WeekOfMonth'] = df['Date'].apply(lambda x: x.split('-')[4])
    #df['DayOfYear'] = df['Date'].apply(lambda x: x.split('-')[5])
    df = df.drop(['Date'], axis=1)

    return df

def preprocess_data(df):
    # StateHoliday，处理为不放假 0，放假 1
    df["StateHoliday"] = df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
    df = split_date(df)
    df_day_of_week  = pd.get_dummies(df['DayOfWeek'], prefix='Day')
    df = df.join(df_day_of_week)
    df = df.drop(['DayOfWeek'], axis=1)
    return df


# 训练集-预处理
train = preprocess_data(train)
# 查看训练集合并后的情况
train.head()

Unnamed: 0,Store,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7
0,1,5263,555,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
1,2,6064,625,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
2,3,8314,821,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
3,4,13995,1498,1,1,0,1,2015,7,31,0,0,0,0,1,0,0
4,5,4822,559,1,1,0,1,2015,7,31,0,0,0,0,1,0,0


In [3]:
# 通过drop duplicates发现Open有NaN的值
print(test['Open'].drop_duplicates())
print("===================================================================")
# 打印出Open为空值的那几行数据
index = test['Open'].index[test['Open'].apply(np.isnan)]
print(test.loc[index])
# 分析后发现，那几天都为工作日并且没有放假，所以全部设置为1
test['Open'][index] = 1
print("===================================================================")
# 打印出修改后的数据
print(test.loc[index])

0      1.0
479    NaN
543    0.0
Name: Open, dtype: float64
          Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  \
479      480    622          4  2015-09-17   NaN      1            0   
1335    1336    622          3  2015-09-16   NaN      1            0   
2191    2192    622          2  2015-09-15   NaN      1            0   
3047    3048    622          1  2015-09-14   NaN      1            0   
4759    4760    622          6  2015-09-12   NaN      0            0   
5615    5616    622          5  2015-09-11   NaN      0            0   
6471    6472    622          4  2015-09-10   NaN      0            0   
7327    7328    622          3  2015-09-09   NaN      0            0   
8183    8184    622          2  2015-09-08   NaN      0            0   
9039    9040    622          1  2015-09-07   NaN      0            0   
10751  10752    622          6  2015-09-05   NaN      0            0   

       SchoolHoliday  
479                0  
1335               0  
2191  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
# 训练集-预处理
test = preprocess_data(test)
# 查看训练集合并后的情况
test.head()

Unnamed: 0,Id,Store,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,Day_1,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7
0,1,1,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
1,2,3,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
2,3,7,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
3,4,8,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0
4,5,9,1.0,1,0,0,2015,9,17,0,0,0,1,0,0,0


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

scores = []

for i in range(1, train['Store'].max()+1):
    #选取一家店来做训练
    train_store = train[train['Store'] == i]
    # 提取标签
    train_store_labels = train_store['Sales']
    # 从训练集中移除标签
    train_store = train_store.drop(['Store','Sales','Customers'], axis=1)
    # 将数据切分成训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(train_store, train_store_labels, test_size = 0.2, random_state = 0)
    # 初始化回归模型
    regr = RandomForestRegressor(max_depth=7, random_state=0)
    regr.fit(X_train, y_train)
    pred = regr.predict(X_test)
    score = r2_score(y_test, pred)
    scores.append(score)
    #print("Store {} R2: {:.4f}".format(i, score))
    # 保存模型
    joblib.dump(regr, 'saved_models\store_{}.pkl'.format(i)) 

Store 1 R2: 0.9263
Store 2 R2: 0.9283
Store 3 R2: 0.9200
Store 4 R2: 0.9246
Store 5 R2: 0.9423
Store 6 R2: 0.9443
Store 7 R2: 0.9083
Store 8 R2: 0.9466
Store 9 R2: 0.9361
Store 10 R2: 0.9425
Store 11 R2: 0.9180
Store 12 R2: 0.9174
Store 13 R2: 0.8698
Store 14 R2: 0.9321
Store 15 R2: 0.9157
Store 16 R2: 0.9506
Store 17 R2: 0.9342
Store 18 R2: 0.9301
Store 19 R2: 0.9379
Store 20 R2: 0.9200
Store 21 R2: 0.9293
Store 22 R2: 0.8756
Store 23 R2: 0.9275
Store 24 R2: 0.9251
Store 25 R2: 0.8667
Store 26 R2: 0.9182
Store 27 R2: 0.8994
Store 28 R2: 0.8082
Store 29 R2: 0.9245
Store 30 R2: 0.9144
Store 31 R2: 0.9278
Store 32 R2: 0.9196
Store 33 R2: 0.8984
Store 34 R2: 0.9445
Store 35 R2: 0.8947
Store 36 R2: 0.9538
Store 37 R2: 0.9041
Store 38 R2: 0.9168
Store 39 R2: 0.8991
Store 40 R2: 0.9111
Store 41 R2: 0.8509
Store 42 R2: 0.9233
Store 43 R2: 0.9073
Store 44 R2: 0.9304
Store 45 R2: 0.9151
Store 46 R2: 0.9043
Store 47 R2: 0.9115
Store 48 R2: 0.9249
Store 49 R2: 0.9281
Store 50 R2: 0.9366
Store 51 

Store 397 R2: 0.9108
Store 398 R2: 0.9039
Store 399 R2: 0.8836
Store 400 R2: 0.9288
Store 401 R2: 0.9201
Store 402 R2: 0.9006
Store 403 R2: 0.9482
Store 404 R2: 0.9063
Store 405 R2: 0.9452
Store 406 R2: 0.9300
Store 407 R2: 0.8153
Store 408 R2: 0.9382
Store 409 R2: 0.9264
Store 410 R2: 0.9245
Store 411 R2: 0.9301
Store 412 R2: 0.8858
Store 413 R2: 0.9082
Store 414 R2: 0.9125
Store 415 R2: 0.9000
Store 416 R2: 0.9414
Store 417 R2: 0.9285
Store 418 R2: 0.9353
Store 419 R2: 0.9230
Store 420 R2: 0.8451
Store 421 R2: 0.8572
Store 422 R2: 0.9196
Store 423 R2: 0.6222
Store 424 R2: 0.8978
Store 425 R2: 0.9015
Store 426 R2: 0.9337
Store 427 R2: 0.9072
Store 428 R2: 0.8704
Store 429 R2: 0.8493
Store 430 R2: 0.8357
Store 431 R2: 0.9204
Store 432 R2: 0.9632
Store 433 R2: 0.9237
Store 434 R2: 0.8795
Store 435 R2: 0.9218
Store 436 R2: 0.9339
Store 437 R2: 0.8529
Store 438 R2: 0.8983
Store 439 R2: 0.9320
Store 440 R2: 0.9306
Store 441 R2: 0.8999
Store 442 R2: 0.9181
Store 443 R2: 0.9281
Store 444 R2:

Store 792 R2: 0.9313
Store 793 R2: 0.8958
Store 794 R2: 0.9217
Store 795 R2: 0.9071
Store 796 R2: 0.9403
Store 797 R2: 0.9060
Store 798 R2: 0.9411
Store 799 R2: 0.9276
Store 800 R2: 0.8950
Store 801 R2: 0.8898
Store 802 R2: 0.9427
Store 803 R2: 0.8002
Store 804 R2: 0.8677
Store 805 R2: 0.8838
Store 806 R2: 0.8900
Store 807 R2: 0.9454
Store 808 R2: 0.9506
Store 809 R2: 0.9155
Store 810 R2: 0.9304
Store 811 R2: 0.9216
Store 812 R2: 0.9629
Store 813 R2: 0.9458
Store 814 R2: 0.9363
Store 815 R2: 0.8966
Store 816 R2: 0.8660
Store 817 R2: 0.9616
Store 818 R2: 0.9432
Store 819 R2: 0.9403
Store 820 R2: 0.9116
Store 821 R2: 0.9196
Store 822 R2: 0.8927
Store 823 R2: 0.9297
Store 824 R2: 0.9191
Store 825 R2: 0.9122
Store 826 R2: 0.9382
Store 827 R2: 0.8929
Store 828 R2: 0.9310
Store 829 R2: 0.9216
Store 830 R2: 0.9221
Store 831 R2: 0.9019
Store 832 R2: 0.9453
Store 833 R2: 0.9155
Store 834 R2: 0.9501
Store 835 R2: 0.8486
Store 836 R2: 0.9595
Store 837 R2: 0.9247
Store 838 R2: 0.9390
Store 839 R2:

In [18]:
df = pd.read_csv('data\sample_submission.csv')
#选取一家店来做预测
test_feature = test[test['Store'] == 1]
# save ids
ids = test_feature['Id']

print(ids)

df['Sales'][df['Id'] == ids[0]] = [123]

df.head()

0            1
856        857
1712      1713
2568      2569
3424      3425
4280      4281
5136      5137
5992      5993
6848      6849
7704      7705
8560      8561
9416      9417
10272    10273
11128    11129
11984    11985
12840    12841
13696    13697
14552    14553
15408    15409
16264    16265
17120    17121
17976    17977
18832    18833
19688    19689
20544    20545
21400    21401
22256    22257
23112    23113
23968    23969
24824    24825
25680    25681
26536    26537
27392    27393
28248    28249
29104    29105
29960    29961
30816    30817
31672    31673
32528    32529
33384    33385
34240    34241
35096    35097
35952    35953
36808    36809
37664    37665
38520    38521
39376    39377
40232    40233
Name: Id, dtype: int64


Unnamed: 0,Id,Sales
0,1,123
1,2,0
2,3,0
3,4,0
4,5,0


In [26]:
def loal_and_predit(i, x):
    model_path = 'saved_models\store_{}.pkl'.format(i)
    regr = joblib.load(model_path)
    pred = regr.predict(x)
    return pred

def update_dataframe(df, ids, pred):
    for i, sid in enumerate(ids):
        df['Sales'][df['Id'] == sid] = pred[i]       
    #return df  

def generate_submission_file():
    # load submission sample file
    df = pd.read_csv('data\sample_submission.csv')
    for i in range(1, test['Store'].max()+1):
        # choose one store at a time
        test_feature = test[test['Store'] == i]
        # save ids
        ids = test_feature['Id']
        # remove Id and Store for preidct
        test_feature = test_feature.drop(['Id','Store'], axis=1)
        # predict
        pred = loal_and_predit(i, test_feature)
        # write predictions into submission according to Ids
        update_dataframe(df, ids, pred)
    # generate new submission file
    df.to_csv('data\new_sub.csv')
        
generate_submission_file()

ValueError: Found array with 0 sample(s) (shape=(0, 14)) while a minimum of 1 is required.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

#先选取一家店来做训练
train_store = train[train['Store'] == 897]
# 提取标签
train_store_labels = train_store['Sales']
# 从训练集中移除标签
train_store = train_store.drop(['Store','Sales','Customers'], axis=1)
# 将数据切分成训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(train_store, train_store_labels, test_size = 0.2, random_state = 0)
# 初始化回归模型
regr = RandomForestRegressor()

scoring_fnc = make_scorer(r2_score)

param = {'max_depth':[5,6,7,8,9,10]}

grid = GridSearchCV(regr, param, scoring=scoring_fnc)

grid.fit(X_train, y_train)

estimator = grid.best_estimator_

pred = estimator.predict(X_test)

score = r2_score(y_test, pred)

print(score)

In [None]:
for idx, s in enumerate(scores):
    if (s < 0.8):
        print(idx + 1, s)