# Modeling : KNN

* ## Load the data and analysis libraries

> ### Load libraries

In [1]:
from  datetime import datetime, timedelta
# import gc
import numpy as np, pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import time	
from sklearn.externals import joblib



> ### We are using the local test from now on.

In [4]:
# Demo switch read by the data-loading cell: True loads "train_data.csv.gz",
# False loads "train_data.csv".
IS_LOCAL_DEMO = True
# IS_LOCAL_DEMO = False

In [5]:
# Days of history kept before each forecast day when the forecasting loop
# rebuilds the lag / rolling-mean features for that day.
max_lags = 57
# NOTE(review): presumably the index of the last day with observed sales;
# it is not used in the visible cells — confirm before relying on it.
last_day = 1913  
# First day of the forecast horizon; the forecasting loop predicts 28
# consecutive days starting from this date.
fday = datetime(2016,4, 25) 

> ### Load the processed data

In [9]:
# Choose the input file from the demo flag, then read it into `dt`.
if IS_LOCAL_DEMO:
    data_file = "train_data.csv.gz"
else:
    data_file = "train_data.csv"

dt = pd.read_csv(data_file)

# Report which file was read (printed only after a successful load).
print("loading:", data_file)

In [4]:
# Parse the "date" column into pandas datetimes so later cells can compare it
# with datetime objects and read calendar parts through the .dt accessor.
dt["date"] = pd.to_datetime(dt["date"])  


> ### Make a temporary deep copy of the data for building the sliding-window features

In [5]:
# Work on a copy so the feature columns added below do not end up in `dt`,
# which the forecasting loop reads again later.
df = dt.copy()

> ### To reduce overfitting, we did not use the previous day's sales for prediction; instead we created several rolling means as new features. The suffix 7_28 means lag = 7 and win = 28, i.e. the 28-day rolling mean of the sales shifted back by 7 days.

In [6]:
def _date_part(dates, attr):
    """Return the calendar component `attr` of a datetime Series.

    `Series.dt.weekofyear` does not exist on newer pandas releases, so the
    week number is read from `Series.dt.isocalendar().week` whenever that
    accessor is available; older pandas falls back to the plain attribute.
    """
    if attr == "weekofyear" and hasattr(dates.dt, "isocalendar"):
        return dates.dt.isocalendar().week
    return getattr(dates.dt, attr)


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to `dt` in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the columns "id", "sales" and a datetime "date" column.
        Rows are expected to be ordered by date within each "id", because the
        lags are computed with a positional shift.

    Columns written
    ---------------
    lag_7, lag_28
        "sales" shifted by 7 / 28 rows within each "id".
    rmean_{lag}_{win}
        Rolling mean over `win` rows (7 or 28) of the `lag_{lag}` column,
        computed within each "id".
    wday, week, month, quarter, year, mday
        Cast to int16 when the column already exists, otherwise derived from
        "date" and stored as int16.

    Returns
    -------
    None
        The frame is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):  # [(7, lag_7), (28, lag_28)]
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # Output column name -> datetime accessor attribute it is derived from.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            # Already present in the input: only normalise the dtype.
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = _date_part(dt["date"], date_feat_func).astype("int16")

In [7]:
# Adds the lag, rolling-mean and calendar columns to `df` in place.
create_fea(df)

In [8]:
# Preview the engineered frame; the first rows of each id have NaN lag and
# rolling-mean values because no earlier sales exist to shift from.
df.head(3)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1850,0.0,2016-02-21,11604,...,8.26,,,,,,,7,1,21
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1851,4.0,2016-02-22,11604,...,8.26,,,,,,,8,1,22
2,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1852,0.0,2016-02-23,11604,...,8.26,,,,,,,8,1,23


In [9]:
# Remove every row that has at least one missing value (this includes the
# rows whose lag / rolling-mean features are undefined), then show the
# remaining shape.
df.dropna(axis=0, how="any", inplace=True)
df.shape

(90000, 31)

In [11]:
# List the columns available after feature engineering.
df.columns

Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_7', 'lag_28',
       'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28', 'week',
       'quarter', 'mday'],
      dtype='object')

>### Split the dataset into the training features (X_train) and the target (y_train).

In [12]:
# Categorical code columns (defined for reference; not used in the cells shown here).
cat_feats = [
    "item_id", "dept_id", "store_id", "cat_id", "state_id",
    "event_type_1", "event_type_2",
]

# Columns excluded from the feature matrix, including the target "sales".
useless_cols = [
    "id", "date", "sales", "d", "wm_yr_wk", "weekday",
    "event_name_1", "event_name_2",
]

# Keep every remaining column, in its original order, as a model feature.
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]

X_train, y_train = df[train_cols], df["sales"]

In [13]:
# Preview the feature matrix passed to the model.
X_train.head()

Unnamed: 0,item_id,dept_id,store_id,cat_id,state_id,wday,month,year,event_type_1,event_type_2,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
550000,0,0,0,0,0,1,4,2016,0,0,...,8.38,1.0,1.0,1.571429,1.0,1.071429,1.035714,15,2,16
550001,0,0,0,0,0,2,4,2016,0,0,...,8.38,2.0,3.0,1.714286,1.428571,1.142857,1.142857,15,2,17
550002,0,0,0,0,0,3,4,2016,0,0,...,8.38,0.0,1.0,1.714286,1.571429,1.142857,1.035714,16,2,18
550003,0,0,0,0,0,4,4,2016,0,0,...,8.38,0.0,2.0,1.142857,1.714286,1.107143,1.107143,16,2,19
550004,0,0,0,0,0,5,4,2016,0,0,...,8.38,0.0,2.0,0.857143,1.857143,1.071429,1.142857,16,2,20


> ### Create a KNN regressor with scikit-learn and train it on all selected features.

In [14]:
# Build a 3-nearest-neighbours regressor and fit it on the training data.
# fit() returns the estimator itself, so the name is bound to the fitted model.
model_knn = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
model_knn

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

> ### Unsupervised learning is very useful to keep the model from overfitting. We blend the predictions obtained with several multiplier (alpha) values to produce the final forecast. So far the best Kaggle score was obtained with alphas = [1.035, 1.03, 1.025]. The result depends heavily on these hyperparameters, which need to be tuned more carefully later.

In [15]:
# Multipliers applied to the raw model predictions.
# Sets tried for Kaggle: [1.035, 1.03, 1.025], [1.015, 1.25, 1.03], [1.0, 1.15, 1.01]
alphas = [1]  # for local demo

# Blend weights: every alpha currently gets the same share.
weights = [1 / len(alphas) for _ in alphas]
print(weights)

[1.0]


In [16]:
sub = 0.
cols = [f"F{i}" for i in range(1, 29)]  # submission columns F1..F28

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):  # [(1.035, 0.333), (1.03, 0.333), (1.025, 0.333)]
    # Work on a private copy: the loop below overwrites "sales" for the
    # forecast days, and the loaded frame `dt` must stay untouched.
    te = dt.copy()
    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)  # the single day being predicted
        print(icount, day)
        # Keep only the last `max_lags` days up to `day`, which is the history
        # used to rebuild the lag / rolling-mean features for that day.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day, train_cols]  # feature rows of the predicted day
        # Write the scaled prediction back so later days can use it as a lag.
        te.loc[te.date == day, "sales"] = alpha * model_knn.predict(tst)

    # Reshape the forecast rows into the submission layout: one row per id,
    # one column per forecast day.
    # .copy() so that adding the "F" column does not write into a slice of `te`.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount() + 1]  # [F1,F2,F3,...F28]
    te_sub = te_sub.set_index(["id", "F"]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace=True)
    te_sub.sort_values("id", inplace=True)
    te_sub.reset_index(drop=True, inplace=True)

    # Accumulate the weighted blend over all alphas.
    if icount == 0:
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols] * weight
    print(icount, alpha, weight)



0 2016-04-25 00:00:00
0 2016-04-26 00:00:00
0 2016-04-27 00:00:00
0 2016-04-28 00:00:00
0 2016-04-29 00:00:00
0 2016-04-30 00:00:00
0 2016-05-01 00:00:00
0 2016-05-02 00:00:00
0 2016-05-03 00:00:00
0 2016-05-04 00:00:00
0 2016-05-05 00:00:00
0 2016-05-06 00:00:00
0 2016-05-07 00:00:00
0 2016-05-08 00:00:00
0 2016-05-09 00:00:00
0 2016-05-10 00:00:00
0 2016-05-11 00:00:00
0 2016-05-12 00:00:00
0 2016-05-13 00:00:00
0 2016-05-14 00:00:00
0 2016-05-15 00:00:00
0 2016-05-16 00:00:00
0 2016-05-17 00:00:00
0 2016-05-18 00:00:00
0 2016-05-19 00:00:00
0 2016-05-20 00:00:00
0 2016-05-21 00:00:00
0 2016-05-22 00:00:00
0 1 1.0


In [17]:
# Duplicate the forecasts under the matching "evaluation" ids and append them
# to the "validation" rows.
sub2 = sub.copy()
# regex=True is required: "$" anchors the match at the end of the id, and newer
# pandas versions treat the pattern as a literal string by default, which would
# leave every id unchanged.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)

In [18]:
# Show the shape and the column index of the submission (one per line),
# then preview the first rows.
print(sub.shape, sub.columns, sep="\n")
sub.head()

(20000, 29)
Index(['id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
       'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
       'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28'],
      dtype='object', name='F')


F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.333333,1.0,1.0,0.333333,1.0,0.333333,1.666667,1.333333,1.666667,...,1.666667,1.666667,2.333333,0.0,1.333333,2.0,0.666667,0.666667,0.666667,0.666667
1,FOODS_1_001_CA_2_validation,0.333333,0.333333,0.333333,1.666667,0.333333,0.333333,1.0,1.0,1.0,...,1.333333,1.333333,2.0,1.333333,1.333333,2.0,1.666667,0.0,1.333333,1.666667
2,FOODS_1_001_CA_3_validation,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333,0.333333,0.333333,0.333333,...,0.0,0.333333,2.0,0.333333,0.0,0.333333,0.0,1.333333,1.0,0.333333
3,FOODS_1_002_CA_1_validation,0.333333,0.333333,0.333333,0.0,0.333333,0.333333,1.666667,1.666667,1.666667,...,0.0,2.0,1.666667,0.0,0.0,0.0,0.333333,1.0,0.333333,0.333333
4,FOODS_1_002_CA_2_validation,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,1.666667,1.666667,1.666667,...,0.0,2.0,0.666667,0.0,0.0,0.333333,0.666667,1.0,1.666667,0.333333


* ## Dataset persistence
>### Save the dataframe to disk, then use the "4agg-12lelvel" script to generate the submission to upload to Kaggle.


In [19]:
# Persist the submission. The target file follows IS_LOCAL_DEMO, the same flag
# that selects the input file, so no line has to be commented in or out.
if IS_LOCAL_DEMO:
    sub.to_csv("submission_knn.csv.gz", compression='gzip', index=False)  # for local demo
else:
    sub.to_csv("submission_knn.csv", index=False)  # for kaggle