# Modeling : ridge regression

In [1]:
from  datetime import datetime, timedelta
# import gc
import numpy as np, pandas as pd
from sklearn.linear_model import Ridge
import time	
from sklearn.externals import joblib





In [None]:
# Run-mode switch read by later cells: when True the notebook loads
# "train_data.csv.gz", uses a single alpha multiplier and writes a gzipped
# submission; when False it loads "train_data.csv", blends three alphas and
# writes a plain CSV.
IS_LOCAL_DEMO = True
# IS_LOCAL_DEMO = False

In [None]:
# Days of history sliced before each forecast day when features are rebuilt;
# it has to cover the largest lag (28) plus the largest rolling window (28)
# used by create_fea.
max_lags = 57
# Not referenced in the visible cells; presumably the index of the last
# training day ("d" column) -- TODO confirm.
last_day = 1913  
# First calendar day of the 28-day forecast horizon.
fday = datetime(2016,4, 25) 

In [None]:
# Choose the input file according to the run mode, then read it into `dt`.
if IS_LOCAL_DEMO:
    data_file = "train_data.csv.gz"
else:
    data_file = "train_data.csv"
dt = pd.read_csv(data_file)

# Reported after the read has completed.
print("loading:", data_file)

In [None]:
# Parse the "date" column into datetimes so it can be compared with
# datetime/timedelta values and used through the .dt accessor later on.
dt["date"] = pd.to_datetime(dt["date"])  


In [None]:
# Training frame: an independent copy, so the feature columns and the dropna
# applied to `df` do not alter `dt`, which the forecasting cell reads later.
df = dt.copy()

In [12]:
def _date_part(dates, attr):
    """Return calendar component ``attr`` of the datetime Series ``dates``.

    ``Series.dt.weekofyear`` is deprecated and absent from recent pandas
    releases, so the ISO week is taken from ``dt.isocalendar()`` whenever that
    method exists; every other component is read straight off the ``.dt``
    accessor.
    """
    accessor = dates.dt
    if attr == "weekofyear" and hasattr(accessor, "isocalendar"):
        return accessor.isocalendar().week
    return getattr(accessor, attr)


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    ``dt`` must contain the columns ``id``, ``sales`` and a datetime ``date``.
    The following columns are written (nothing is returned):

    * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 / 28 rows within each ``id``.
    * ``rmean_{lag}_{win}``: rolling mean over ``win`` rows (7 or 28) of the
      corresponding lag column, again computed per ``id``.
    * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
      int16. A column that already exists is only re-cast; a missing one is
      derived from ``date``.

    Shifts and rolling windows are row based, so rows are expected to be in
    chronological order within each ``id``.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):  # (7, "lag_7"), (28, "lag_28")
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            # Rolling over the lagged series keeps the window strictly in the past.
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # output column -> datetime component it is derived from
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            values = dt[date_feat_name]
        else:
            values = _date_part(dt["date"], date_feat_func)
        dt[date_feat_name] = values.astype("int16")

In [13]:
# Adds the lag, rolling-mean and calendar columns to `df` in place;
# create_fea has no return value.
create_fea(df)

In [14]:
# Preview the engineered frame; the earliest rows of each id have NaN lag and
# rolling features because there is no history to shift from yet.
df.head(3)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1850,0.0,2016-02-21,11604,...,8.26,,,,,,,7,1,21
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1851,4.0,2016-02-22,11604,...,8.26,,,,,,,8,1,22
2,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1852,0.0,2016-02-23,11604,...,8.26,,,,,,,8,1,23
3,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1853,1.0,2016-02-24,11604,...,8.26,,,,,,,8,1,24
4,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,1854,0.0,2016-02-25,11604,...,8.26,,,,,,,8,1,25


In [15]:
# Remove, in place, every row that has a NaN in any column (no subset is
# given, so this is not limited to the lag/rolling features), then show the
# remaining shape.
df.dropna(inplace = True)
df.shape

(274410, 31)

In [16]:
# List all columns available after feature engineering.
df.columns

Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_7', 'lag_28',
       'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28', 'week',
       'quarter', 'mday'],
      dtype='object')

In [17]:
# Identifier and event-type columns; defined here but not read again in the
# visible cells.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_type_1", "event_type_2",
]
# Columns that must not be fed to the model: keys, the target itself and raw
# calendar/event labels.
useless_cols = [
    "id", "date", "sales", "d", "wm_yr_wk", "weekday",
    'event_name_1', 'event_name_2',
]
# Everything else, in the frame's column order, is a model input.
train_cols = df.columns[np.logical_not(df.columns.isin(useless_cols))]
print(train_cols)
X_train = df[train_cols]
y_train = df["sales"]

Index(['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'wday', 'month',
       'year', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7',
       'rmean_7_28', 'rmean_28_28', 'week', 'quarter', 'mday'],
      dtype='object')


In [18]:
# Display the selected model input columns.
train_cols

Index(['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'wday', 'month',
       'year', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7',
       'rmean_7_28', 'rmean_28_28', 'week', 'quarter', 'mday'],
      dtype='object')

In [19]:
# Preview the design matrix passed to the model.
X_train.head(3)

Unnamed: 0,item_id,dept_id,store_id,cat_id,state_id,wday,month,year,event_type_1,event_type_2,...,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
1676950,0,0,0,0,0,1,4,2016,0,0,...,8.38,1.0,1.0,1.571429,1.0,1.071429,1.035714,15,2,16
1676951,0,0,0,0,0,2,4,2016,0,0,...,8.38,2.0,3.0,1.714286,1.428571,1.142857,1.142857,15,2,17
1676952,0,0,0,0,0,3,4,2016,0,0,...,8.38,0.0,1.0,1.714286,1.571429,1.142857,1.035714,16,2,18
1676953,0,0,0,0,0,4,4,2016,0,0,...,8.38,0.0,2.0,1.142857,1.714286,1.107143,1.107143,16,2,19
1676954,0,0,0,0,0,5,4,2016,0,0,...,8.38,0.0,2.0,0.857143,1.857143,1.071429,1.142857,16,2,20


In [20]:
# Fit a ridge regression on the engineered features. `Ridge` is the estimator
# imported at the top of the notebook; alpha=1.0 is its regularisation
# strength (and the library default).
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [22]:
# Multipliers applied to the ridge predictions in the forecasting loop:
# a single 1 for the local demo, three values for the kaggle run.
# Earlier kaggle candidates: [1.015,1.25,1.03], [1.0,1.15,1.01]
alphas = [1] if IS_LOCAL_DEMO else [1.035, 1.03, 1.025]

# Every alpha gets the same share of the blended forecast.
weights = [1 / len(alphas) for _ in alphas]
print(weights)

[1.0]


In [24]:
# Recursive 28-day forecast, blended over the `alphas` multipliers.
# `sub` ends up as one row per id with the columns F1..F28.
sub = 0.
cols = [f"F{i}" for i in range(1,29)]

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):  # e.g. [(1.035, 0.333), (1.03, 0.333), (1.025, 0.333)]
    # Work on a private copy: predictions are written into the "sales" column
    # day by day, and the loaded frame `dt` must not be overwritten by them.
    te = dt.copy()
    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(icount, day)
        # Only the trailing `max_lags` days are needed to rebuild the lag and
        # rolling features for `day`.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        # Store the scaled prediction so later days can use it as a lag value.
        # NOTE(review): ridge output is unbounded and the recorded forecasts
        # contain negative sales -- consider whether clipping at 0 is wanted.
        te.loc[te.date == day, "sales"] = alpha * model_ridge.predict(tst)

    # Reshape the forecast rows from long (id, date) to wide (id, F1..F28).
    # The slice is copied so that adding the "F" column does not write into `te`.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    # Accumulate the weighted average over all alphas.
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


0 2016-04-25 00:00:00
0 2016-04-26 00:00:00
0 2016-04-27 00:00:00
0 2016-04-28 00:00:00
0 2016-04-29 00:00:00
0 2016-04-30 00:00:00
0 2016-05-01 00:00:00
0 2016-05-02 00:00:00
0 2016-05-03 00:00:00
0 2016-05-04 00:00:00
0 2016-05-05 00:00:00
0 2016-05-06 00:00:00
0 2016-05-07 00:00:00
0 2016-05-08 00:00:00
0 2016-05-09 00:00:00
0 2016-05-10 00:00:00
0 2016-05-11 00:00:00
0 2016-05-12 00:00:00
0 2016-05-13 00:00:00
0 2016-05-14 00:00:00
0 2016-05-15 00:00:00
0 2016-05-16 00:00:00
0 2016-05-17 00:00:00
0 2016-05-18 00:00:00
0 2016-05-19 00:00:00
0 2016-05-20 00:00:00
0 2016-05-21 00:00:00
0 2016-05-22 00:00:00
0 1 1.0


(30490, 29)
Index(['id', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
       'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
       'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28'],
      dtype='object', name='F')


F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.601232,1.03122,1.040118,0.762508,1.367211,1.148543,-0.383512,-0.028097,-0.482517,...,-1.067345,-0.590698,-0.852996,-0.844406,-1.324728,-1.351195,-1.512323,-1.452185,-1.151026,-1.334869
1,FOODS_1_001_CA_2_validation,0.621037,0.813414,0.83709,1.431999,0.819426,1.075763,0.356675,-0.373753,-0.339171,...,-0.651151,-0.461867,0.948764,-1.241898,-1.151463,-1.146332,-0.92,-1.422375,-1.141494,-0.473909
2,FOODS_1_001_CA_3_validation,0.635827,0.639292,0.750566,0.633107,0.639398,1.202239,-0.538861,-0.748601,-0.718591,...,-1.154952,-0.767216,0.462925,-1.62367,-1.668396,-1.693415,-1.725641,-1.755335,-1.378947,-1.286919
3,FOODS_1_001_CA_4_validation,0.572057,0.020477,0.029375,0.038055,0.230432,0.534725,-0.811026,-1.106357,-1.308186,...,-1.756627,-1.471162,-1.531343,-1.76636,-2.066335,-2.109269,-2.15667,-2.099751,-1.852913,-1.92002
4,FOODS_1_001_TX_1_validation,-0.083,-0.07432,-0.065422,-0.056742,-0.047625,0.223919,-1.162185,-1.508452,-1.39848,...,-2.023136,-1.74474,-1.818682,-2.266816,-2.314088,-2.371259,-2.424269,-2.483296,-2.218417,-2.271463


In [27]:

# Duplicate the forecasts under the "_evaluation" ids and append them below
# the "_validation" rows.
sub2 = sub.copy()
# "validation$" is a regular expression anchored at the end of the id, so
# regex=True is passed explicitly: pandas versions whose default is
# regex=False would search for the literal "$" and rename nothing.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)


In [28]:
# Sanity-check the combined submission: its shape, its columns and the first rows.
print(sub.shape)
print(sub.columns)
sub.head()

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.601232,1.031220,1.040118,0.762508,1.367211,1.148543,-0.383512,-0.028097,-0.482517,...,-1.067345,-0.590698,-0.852996,-0.844406,-1.324728,-1.351195,-1.512323,-1.452185,-1.151026,-1.334869
1,FOODS_1_001_CA_2_validation,0.621037,0.813414,0.837090,1.431999,0.819426,1.075763,0.356675,-0.373753,-0.339171,...,-0.651151,-0.461867,0.948764,-1.241898,-1.151463,-1.146332,-0.920000,-1.422375,-1.141494,-0.473909
2,FOODS_1_001_CA_3_validation,0.635827,0.639292,0.750566,0.633107,0.639398,1.202239,-0.538861,-0.748601,-0.718591,...,-1.154952,-0.767216,0.462925,-1.623670,-1.668396,-1.693415,-1.725641,-1.755335,-1.378947,-1.286919
3,FOODS_1_001_CA_4_validation,0.572057,0.020477,0.029375,0.038055,0.230432,0.534725,-0.811026,-1.106357,-1.308186,...,-1.756627,-1.471162,-1.531343,-1.766360,-2.066335,-2.109269,-2.156670,-2.099751,-1.852913,-1.920020
4,FOODS_1_001_TX_1_validation,-0.083000,-0.074320,-0.065422,-0.056742,-0.047625,0.223919,-1.162185,-1.508452,-1.398480,...,-2.023136,-1.744740,-1.818682,-2.266816,-2.314088,-2.371259,-2.424269,-2.483296,-2.218417,-2.271463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_evaluation,0.177986,0.184059,0.481418,0.312561,0.195102,0.460487,-0.923737,-1.218775,-1.100798,...,-1.667665,-1.274728,-1.336364,-1.899423,-1.955574,-1.895780,-2.066901,-2.113445,-1.817486,-1.858092
30486,HOUSEHOLD_2_516_TX_3_evaluation,0.084634,0.193082,0.177780,0.057496,0.066176,0.335113,-1.045776,-1.358206,-1.223975,...,-1.856404,-1.551562,-1.634652,-2.075139,-2.116982,-2.163433,-2.214340,-2.262916,-1.989214,-2.038490
30487,HOUSEHOLD_2_516_WI_1_evaluation,-0.028701,-0.022628,-0.013948,-0.005268,-0.002748,0.268796,-1.109486,-1.443030,-1.434152,...,-1.790985,-1.635793,-1.700126,-2.141887,-2.188075,-2.235439,-2.283982,-2.314426,-2.065555,-2.113924
30488,HOUSEHOLD_2_516_WI_2_evaluation,-0.082355,-0.073675,-0.064995,0.129989,-0.015104,0.256440,-1.121841,-1.465382,-1.457764,...,-1.922991,-1.639618,-1.704746,-2.147540,-2.194575,-2.242821,-2.189366,-2.366929,-2.098320,-2.146808


In [None]:
# Write the submission file; the local demo output is gzip-compressed.
if not IS_LOCAL_DEMO:
    sub.to_csv("submission_ridge.csv", index=False)  # for kaggle
else:
    sub.to_csv("submission_ridge.csv.gz", index=False, compression='gzip')  # for local demo


## Ridge Kaggle result: 1.37491
