In [1]:
import os
import random
from joblib import dump
import numpy as np
import pandas as pd
import calendar as cal
#from sklearn.category_encoders import TargetEncoders
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb

from src.features.build_train_features import *
from src.features.build_test_features import *
from src.models.preprocess import *

In [2]:
calendar_path = os.path.join('..', 'data', 'raw', 'calendar.csv')
sales_train_path = os.path.join('..', 'data', 'raw', 'sales_train_validation.csv')
price_path = os.path.join('..', 'data', 'raw', 'sell_prices.csv')
sample_path = os.path.join('..', 'data', 'raw', 'sample_submission.csv')

calendar = pd.read_csv(calendar_path, parse_dates=['date'])
sales = pd.read_csv(sales_train_path)
prices = pd.read_csv(price_path)
sample = pd.read_csv(sample_path)

In [12]:
ar_lags = {'days':[7, 28],
           'months':[1],
           'years':[1]}

cat_features = ['weekday', 'month', 'year',  'store_id',
                'cat_id', 'state_id', 'event_name_1', 'event_name_2']
quant_features = ['days', 'sell_price', 'mean']
str_features = ['store_id', 'cat_id', 'state_id', 'event_name_1', 'event_name_2', 'weekday']

ma_periods = [30]
other_features = ['days', 'weekday'] #, # 'store_id',
                 # 'event_name_1', 'event_name_2']
arma_prefixes = ['ma_', 'days_', 'months_', 'years_']

fold = 1
scores = []
importances = []

for train_idx, test_idx in TimeSeriesSplit(3).split(calendar[calendar.d.isin(sales.columns)]):
    
    train_dates = calendar.date[train_idx].values
    test_dates = calendar.date[test_idx].values
    
    train_features = build_train_features(train_dates=train_dates, ar_lags=ar_lags,
                                          ma_periods=ma_periods, other_features=other_features,
                                          calendar=calendar, sales=sales.head(1000), prices=prices)
    arma_features = [col for col in train_features.columns
                     if any([ap in col for ap in arma_prefixes])]
    feature_names = other_features + arma_features
    cat_names = [fn for fn in feature_names if fn in cat_features]
    quant_names = arma_features + [fn for fn in feature_names if fn in quant_features]
    str_names = [fn for fn in feature_names if fn in str_features]
    
    cat_df, les = le_cat_features(train_features, str_names)
    
    X_train = pd.concat([train_features[quant_names].reset_index(drop=True), cat_df], axis=1)
    
    #X_train = train[quant_features + cat_features]
    y_train = train_features.value
    
    lgb_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_names, free_raw_data=False)
    
    print('Finished building lgb_data for fold ' + str(fold) + '...')
    
    params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }
    
    reg = lgb.train(params, lgb_data)

    print('Finished training lgb model for fold ' + str(fold) + '...')
    
    test_features = build_test_features(test_dates=test_dates, train_dates=train_dates, ar_lags=ar_lags,
                                        ma_periods=ma_periods, other_features=other_features,
                                        calendar=calendar, sales=sales.head(1000), prices=prices)
    
    print('Finished building test features for fold ' + str(fold) + '...')
        
    cat_test = [le.transform(test_features[str_names[i]]) for i, le in enumerate(les)]
    cat_test_df = pd.concat([pd.DataFrame(cp) for cp in cat_test], axis=1)
    cat_test_df.columns = str_names

    print('Finished label encoding for fold ' + str(fold) + '...')
    
    X_test = pd.concat([test_features[quant_names], cat_test_df], axis=1)
    
    print('Finished making X_test for fold ' + str(fold) + '...')
    
    y_test = sales.head(1000)[calendar[calendar.date.isin(test_dates)].d].values.flatten('F')
        
    scores.append(MSE(y_test, reg.predict(X_test)))
    
    importances.append(reg.feature_importance())
        
    #test_features['forecast'] = reg.predict(X_test)
            
    print('Finished predicting for fold ' + str(fold) + '...')
    
    fold+=1

Finished with pre-merge steps
Finished merging
Added days
Finished building lgb_data for fold 1...




Finished training lgb model for fold 1...
Added arma features...
Finished building test features for fold 1...
Finished label encoding for fold 1...
Finished making X_test for fold 1...
Finished predicting for fold 1...
Finished with pre-merge steps
Finished merging
Added days
Finished building lgb_data for fold 2...




Finished training lgb model for fold 2...
Added arma features...
Finished building test features for fold 2...
Finished label encoding for fold 2...
Finished making X_test for fold 2...
Finished predicting for fold 2...
Finished with pre-merge steps
Finished merging
Added days
Finished building lgb_data for fold 3...




Finished training lgb model for fold 3...
Added arma features...
Finished building test features for fold 3...
Finished label encoding for fold 3...
Finished making X_test for fold 3...
Finished predicting for fold 3...


In [18]:
reg.feature_name()

['days_7', 'days_28', 'months_1', 'years_1', 'ma_30', 'days', 'weekday']

In [19]:
importances

[array([26775,     0,     0,     0, 54664, 50190, 20771], dtype=int32),
 array([27438,     0,     0,     0, 49903, 52792, 22267], dtype=int32),
 array([27762,     0,     0,     0, 48394, 53453, 22791], dtype=int32)]

In [20]:
np.mean(importances, axis=0).tolist()

[27325.0, 0.0, 0.0, 0.0, 50987.0, 52145.0, 21943.0]

In [21]:
def draw_id(file_path, create_new=False, size=10000):
    if create_new:
        ids = random.sample(range(size), size)
    else:
        with open(file_path, 'rb') as f:
            ids = load(f)
    model_id = ids.pop(0)
    with open(file_path, 'wb') as f:
        dump(ids, f)
    return model_id

def save_metadata(model, model_id, model_score, importances, dir_path):
    meta_path = os.path.join(dir_path, ''.join(['model_', str(model_id), '_metadata.csv']))
    model_meta = pd.Series({'model_id': model_id,
                        'feature_names': list(model.feature_name()),
                        'feature_importances': importances,
                        'score': model_score})
    model_meta.to_csv(meta_path, header=False)
    return model_meta

In [24]:
model_score = np.mean(scores)
mn_importances = np.mean(importances, axis=0).tolist()

print('Finished cross validating. Model score: ' + str(model_score))

model_id = draw_id(os.path.join('..', 'models', 'model_ids.joblib'), create_new=True)

meta_dir = os.path.join('..', 'models', 'metadata')

model_meta = save_metadata(model=reg, model_id=model_id, model_score=model_score,
                           importances=mn_importances, dir_path=meta_dir)

best_model_path = os.path.join('..', 'models', 'metadata', 'best_model.csv')

if os.path.exists(best_model_path):
    best_model = pd.read_csv(best_model_path, squeeze=True, index_col=0)
else:
    best_model = model_meta

if model_score <= float(best_model.loc['score']):
    
    print('(this is the best model so far)')
    
    model_meta.to_csv(best_model_path, header=False)


Finished scoring. Model score: 4.869316916367562
(this is the best model so far)


In [8]:
sales.head(1000)[calendar[calendar.date.isin(test_dates)].d].values.flatten('F')

array([0, 0, 0, ..., 0, 1, 0])

In [7]:
test_features[quant_names].shape

(478000, 6)

In [9]:
cat_test_df.shape

(478000, 1)

In [10]:
cat_test_df.head()

Unnamed: 0,weekday
0,5
1,5
2,5
3,5
4,5


In [15]:
cat_test_df

Unnamed: 0,weekday
0,5
1,5
2,5
3,5
4,5
...,...
477995,6
477996,6
477997,6
477998,6


In [16]:
pd.concat([test_features[quant_names].reset_index(drop=True), cat_test_df], axis=1)

Unnamed: 0,days_7,days_28,months_1,years_1,ma_30,days,weekday
0,0,0,0,0,0.000000,479,5
1,0,0,0,0,0.166667,479,5
2,0,0,0,0,0.000000,479,5
3,0,0,0,1,0.900000,479,5
4,0,0,0,6,0.333333,479,5
...,...,...,...,...,...,...,...
477995,2,5,3,6,2.133333,956,6
477996,0,0,0,0,0.000000,956,6
477997,0,0,0,0,0.000000,956,6
477998,0,0,2,2,2.266667,956,6


In [8]:
sales_df = sales[calendar[calendar.date.isin(train_dates)].d].T
sales_df['date'] = train_dates
sales_df.set_index('date', inplace=True)

sales_values = pd.melt(sales_df.reset_index(), id_vars='date').dropna()

In [10]:
sales_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
2011-01-30,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
2011-01-31,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
2011-02-01,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0
2011-02-02,0,0,0,0,0,0,0,0,0,0,...,0,5,1,0,2,0,0,2,0,0


In [12]:
np.tile(sales_df.mean(), 10)

array([0.        , 0.1691023 , 0.        , ..., 1.38622129, 0.        ,
       0.        ])

In [7]:
sales_df = sales[calendar[calendar.date.isin(test_dates)].d].T
sales_df['date'] = test_dates
sales_df.set_index('date', inplace=True)

In [8]:
test_date = test_dates[0]
lags = ar_lags
test_date = pd.to_datetime(test_date)
offsets = [[*map(lambda e: pd.DateOffset(**{k:e}), v)]
            for k,v in lags.items()]
offsets = [*chain.from_iterable(offsets)]
offsets_used = offsets.copy()
for i, offset in enumerate(offsets):
    multiple = 1
    while test_date - offsets_used[i] not in sales_df.index:
        multiple+=1
        offsets_used[i] = offset * multiple
        if multiple > 200:
            print("Couldn't find ar feature for " + str(offset))
            break
ar_dfs = [sales_df.loc[test_date - ou] for ou in offsets_used]
ar_df = pd.concat(ar_dfs, axis=1)
ar_df.columns = [offset.freqstr[13:-1].replace('=', '_')
                 for offset in offsets]

Couldn't find ar feature for <DateOffset: days=7>
Couldn't find ar feature for <DateOffset: days=28>
Couldn't find ar feature for <DateOffset: months=1>
Couldn't find ar feature for <DateOffset: years=1>


KeyError: Timestamp('2008-07-15 00:00:00')

In [18]:
pd.to_datetime?

In [17]:
sales_df.index

DatetimeIndex(['2012-05-22', '2012-05-23', '2012-05-24', '2012-05-25',
               '2012-05-26', '2012-05-27', '2012-05-28', '2012-05-29',
               '2012-05-30', '2012-05-31',
               ...
               '2013-09-02', '2013-09-03', '2013-09-04', '2013-09-05',
               '2013-09-06', '2013-09-07', '2013-09-08', '2013-09-09',
               '2013-09-10', '2013-09-11'],
              dtype='datetime64[ns]', name='date', length=478, freq=None)

In [15]:
sales_df.loc[test_date - offsets[0]]

KeyError: Timestamp('2012-05-15 00:00:00')

In [13]:
[sales_df.loc[test_date - ou] for ou in offsets]

KeyError: Timestamp('2012-05-15 00:00:00')

In [None]:
sales_df.

In [11]:
test_date - offsets[0]

Timestamp('2012-05-15 00:00:00')

In [4]:
make_ar_test(sales_df, date, ar_lags)

NameError: name 'sales_df' is not defined

In [32]:
sales_df = sales[calendar[calendar.date.isin(test_dates)].d].T
sales_df['date'] = test_dates
sales_df.set_index('date', inplace=True)

sales_values = pd.melt(sales_df.reset_index(), id_vars='date').dropna()

In [23]:
def make_ar_test(sales_df, test_date, lags):
    offsets = [[*map(lambda e: pd.DateOffset(**{k:e}), v)]
                for k,v in lags.items()]
    offsets = [*chain.from_iterable(offsets)]
    offsets_used = offsets.copy()
    for i, offset in enumerate(offsets):
        print(offset)
        multiple = 1
        while test_date - offsets_used[i] not in sales_df.index:
            multiple+=1
            offsets_used[i] = offset * multiple
    ar_dfs = [sales_df.loc[date - ou] for ou in offsets_used]
    ar_df = pd.concat(ar_dfs, axis=1)
    ar_df.columns = [offset.freqstr[13:-1].replace('=', '_')
                     for offset in offsets]
    return ar_df

In [27]:
pd.to_datetime(test_dates[0]) - pd.DateOffset(days=7)

Timestamp('2012-05-08 00:00:00')

In [24]:
make_ar_test(sales_df, test_dates[0], ar_lags)

<DateOffset: days=7>


UFuncTypeError: ufunc 'subtract' cannot use operands with types dtype('<M8[ns]') and dtype('O')

In [None]:
make_ma_test(sales_df, date, ar_lags)

In [8]:
make_ar_test(sales, date, ar_lags)

KeyboardInterrupt: 

In [80]:
%%timeit
sales_df = sales[calendar[calendar.date.isin(train_dates)].d].T
sales_df['date'] = train_dates
sales_df.set_index('date', inplace=True)

119 ms ± 8.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
#add sell_price, mean later
quant_features = ['last_week', 'last_year', 'days', 'ma30']
cat_features = ['weekday', 'store_id', 'event_name_1']

In [17]:
def build_features(dates, ar_lags, ma_periods, other_features,
                   calendar=calendar, sales=sales, prices=prices):
    
    sales_df = sales[calendar[calendar.date.isin(dates)].d].T
    sales_df['date'] = dates
    sales_df.set_index('date', inplace=True)

    sales_values = pd.melt(sales_df.reset_index(), id_vars='date').dropna()
    
    lag_df = make_ar_features(sales_df, ar_lags)
    ma_df = make_ma_features(sales_df, ma_periods)

    print('Finished with pre-merge steps')

    feature_df = (sales_values.merge(lag_df, how='left', on=['date', 'variable'])
                              .merge(ma_df, how='left', on=['date', 'variable']))
    
    cal_features = ['weekday', 'event_name_1', 'event_name_2']
    cal_features = [cf for cf in cal_features if cf in other_features]
    
    if 'sell_price' in other_features:
        cal_features = cal_features + ['wm_yr_wk']
    
    if len(cal_features) > 0:
        feature_df = feature_df.merge(calendar[cal_features + ['date']], how='left', on='date')
    
    if 'mean' in other_features:
        mean_df = sales_df.mean().reset_index()
        mean_df.columns = ['variable', 'mean']
        feature_df = feature_df.merge(mean_df, how='left', on='variable')
        
    feature_df = feature_df.merge(sales[['item_id', 'cat_id', 'store_id']], how='left',
                                  left_on='variable', right_index=True)    
    
    if 'sell_price' in other_features:
        feature_df = feature_df.merge(prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
            
    print('Finished merging')
    
    if 'month' in other_features:
        feature_df['month'] = feature_df.date.dt.month
    
    if 'year' in other_features:
        feature_df['year'] = feature_df.date.dt.year
    
    if 'days' in other_features:
        feature_df['days'] = [diff.days for diff in (feature_df.date - calendar.date.min())]
    
    print('Added days')
    
    feature_df.sort_values(['variable', 'date'], inplace=True)
    
    print('Sorted')
    
    #feature_df.fillna(method='bfill', inplace=True)
    #print('Filled NAs')
    
    return feature_df

In [20]:
str_names

['store_id', 'event_name_1', 'event_name_2']

In [26]:
str_names

['store_id', 'event_name_1', 'event_name_2']

In [34]:
test_date = calendar.date[test_idx[0]]

In [56]:
sales_df = sales[calendar.d[calendar.date.isin(train_dates)]].T
sales_df['date'] = calendar.date[train_idx].values
sales_df.set_index('date', inplace=True)

In [None]:
def make_ar_train(sales_df, lags):
    offsets = [[*map(lambda e: pd.DateOffset(**{k:e}), v)]
                for k,v in lags.items()]
    offsets = [*chain.from_iterable(offsets)]
    lag_dfs = [pd.melt(sales_df.shift(freq=offset).reset_index(),
               id_vars='date',
               value_name=offset.freqstr[13:-1].replace('=', '_'))
               for offset in offsets]

    lag_df = pd.concat(lag_dfs, axis=1)

    return lag_df.loc[:,~lag_df.columns.duplicated()]

In [67]:
sales_df.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-09,0,0,1,0,0,0,0,1,1,0,...,1,2,1,1,0,0,0,1,0,1
2015-02-10,1,0,0,1,1,0,0,29,2,0,...,2,0,1,1,0,0,0,1,0,6
2015-02-11,0,0,0,1,2,0,0,2,3,1,...,5,1,0,2,0,1,0,3,0,2
2015-02-12,0,0,1,1,0,0,0,3,0,1,...,3,0,0,1,0,0,0,2,1,0
2015-02-13,3,0,0,1,1,0,0,15,0,2,...,0,3,2,2,0,2,0,1,0,4


In [68]:
test_date

Timestamp('2015-02-14 00:00:00')

In [71]:
sales_df.loc[test_date - pd.DateOffset(days=1)]

0        3
1        0
2        0
3        1
4        1
        ..
30485    2
30486    0
30487    1
30488    0
30489    4
Name: 2015-02-13 00:00:00, Length: 30490, dtype: int64

In [78]:
from itertools import chain

In [79]:
make_ar_pred(sales_df, test_date, ar_lags)

Unnamed: 0,days_7,days_28,months_1,years_1
0,2,1,2,0
1,0,0,0,0
2,0,0,0,1
3,10,2,4,0
4,5,1,1,0
...,...,...,...,...
30485,0,0,0,0
30486,0,0,0,0
30487,4,0,3,0
30488,0,0,0,0


In [75]:
def make_ar_pred(sales_df, date, lags):
    offsets = [[*map(lambda e: pd.DateOffset(**{k:e}), v)]
                for k,v in lags.items()]
    offsets = [*chain.from_iterable(offsets)]
    ar_features = [sales_df.loc[date - offset] for offset in offsets]
    feature_df = pd.concat(ar_features, axis=1)
    feature_df.columns = [offset.freqstr[13:-1].replace('=', '_')
                          for offset in offsets]
    return feature_df

In [None]:
    for date in calendar_df.date[test_idx]:
        ma_features = build_ma_features(df, date, [30])
        ar_features = build_ar_features(df, date, [7])
        last_year = df[date - pd.DateOffset(years=1)].rename('last_year')
        features = pd.concat([ma_features, ar_features, last_year], axis=1)
        features['date'] = date
        #features['mean'] = item_mean
        #features['month'] = months[date.month]
        features['year'] = date.year
        features['days'] = (date - start_date).days
        features['weekday'] = calendar.weekday[calendar.date == date].values[0]
        features['store_id'] = sales_train['store_id']
        features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]


        X_cat = dv.transform(features[cat_features].to_dict('records'))
        X_test = np.c_[features[quant_features], X_cat]
        #X_test = sc.transform(X_test)
        features['forecast'] = reg.predict(X_test)
        forecast_row = pd.pivot(features, columns='date', values='forecast').T
        df = df.append(forecast_row)

In [31]:
ar_lags = {'days':[7, 28],
           'months':[1],
           'years':[1]}

cat_features = ['weekday', 'month', 'year', 'store_id',
                'cat_id', 'state_id', 'event_name_1', 'event_name_2']
quant_features = ['days', 'sell_price', 'mean']
str_features = ['store_id', 'cat_id', 'state_id', 'event_name_1', 'event_name_2', 'weekday']

ma_periods = [30]
other_features = ['days', 'weekday', 'store_id', 'event_name_1', 'event_name_2']
arma_prefixes = ['ma_', 'days_', 'months_', 'years_']

#model_id = draw_id('../models/model_ids.joblib')

fold=1
for train_idx, test_idx in TimeSeriesSplit(3).split(calendar):
    train_dates = calendar.date[train_idx].values
    train_features = build_train_features(dates=train_dates, ar_lags=ar_lags,
                                          ma_periods=ma_periods, other_features=other_features,
                                          sales=sales.head(1000))
    arma_features = [col for col in train_features.columns
                     if any([ap in col for ap in arma_prefixes])]
    feature_names = other_features + arma_features
    cat_names = [fn for fn in feature_names if fn in cat_features]
    quant_names = arma_features + [fn for fn in feature_names if fn in quant_features]
    str_names = [fn for fn in feature_names if fn in str_features]
    
    cat_df, les = le_cat_features(train_features, str_names)
    
    X_train = pd.concat([train_features[quant_names].reset_index(drop=True), cat_df], axis=1)
    
    #X_train = train[quant_features + cat_features]
    y_train = train_features.value
    
    lgb_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_names, free_raw_data=False)
    
    print('Finished building lgb_data for fold ' + str(fold) + '...')
    
    params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }
    
    reg = lgb.train(params, lgb_data)
    
    print('Finished training lgb model for fold ' + str(fold) + '...')
    
    sales_vals = sales[calendar[calendar.date.isin(train_dates)].d].T
    sales_vals['date'] = train_dates
    sales_vals.set_index('date', inplace=True)
    
    for date in calendar_df.date[test_idx]:
        pred_features = build_pred_features(date)
        
        cat_pred = [sle.transform(pred_features[str_names[i]]) for i, le in enumerate(les)]
        cat_pred_df = pd.concat(cat_preds, axis=1)
        
        X_test = pd.concat([pred_features[quant_names], cat_df], axis=1)
        
        pred_features['forecast'] = reg.predict(X_test)
        forecast_row = pd.pivot(features, columns='date', values='forecast').T
        sales_vals = sales_vals.append(forecast_row)
    
    fold+=1

Finished with pre-merge steps
Finished merging
Added days
Sorted
Finished building lgb_data for fold 1...
Finished training lgb model for fold 1...
Finished with pre-merge steps
Finished merging
Added days
Sorted
Finished building lgb_data for fold 1...
Finished training lgb model for fold 1...
Finished with pre-merge steps
Finished merging
Added days
Sorted
Finished building lgb_data for fold 1...
Finished training lgb model for fold 1...


In [91]:
def build_pred_features(pred_date, ar_lags=ar_lags, ma_periods=ma_periods,
                        other_features=other_features, train_dates=train_dates,
                        calendar=calendar, sales=sales, prices=prices):

    sales_df = sales[calendar[calendar.date.isin(train_dates)].d].T
    sales_df['date'] = train_dates
    sales_df.set_index('date', inplace=True)

    pred_mas = make_ma_pred(sales_df, pred_date, ma_periods)
    pred_ars = make_ar_pred(sales_df, pred_date, ar_lags)

    pred_features = pd.concat([pred_mas, pred_ars], axis=1)

    if 'mean' in other_features:
        pred_features['mean'] = sales_df.mean()
    if 'month' in other_features:
        pred_features['month'] = pred_date.month
    if 'year' in other_features:
        pred_features['year'] = pred_date.year
    if 'days' in other_features:
        pred_features['days'] = (pred_date - calendar.date.min()).days
    if 'weekday' in other_features:
        pred_features['weekday'] = calendar.weekday[calendar.date == pred_date].values[0]
    if 'store_id' in other_features:
        pred_features['store_id'] = sales['store_id']
    if 'event_name_1' in other_features:
        pred_features['event_name_1'] = calendar.event_name_1[calendar.date == pred_date].values[0]
    if 'event_name_2' in other_features:
        pred_features['event_name_2'] = calendar.event_name_2[calendar.date == pred_date].values[0]
    if 'sell_price' in other_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == pred_date].wm_yr_wk.values[0]]
        pred_features['sell_price'] = date_prices.sell_price

    return pred_features

In [87]:
def make_ar_pred(sales_df, date, lags):
    offsets = [[*map(lambda e: pd.DateOffset(**{k:e}), v)]
                for k,v in lags.items()]
    offsets = [*chain.from_iterable(offsets)]
    ar_dfs = [sales_df.loc[date - offset] for offset in offsets]
    ar_df = pd.concat(ar_dfs, axis=1)
    ar_df.columns = [offset.freqstr[13:-1].replace('=', '_')
                     for offset in offsets]
    return ar_df

def make_ma_pred(sales_df, date, periods):
    yesterday = date - pd.DateOffset(days=1)
    ma_features = [sales_df[date - pd.DateOffset(days=period):yesterday].mean()
                   for period in periods]
    feature_df =  pd.concat(ma_features, axis=1)
    feature_df.columns = [''.join(['ma_', period])
                          for period in map(str, periods)]
    return feature_df

In [94]:
build_pred_features(test_date)

Unnamed: 0,ma_30,days_7,days_28,months_1,years_1,days,weekday,store_id,event_name_1,event_name_2
0,0.666667,2,1,2,0,1477,Saturday,CA_1,ValentinesDay,
1,0.066667,0,0,0,0,1477,Saturday,CA_1,ValentinesDay,
2,0.366667,0,0,0,1,1477,Saturday,CA_1,ValentinesDay,
3,2.333333,10,2,4,0,1477,Saturday,CA_1,ValentinesDay,
4,1.133333,5,1,1,0,1477,Saturday,CA_1,ValentinesDay,
...,...,...,...,...,...,...,...,...,...,...
30485,0.666667,0,0,0,0,1477,Saturday,WI_3,ValentinesDay,
30486,0.066667,0,0,0,0,1477,Saturday,WI_3,ValentinesDay,
30487,1.033333,4,0,3,0,1477,Saturday,WI_3,ValentinesDay,
30488,0.033333,0,0,0,0,1477,Saturday,WI_3,ValentinesDay,


In [None]:
model_id = draw_id('../models/model_ids.joblib')

fold=1
for train_idx, test_idx in TimeSeriesSplit(3).split(calendar):
    train_dates = calendar_df.date[train_idx].values
    train_features = build
    
    
    train = df[df.date.isin(calendar_df.date[train_idx])]
    test = df[df.date.isin(calendar_df.date[test_idx])]
    
    print('Finished with train/test split for fold ' + str(fold) + '...')
    
    cat_df, les = le_cat_features(train, cat_features)
    
    X_train = pd.concat([train[quant_features].reset_index(drop=True), cat_df], axis=1)
    
    #LEFT OFF HERE 
    
    #X_train = train[quant_features + cat_features]
    y_train = train.value
    
    lgb_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_features, free_raw_data=False)
    
    print('Finished building lgb_data for fold ' + str(fold) + '...')
    
    params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }
    
    reg = lgb.train(params, lgb_data)
    
    print('Finished training lgb model for fold ' + str(fold) + '...')
    
    #testing
    for date in calendar_df.date[test_idx]:
        ma_features = build_ma_features(df, date, [30])
        ar_features = build_ar_features(df, date, [7])
        last_year = df[date - pd.DateOffset(years=1)].rename('last_year')
        features = pd.concat([ma_features, ar_features, last_year], axis=1)
        features['date'] = date
        #features['mean'] = item_mean
        #features['month'] = months[date.month]
        features['year'] = date.year
        features['days'] = (date - start_date).days
        features['weekday'] = calendar.weekday[calendar.date == date].values[0]
        features['store_id'] = sales_train['store_id']
        features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]


        X_cat = dv.transform(features[cat_features].to_dict('records'))
        X_test = np.c_[features[quant_features], X_cat]
        #X_test = sc.transform(X_test)
        features['forecast'] = reg.predict(X_test)
        forecast_row = pd.pivot(features, columns='date', values='forecast').T
        df = df.append(forecast_row)
    
    
    
    
    
    #Testing
sales_train_path = os.path.join('data', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)
price_path = os.path.join('data', 'sell_prices.csv')
if 'snap' in quant_features or 'sell_price' in quant_features:
    prices = pd.read_csv(price_path)

mask = ['d_' in col for col in sales_train.columns]
df = sales_train.iloc[:, mask].T.set_index(calendar.date[:sum(mask)])
df = df[~df.index.isin(test_dates)]



df_last_year = df.set_index(df.index + pd.DateOffset(years=1))

item_mean = df.mean()
start_date = calendar.date.min()

for date in calendar_df.date[test_idx]:
    ma_features = build_ma_features(df, date, [30])
    ar_features = build_ar_features(df, date, [7])
    year_lag = df[date - pd.DateOffset(years=1)].rename('last_year')
    features = pd.concat([ma_features, ar_features, year_lag], axis=1)
    features['date'] = date
    features['mean'] = item_mean
    features['month'] = months[date.month]
    features['year'] = date.year
    features['days'] = (date - start_date).days
    features['weekday'] = calendar.weekday[calendar.date == date].values[0]
    features['store_id'] = sales_train['store_id']
    features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]
    
    
    if 'sell_price' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        features['sell_price'] = date_prices.sell_price
    if 'snap' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        snap_map = {'CA': calendar[calendar.date == date].snap_CA.values[0],
                    'TX': calendar[calendar.date == date].snap_TX.values[0],
                    'WI': calendar[calendar.date == date].snap_WI.values[0]}
        features['snap'] = sales_train[sales_train.item_id.isin(date_prices.item_id.values)].state_id.map(snap_map)
    X_cat = dv.transform(features[cat_features].to_dict('records'))
    X_test = np.c_[features[quant_features], X_cat]
    #X_test = sc.transform(X_test)
    features['forecast'] = reg.predict(X_test)
    forecast_row = pd.pivot(features, columns='date', values='forecast').T
    df = df.append(forecast_row)

y_pred = pd.melt(df[df.index.isin(test_dates)]).value

model_score = MSE(y_pred, y_test)

print('Finished scoring. Model score: ' + str(model_score))

meta_path = os.path.join('models', ''.join(['model_', str(model_id), '_metadata.csv']))
model_meta = pd.Series({'model_id': model_id,
                        'quant_features': quant_features,
                        'cat_features': cat_features,
                        'feature_names': quant_features + dv.get_feature_names(),
                        'feature_importances': reg.feature_importances_,
                        'score': model_score})
model_meta.to_csv(meta_path, header=False)

best_model_path = os.path.join('models', 'best_model.csv')
if os.path.exists(best_model_path):
    best_model = pd.read_csv(best_model_path, squeeze=True, index_col=0)
else:
    best_model = model_meta

if model_score <= float(best_model.loc['score']):
    
    print('(this is the best model so far)')
    
    model_meta.to_csv(best_model_path, header=False)
    
    dv_path = os.path.join('models', ''.join(['dv_', str(model_id), '.joblib']))
    dump(dv, dv_path)
    
    sc_path = os.path.join('models', ''.join(['sc_', str(model_id), '.joblib']))
    dump(sc, sc_path)
    
    reg_path = os.path.join('models', ''.join(['reg_', str(model_id), '.joblib']))
    dump(reg, reg_path)

print('Testing complete')
    
    #reg.fit(X_train, y_train)
#print('Training complete')

    
fold+=1

In [None]:
model_id = draw_id('../models/model_ids.joblib')

fold=1
for train_idx, test_idx in TimeSeriesSplit(3).split(calendar):
    train = df[df.date.isin(calendar_df.date[train_idx])]
    test = df[df.date.isin(calendar_df.date[test_idx])]
    
    print('Finished with train/test split for fold ' + str(fold) + '...')
    
    cat_df, les = le_cat_features(train, cat_features)
    
    X_train = pd.concat([train[quant_features].reset_index(drop=True), cat_df], axis=1)
    
    #LEFT OFF HERE 
    
    #X_train = train[quant_features + cat_features]
    y_train = train.value
    
    lgb_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_features, free_raw_data=False)
    
    print('Finished building lgb_data for fold ' + str(fold) + '...')
    
    params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }
    
    reg = lgb.train(params, lgb_data)
    
    print('Finished training lgb model for fold ' + str(fold) + '...')
    
    #testing
    for date in calendar_df.date[test_idx]:
        ma_features = build_ma_features(df, date, [30])
        ar_features = build_ar_features(df, date, [7])
        last_year = df[date - pd.DateOffset(years=1)].rename('last_year')
        features = pd.concat([ma_features, ar_features, last_year], axis=1)
        features['date'] = date
        #features['mean'] = item_mean
        #features['month'] = months[date.month]
        features['year'] = date.year
        features['days'] = (date - start_date).days
        features['weekday'] = calendar.weekday[calendar.date == date].values[0]
        features['store_id'] = sales_train['store_id']
        features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]


        X_cat = dv.transform(features[cat_features].to_dict('records'))
        X_test = np.c_[features[quant_features], X_cat]
        #X_test = sc.transform(X_test)
        features['forecast'] = reg.predict(X_test)
        forecast_row = pd.pivot(features, columns='date', values='forecast').T
        df = df.append(forecast_row)
    
    
    
    
    
    #Testing
sales_train_path = os.path.join('data', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)
price_path = os.path.join('data', 'sell_prices.csv')
if 'snap' in quant_features or 'sell_price' in quant_features:
    prices = pd.read_csv(price_path)

mask = ['d_' in col for col in sales_train.columns]
df = sales_train.iloc[:, mask].T.set_index(calendar.date[:sum(mask)])
df = df[~df.index.isin(test_dates)]



df_last_year = df.set_index(df.index + pd.DateOffset(years=1))

item_mean = df.mean()
start_date = calendar.date.min()

for date in calendar_df.date[test_idx]:
    ma_features = build_ma_features(df, date, [30])
    ar_features = build_ar_features(df, date, [7])
    year_lag = df[date - pd.DateOffset(years=1)].rename('last_year')
    features = pd.concat([ma_features, ar_features, year_lag], axis=1)
    features['date'] = date
    features['mean'] = item_mean
    features['month'] = months[date.month]
    features['year'] = date.year
    features['days'] = (date - start_date).days
    features['weekday'] = calendar.weekday[calendar.date == date].values[0]
    features['store_id'] = sales_train['store_id']
    features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]
    
    
    if 'sell_price' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        features['sell_price'] = date_prices.sell_price
    if 'snap' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        snap_map = {'CA': calendar[calendar.date == date].snap_CA.values[0],
                    'TX': calendar[calendar.date == date].snap_TX.values[0],
                    'WI': calendar[calendar.date == date].snap_WI.values[0]}
        features['snap'] = sales_train[sales_train.item_id.isin(date_prices.item_id.values)].state_id.map(snap_map)
    X_cat = dv.transform(features[cat_features].to_dict('records'))
    X_test = np.c_[features[quant_features], X_cat]
    #X_test = sc.transform(X_test)
    features['forecast'] = reg.predict(X_test)
    forecast_row = pd.pivot(features, columns='date', values='forecast').T
    df = df.append(forecast_row)

y_pred = pd.melt(df[df.index.isin(test_dates)]).value

model_score = MSE(y_pred, y_test)

print('Finished scoring. Model score: ' + str(model_score))

meta_path = os.path.join('models', ''.join(['model_', str(model_id), '_metadata.csv']))
model_meta = pd.Series({'model_id': model_id,
                        'quant_features': quant_features,
                        'cat_features': cat_features,
                        'feature_names': quant_features + dv.get_feature_names(),
                        'feature_importances': reg.feature_importances_,
                        'score': model_score})
model_meta.to_csv(meta_path, header=False)

best_model_path = os.path.join('models', 'best_model.csv')
if os.path.exists(best_model_path):
    best_model = pd.read_csv(best_model_path, squeeze=True, index_col=0)
else:
    best_model = model_meta

if model_score <= float(best_model.loc['score']):
    
    print('(this is the best model so far)')
    
    model_meta.to_csv(best_model_path, header=False)
    
    dv_path = os.path.join('models', ''.join(['dv_', str(model_id), '.joblib']))
    dump(dv, dv_path)
    
    sc_path = os.path.join('models', ''.join(['sc_', str(model_id), '.joblib']))
    dump(sc, sc_path)
    
    reg_path = os.path.join('models', ''.join(['reg_', str(model_id), '.joblib']))
    dump(reg, reg_path)

print('Testing complete')
    
    #reg.fit(X_train, y_train)
#print('Training complete')

    
fold+=1

In [40]:
months = dict(zip(range(1,13), calendar.month_name[1:13]))

feature_path = os.path.join('..', 'data', 'interim', 'feature_set.csv')
calendar_path = os.path.join('..', 'data', 'raw', 'calendar.csv')

feature_df = pd.read_csv(feature_path, parse_dates=['date'])
calendar_df = pd.read_csv(calendar_path, parse_dates=['date'])

sales_train_path = os.path.join('..', 'data', 'interim', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)

#df['month'] = df.month.map(months)

feature_df = feature_df.merge(calendar_df[['date', 'event_name_1']])

KeyboardInterrupt: 

In [None]:
#add sell_price, mean later
quant_features = ['last_week', 'last_year', 'days', 'ma30']
cat_features = ['weekday', 'store_id', 'event_name_1']

Iterate through cv folds and build one-step-ahead forecasts:

In [19]:
def build_ar_features(df, date, lags):
    ar_features = [df.loc[date - pd.DateOffset(days=lag)] for lag in lags]
    feature_df = pd.concat(ar_features, axis=1)
    feature_df.columns = [''.join(['ar_', lag]) for lag in map(str, lags)]
    return feature_df

def build_ma_features(df, date, periods):
    yesterday = date - pd.DateOffset(days=1)
    ma_features = [df[date - pd.DateOffset(days=period):yesterday].mean()
                   for period in periods]
    feature_df =  pd.concat(ma_features, axis=1)
    feature_df.columns = [''.join(['ma_', period])
                          for period in map(str, periods)]
    return feature_df

In [28]:
yesterday

NameError: name 'yesterday' is not defined

In [23]:
sum(df.date == date)

19608

In [25]:
df_date.index

DatetimeIndex(['2012-06-05', '2012-06-05', '2012-06-05', '2012-06-05',
               '2012-06-05', '2012-06-05', '2012-06-05', '2012-06-05',
               '2012-06-05', '2012-06-05',
               ...
               '2012-06-05', '2012-06-05', '2012-06-05', '2012-06-05',
               '2012-06-05', '2012-06-05', '2012-06-05', '2012-06-05',
               '2012-06-05', '2012-06-05'],
              dtype='datetime64[ns]', name='date', length=19608, freq=None)

In [20]:
%timeit
#testing
for date in calendar_df.date[test_idx[:5]]:
    df_date = df[df.date == date].set_index('date')
    ma_features = build_ma_features(df_date, date, [30])
    ar_features = build_ar_features(df_date, date, [7])

KeyError: 1338249600000000000

In [13]:
#testing
for date in calendar_df.date[test_idx[:5]]:
    ma_features = build_ma_features(df, date, [30])
    ar_features = build_ar_features(df, date, [7])
    last_year = df[date - pd.DateOffset(years=1)].rename('last_year')
    features = pd.concat([ma_features, ar_features, last_year], axis=1)
    features['date'] = date
    #features['mean'] = item_mean
    #features['month'] = months[date.month]
    features['year'] = date.year
    features['days'] = (date - start_date).days
    features['weekday'] = calendar.weekday[calendar.date == date].values[0]
    features['store_id'] = sales_train['store_id']
    features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]


    X_cat = dv.transform(features[cat_features].to_dict('records'))
    X_test = np.c_[features[quant_features], X_cat]
    #X_test = sc.transform(X_test)
    features['forecast'] = reg.predict(X_test)
    forecast_row = pd.pivot(features, columns='date', values='forecast').T
    df = df.append(forecast_row)

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.numeric.Int64Index'> with these indexers [2012-05-06 00:00:00] of <class 'pandas._libs.tslibs.timestamps.Timestamp'>

In [17]:
train.head()

NameError: name 'train' is not defined

In [11]:
model_id = draw_id('../models/model_ids.joblib')

fold=1
for train_idx, test_idx in TimeSeriesSplit(3).split(calendar_df):
    train = df[df.date.isin(calendar_df.date[train_idx])]
    test = df[df.date.isin(calendar_df.date[test_idx])]
    
    print('Finished with train/test split for fold ' + str(fold) + '...')
    
    cat_df, les = le_cat_features(train, cat_features)
    
    X_train = pd.concat([train[quant_features].reset_index(drop=True), cat_df], axis=1)
    
    #LEFT OFF HERE 
    
    #X_train = train[quant_features + cat_features]
    y_train = train.value
    
    lgb_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_features, free_raw_data=False)
    
    print('Finished building lgb_data for fold ' + str(fold) + '...')
    
    params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }
    
    reg = lgb.train(params, lgb_data)
    
    print('Finished training lgb model for fold ' + str(fold) + '...')
    
    #testing
    for date in calendar_df.date[test_idx]:
        ma_features = build_ma_features(df, date, [30])
        ar_features = build_ar_features(df, date, [7])
        last_year = df[date - pd.DateOffset(years=1)].rename('last_year')
        features = pd.concat([ma_features, ar_features, last_year], axis=1)
        features['date'] = date
        #features['mean'] = item_mean
        #features['month'] = months[date.month]
        features['year'] = date.year
        features['days'] = (date - start_date).days
        features['weekday'] = calendar.weekday[calendar.date == date].values[0]
        features['store_id'] = sales_train['store_id']
        features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]


        X_cat = dv.transform(features[cat_features].to_dict('records'))
        X_test = np.c_[features[quant_features], X_cat]
        #X_test = sc.transform(X_test)
        features['forecast'] = reg.predict(X_test)
        forecast_row = pd.pivot(features, columns='date', values='forecast').T
        df = df.append(forecast_row)
    
    
    
    
    
    #Testing
sales_train_path = os.path.join('data', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)
price_path = os.path.join('data', 'sell_prices.csv')
if 'snap' in quant_features or 'sell_price' in quant_features:
    prices = pd.read_csv(price_path)

mask = ['d_' in col for col in sales_train.columns]
df = sales_train.iloc[:, mask].T.set_index(calendar.date[:sum(mask)])
df = df[~df.index.isin(test_dates)]



df_last_year = df.set_index(df.index + pd.DateOffset(years=1))

item_mean = df.mean()
start_date = calendar.date.min()

for date in calendar_df.date[test_idx]:
    ma_features = build_ma_features(df, date, [30])
    ar_features = build_ar_features(df, date, [7])
    year_lag = df[date - pd.DateOffset(years=1)].rename('last_year')
    features = pd.concat([ma_features, ar_features, year_lag], axis=1)
    features['date'] = date
    features['mean'] = item_mean
    features['month'] = months[date.month]
    features['year'] = date.year
    features['days'] = (date - start_date).days
    features['weekday'] = calendar.weekday[calendar.date == date].values[0]
    features['store_id'] = sales_train['store_id']
    features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]
    
    
    if 'sell_price' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        features['sell_price'] = date_prices.sell_price
    if 'snap' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        snap_map = {'CA': calendar[calendar.date == date].snap_CA.values[0],
                    'TX': calendar[calendar.date == date].snap_TX.values[0],
                    'WI': calendar[calendar.date == date].snap_WI.values[0]}
        features['snap'] = sales_train[sales_train.item_id.isin(date_prices.item_id.values)].state_id.map(snap_map)
    X_cat = dv.transform(features[cat_features].to_dict('records'))
    X_test = np.c_[features[quant_features], X_cat]
    #X_test = sc.transform(X_test)
    features['forecast'] = reg.predict(X_test)
    forecast_row = pd.pivot(features, columns='date', values='forecast').T
    df = df.append(forecast_row)

y_pred = pd.melt(df[df.index.isin(test_dates)]).value

model_score = MSE(y_pred, y_test)

print('Finished scoring. Model score: ' + str(model_score))

meta_path = os.path.join('models', ''.join(['model_', str(model_id), '_metadata.csv']))
model_meta = pd.Series({'model_id': model_id,
                        'quant_features': quant_features,
                        'cat_features': cat_features,
                        'feature_names': quant_features + dv.get_feature_names(),
                        'feature_importances': reg.feature_importances_,
                        'score': model_score})
model_meta.to_csv(meta_path, header=False)

best_model_path = os.path.join('models', 'best_model.csv')
if os.path.exists(best_model_path):
    best_model = pd.read_csv(best_model_path, squeeze=True, index_col=0)
else:
    best_model = model_meta

if model_score <= float(best_model.loc['score']):
    
    print('(this is the best model so far)')
    
    model_meta.to_csv(best_model_path, header=False)
    
    dv_path = os.path.join('models', ''.join(['dv_', str(model_id), '.joblib']))
    dump(dv, dv_path)
    
    sc_path = os.path.join('models', ''.join(['sc_', str(model_id), '.joblib']))
    dump(sc, sc_path)
    
    reg_path = os.path.join('models', ''.join(['reg_', str(model_id), '.joblib']))
    dump(reg, reg_path)

print('Testing complete')
    
    #reg.fit(X_train, y_train)
#print('Training complete')

    
fold+=1

KeyboardInterrupt: 

In [8]:
#Testing
sales_train_path = os.path.join('..', 'data', 'interim', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)
price_path = os.path.join('..', 'data', 'sell_prices.csv')
#if 'snap' in quant_features or 'sell_price' in quant_features:
#    prices = pd.read_csv(price_path)

calendar_path = os.path.join('..', 'data', 'raw', 'calendar.csv')
calendar_df = pd.read_csv(calendar_path, parse_dates=['date'])

train_dates = calendar_df.date[:-84]
test_dates = calendar_df.date[-84:-56]

mask = ['d_' in col for col in sales_train.columns]
df = sales_train.iloc[:, mask].T.set_index(calendar_df.date[:sum(mask)])
df = df[~df.index.isin(test_dates)]

In [30]:
df.head()

Unnamed: 0,date,variable,value,last_week,last_year,ma30,store_id,weekday,days
0,2013-07-18,0,1.0,1.0,1.0,0.366667,CA_1,3,901
1,2013-07-19,0,0.0,1.0,1.0,0.366667,CA_1,4,902
2,2013-07-20,0,0.0,1.0,1.0,0.366667,CA_1,5,903
3,2013-07-21,0,0.0,1.0,1.0,0.366667,CA_1,6,904
4,2013-07-22,0,0.0,1.0,1.0,0.366667,CA_1,0,905


In [6]:
periods = [30] #[15, 30, 60, 90]
date = pd.to_datetime('2016-03-27')
ma_features = build_ma_features(df, date, periods)

In [7]:
ma_features

Unnamed: 0,ma_30
0,1.133333
1,0.266667
2,0.500000
3,1.866667
4,1.133333
...,...
30485,0.300000
30486,0.000000
30487,1.200000
30488,1.033333


In [16]:
X_train

NameError: name 'X_train' is not defined

In [None]:

important_holidays = ['SuperBowl', 'Christmas', 'NewYear', 'Thanksgiving',
                      'ValentinesDay', 'Chanukah End', 'IndependenceDay',
                      'NBAFinalsEnd', 'MemorialDay', 'Halloween']
df.loc[~df.event_name_1.isin(important_holidays), 'event_name_1'] = np.nan

test_dates = calendar.date[-84:-56]

quant_features = ['last_week', 'last_year', 'days', 'sell_price', 'ma30', 'mean']
cat_features = ['weekday', 'store_id', 'event_name_1']

y_test = df[df.date.isin(test_dates)].value
y_train = df[~df.date.isin(test_dates)].value
X_train = df[~df.date.isin(test_dates)][quant_features + cat_features]

del df

print('Finished with train/test split')

model_id = random.choice(range(100000))

#Training

#full set
#quant_features = ['last_week', 'last_year', 'days', 'year', 'snap', 'ma7', 'ma14', 'ma30',
#                  'ma60', 'ma90', 'wd_lag_5', 'wd_lag_10', 'wd_lag_20' 'mean']
#cat_features = ['weekday', 'month', 'store_id', 'event_name_1']

#dv = DictVectorizer(sparse=False)

#X_cat = dv.fit_transform(X_train[cat_features].to_dict('records'))
#print('DV of train complete')

#X_train = np.c_[X_train[quant_features], X_cat]
#print('Done making X_train')

#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#print('Done scaling X_train')



les = []
for feature in cat_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature].fillna('None'))
    les.append(le)

train_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_features, free_raw_data=False)
reg = lgb.train(params, train_data)


reg.fit(X_train, y_train)
print('Training complete')

In [138]:
list(reg.feature_name())

['days_7', 'days_28', 'months_1', 'years_1', 'ma_30', 'days', 'weekday']

In [374]:
prices[prices.wm_yr_wk == calendar[calendar.date == test_date].wm_yr_wk.values[0]]

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
211,CA_1,HOBBIES_1_002,11226,3.97
613,CA_1,HOBBIES_1_004,11226,4.34
879,CA_1,HOBBIES_1_005,11226,2.98
1100,CA_1,HOBBIES_1_006,11226,1.00
1307,CA_1,HOBBIES_1_007,11226,7.94
...,...,...,...,...
6839350,WI_3,FOODS_3_820,11226,1.98
6839801,WI_3,FOODS_3_822,11226,3.98
6840083,WI_3,FOODS_3_823,11226,2.98
6840365,WI_3,FOODS_3_824,11226,2.68


In [137]:
list(reg.feature_importance())

[26779, 0, 0, 0, 54435, 49961, 21225]

In [None]:
months = dict(zip(range(1,13), calendar.month_name[1:13]))

feature_path = os.path.join('data', 'feature_set.csv')
calendar_path = os.path.join('data', 'calendar.csv')

df = pd.read_csv(feature_path, parse_dates=['date'])
calendar = pd.read_csv(calendar_path, parse_dates=['date'])

df['month'] = df.month.map(months)

important_holidays = ['SuperBowl', 'Christmas', 'NewYear', 'Thanksgiving',
                      'ValentinesDay', 'Chanukah End', 'IndependenceDay',
                      'NBAFinalsEnd', 'MemorialDay', 'Halloween']
df.loc[~df.event_name_1.isin(important_holidays), 'event_name_1'] = np.nan

test_dates = calendar.date[-84:-56]

quant_features = ['last_week', 'last_year', 'days', 'sell_price', 'ma30', 'mean']
cat_features = ['weekday', 'store_id', 'event_name_1']

y_test = df[df.date.isin(test_dates)].value
y_train = df[~df.date.isin(test_dates)].value
X_train = df[~df.date.isin(test_dates)][quant_features + cat_features]

del df

print('Finished with train/test split')

model_id = random.choice(range(100000))

#Training

#full set
#quant_features = ['last_week', 'last_year', 'days', 'year', 'snap', 'ma7', 'ma14', 'ma30',
#                  'ma60', 'ma90', 'wd_lag_5', 'wd_lag_10', 'wd_lag_20' 'mean']
#cat_features = ['weekday', 'month', 'store_id', 'event_name_1']

#dv = DictVectorizer(sparse=False)

#X_cat = dv.fit_transform(X_train[cat_features].to_dict('records'))
#print('DV of train complete')

#X_train = np.c_[X_train[quant_features], X_cat]
#print('Done making X_train')

#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#print('Done scaling X_train')

params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 1200,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

les = []
for feature in cat_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature].fillna('None'))
    les.append(le)

train_data = lgb.Dataset(X_train, label = y_train, 
                         categorical_feature=cat_features, free_raw_data=False)
reg = lgb.train(params, train_data)


reg.fit(X_train, y_train)
print('Training complete')


#Testing
sales_train_path = os.path.join('data', 'sales_train_nan.csv')
sales_train = pd.read_csv(sales_train_path)
price_path = os.path.join('data', 'sell_prices.csv')
if 'snap' in quant_features or 'sell_price' in quant_features:
    prices = pd.read_csv(price_path)

mask = ['d_' in col for col in sales_train.columns]
df = sales_train.iloc[:, mask].T.set_index(calendar.date[:sum(mask)])
df = df[~df.index.isin(test_dates)]



df_last_year = df.set_index(df.index + pd.DateOffset(years=1))

item_mean = df.mean()
start_date = calendar.date.min()

for date in test_dates:
    ma_features = build_ma_features(df, date, [30])
    ar_features = build_ar_features(df, date, [7])
    year_lag = df[date - pd.DateOffset(years=1)].rename('last_year')
    features = pd.concat([ma_features, ar_features, year_lag], axis=1)
    features['date'] = date
    features['mean'] = item_mean
    features['month'] = months[date.month]
    features['year'] = date.year
    features['days'] = (date - start_date).days
    features['weekday'] = calendar.weekday[calendar.date == date].values[0]
    features['store_id'] = sales_train['store_id']
    features['event_name_1'] = calendar.event_name_1[calendar.date == date].values[0]
    
    
    if 'sell_price' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        features['sell_price'] = date_prices.sell_price
    if 'snap' in quant_features:
        date_prices = prices[prices.wm_yr_wk == calendar[calendar.date == date].wm_yr_wk.values[0]]
        snap_map = {'CA': calendar[calendar.date == date].snap_CA.values[0],
                    'TX': calendar[calendar.date == date].snap_TX.values[0],
                    'WI': calendar[calendar.date == date].snap_WI.values[0]}
        features['snap'] = sales_train[sales_train.item_id.isin(date_prices.item_id.values)].state_id.map(snap_map)
    X_cat = dv.transform(features[cat_features].to_dict('records'))
    X_test = np.c_[features[quant_features], X_cat]
    #X_test = sc.transform(X_test)
    features['forecast'] = reg.predict(X_test)
    forecast_row = pd.pivot(features, columns='date', values='forecast').T
    df = df.append(forecast_row)

y_pred = pd.melt(df[df.index.isin(test_dates)]).value

model_score = MSE(y_pred, y_test)

print('Finished scoring. Model score: ' + str(model_score))

meta_path = os.path.join('models', ''.join(['model_', str(model_id), '_metadata.csv']))
model_meta = pd.Series({'model_id': model_id,
                        'quant_features': quant_features,
                        'cat_features': cat_features,
                        'feature_names': quant_features + dv.get_feature_names(),
                        'feature_importances': reg.feature_importances_,
                        'score': model_score})
model_meta.to_csv(meta_path, header=False)

best_model_path = os.path.join('models', 'best_model.csv')
if os.path.exists(best_model_path):
    best_model = pd.read_csv(best_model_path, squeeze=True, index_col=0)
else:
    best_model = model_meta

if model_score <= float(best_model.loc['score']):
    
    print('(this is the best model so far)')
    
    model_meta.to_csv(best_model_path, header=False)
    
    dv_path = os.path.join('models', ''.join(['dv_', str(model_id), '.joblib']))
    dump(dv, dv_path)
    
    sc_path = os.path.join('models', ''.join(['sc_', str(model_id), '.joblib']))
    dump(sc, sc_path)
    
    reg_path = os.path.join('models', ''.join(['reg_', str(model_id), '.joblib']))
    dump(reg, reg_path)

print('Testing complete')