In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import gc
from sklearn.metrics import mean_squared_error as MSE

## Build training data

In [2]:
df = pd.DataFrame(
    {
        'other_feature_1':[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0], 
        'other_feature_2':[2, 5, 2, 3, 1, 4, 5, 3, 2, 1, 4, 6, 7, 1, 2, 2, 8, 4, 1, 1, 3, 2, 4, 2, 1], 
        'target':[5, 6, 8, 9, 7, 6, 5, 3, 2, 5, 6, 3, 2, 4, 2, 6, 7, 5, 2, 1, 3, 4, 8, 6, 8]
    }
)

In [3]:
df[:5]

Unnamed: 0,other_feature_1,other_feature_2,target
0,0,2,5
1,1,5,6
2,1,2,8
3,0,3,9
4,1,1,7


In [4]:
df['last_sales'] = df['target'].shift(1)

In [5]:
df = df[['last_sales', 'other_feature_1', 'other_feature_2', 'target']]
df[:5]

Unnamed: 0,last_sales,other_feature_1,other_feature_2,target
0,,0,2,5
1,5.0,1,5,6
2,6.0,1,2,8
3,8.0,0,3,9
4,9.0,1,1,7


## Train the models

In [6]:
features = [f for f in df.columns if f not in ['target']]
y = df['target']
categorical_feature = ['other_feature_1']

In [7]:
params = {
    'num_leaves': 2,
    'max_depth': 2,
    'objective': 'regression',
    'learning_rate': 0.1,
    'seed': 2333,
    'metric': 'l2',
    'min_data': 1,
}

In [8]:
batch_size = 12 # how many data are there in a batch to train 1 model
interval = 3 # what is the interval between batches

In [9]:
batch = (len(df) + 1 - batch_size) // interval # calc how many batches will there be
print('%i batches in all'%(batch))

models = []
for i in range(0, batch): # generate data for each batch
    X_train = df[features].iloc[np.r_[range(len(df)-batch_size-(i+1)*interval, len(df)-(i+1)*interval)]] # train features
    y_train = y.iloc[np.r_[range(len(df)-batch_size-(i+1)*interval, len(df)-(i+1)*interval)]] # train target
    X_val = df[features].iloc[np.r_[range(len(df)-batch_size-i*interval, len(df)-i*interval)]] # val features
    y_val = y.iloc[np.r_[range(len(df)-batch_size-i*interval, len(df)-i*interval)]] # val target
    
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_feature)
    lgb_val = lgb.Dataset(X_val, y_val, categorical_feature=categorical_feature, reference=lgb_train)
    
    bst = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    valid_sets=[lgb_train, lgb_val],
                    early_stopping_rounds = 10,
                    verbose_eval = 10)
    models.append(bst)

4 batches in all
Training until validation scores don't improve for 10 rounds.
[10]	training's l2: 2.61076	valid_1's l2: 5.49424
[20]	training's l2: 2.37715	valid_1's l2: 5.32769
Early stopping, best iteration is:
[19]	training's l2: 2.39242	valid_1's l2: 5.30221
Training until validation scores don't improve for 10 rounds.
[10]	training's l2: 2.38909	valid_1's l2: 3.37119
Early stopping, best iteration is:
[2]	training's l2: 2.88295	valid_1's l2: 3.331
Training until validation scores don't improve for 10 rounds.
[10]	training's l2: 2.06362	valid_1's l2: 2.81376
[20]	training's l2: 1.62906	valid_1's l2: 2.47332
[30]	training's l2: 1.33858	valid_1's l2: 2.25893
[40]	training's l2: 1.14673	valid_1's l2: 2.10776
[50]	training's l2: 1.02034	valid_1's l2: 2.0224
[60]	training's l2: 0.934663	valid_1's l2: 1.97775
[70]	training's l2: 0.874806	valid_1's l2: 1.93564
[80]	training's l2: 0.833027	valid_1's l2: 1.89104
[90]	training's l2: 0.803528	valid_1's l2: 1.87926
[100]	training's l2: 0.7827



In [10]:
len(models) # check how many models we got

4

## Check the overall performance on validation

In [11]:
overall_pred = np.zeros([len(df),])
for i in range(len(models)):
    bst = models[i]
    overall_pred = overall_pred + 1.0 / batch * bst.predict(df[features])

In [12]:
overall_pred

array([3.94996843, 4.45015517, 4.69081331, 5.27336021, 5.25299804,
       5.27336021, 4.24650648, 4.33751527, 3.32702944, 3.92960627,
       4.61598863, 4.52497984, 3.16119598, 3.65113291, 3.6055028 ,
       3.94996843, 4.52497984, 5.55183357, 4.03867974, 3.92960627,
       3.94996843, 3.32702944, 3.6055028 , 5.27336021, 4.11350442])

In [13]:
MSE(df['target'], np.round(overall_pred))

4.24

## Test

In [14]:
test_df = pd.DataFrame(
    {
        'other_feature_1':[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0], 
        'other_feature_2':[2, 5, 2, 3, 1, 4, 5, 3, 2, 1, 4, 6, 7, 1, 2, 2, 8, 4, 1, 1, 3, 2, 4, 2, 1], 
        'target':np.nan
    }
) # create some test data

In [15]:
test_df['last_sales'] = test_df['target'].shift(1)
test_df = test_df[['last_sales', 'other_feature_1', 'other_feature_2', 'target']]
test_df['last_sales'].iloc[0] = df['target'].iloc[-1] # the first last_sales in test is the last target in train
test_df[:5]

Unnamed: 0,last_sales,other_feature_1,other_feature_2,target
0,8.0,0,2,
1,,1,5,
2,,1,2,
3,,0,3,
4,,1,1,


In [16]:
for i in range(len(test_df) - 1):
    test_df['target'].iloc[i] = np.round(np.mean([bst.predict(df[features].iloc[i]) for bst in models])) # predict the target
    test_df['last_sales'].iloc[i+1] = test_df['target'].iloc[i] # last_sales in i+1 = target in i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [17]:
test_df

Unnamed: 0,last_sales,other_feature_1,other_feature_2,target
0,8.0,0,2,4.0
1,4.0,1,5,4.0
2,4.0,1,2,5.0
3,5.0,0,3,5.0
4,5.0,1,1,5.0
5,5.0,0,4,5.0
6,5.0,0,5,4.0
7,4.0,0,3,4.0
8,4.0,0,2,3.0
9,3.0,1,1,4.0
