## Imports

In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Let pandas render up to 999 columns so wide frames are not elided in cell output.
pd.set_option('display.max_columns', 999)

## Load Data

In [27]:
# Read the two demo CSVs: the training table and the per-neighbourhood weights
# (the weights frame is consumed by the future-predictions cells further down).
train_data, weights = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/demo/train_data.csv', '../data/demo/buurt_weights.csv')
)

## Preprocess

In [29]:
# Remove the columns that are not used for modelling: the neighbourhood
# identifier, the region type, the count year and the first column of the CSV
# (presumably a saved index column -- TODO confirm).
train_data = train_data.drop(
    ["WijkenEnBuurten", "SoortRegio", train_data.columns[0], "attributes.TELJAAR"],
    axis=1,
)

# Random ~90/10 row split. A dedicated, seeded generator keeps the split (and
# therefore every metric reported below) identical across Restart & Run All.
SPLIT_SEED = 42
msk = np.random.RandomState(SPLIT_SEED).rand(len(train_data)) < 0.9
train_set = train_data[msk]
test_set = train_data[~msk]

# 'attributes.AANTAL_SUMMED' is the regression target; 'attributes.AANTAL' is
# excluded from the feature matrix together with it.
NON_FEATURE_COLUMNS = ['attributes.AANTAL_SUMMED', 'attributes.AANTAL']

y_train = train_set['attributes.AANTAL_SUMMED'].values
X_train = train_set.drop(NON_FEATURE_COLUMNS, axis=1).values
y_test = test_set['attributes.AANTAL_SUMMED'].values
X_test = test_set.drop(NON_FEATURE_COLUMNS, axis=1).values

# The hold-out rows double as the validation set used for early stopping.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

## Train

In [30]:
# LightGBM configuration for a gradient-boosted regression model.
# NOTE(review): `metric` is a Python set, so the order in which the two metrics
# are iterated (and logged) is not guaranteed.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l2', 'rmse'},
    num_leaves=127,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

print('Start training...')
# Fit for at most 500 boosting rounds, evaluating on the hold-out set and
# stopping once its scores have not improved for 10 rounds.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=lgb_eval,
    early_stopping_rounds=10,
)

Start training...
[1]	valid_0's rmse: 197.019	valid_0's l2: 38816.7
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's rmse: 192.147	valid_0's l2: 36920.6
[3]	valid_0's rmse: 186.738	valid_0's l2: 34871
[4]	valid_0's rmse: 182.389	valid_0's l2: 33265.6
[5]	valid_0's rmse: 176.8	valid_0's l2: 31258.1
[6]	valid_0's rmse: 171.366	valid_0's l2: 29366.4
[7]	valid_0's rmse: 166.797	valid_0's l2: 27821.3
[8]	valid_0's rmse: 160.377	valid_0's l2: 25720.9
[9]	valid_0's rmse: 156.548	valid_0's l2: 24507.2
[10]	valid_0's rmse: 152.467	valid_0's l2: 23246.1
[11]	valid_0's rmse: 151.808	valid_0's l2: 23045.5
[12]	valid_0's rmse: 151.164	valid_0's l2: 22850.5
[13]	valid_0's rmse: 150.733	valid_0's l2: 22720.3
[14]	valid_0's rmse: 147.71	valid_0's l2: 21818.4
[15]	valid_0's rmse: 147.465	valid_0's l2: 21745.8
[16]	valid_0's rmse: 145.328	valid_0's l2: 21120.3
[17]	valid_0's rmse: 142.889	valid_0's l2: 20417.4
[18]	valid_0's rmse: 141.209	valid_0's l2: 19940.1
[19]	valid_0's r

## Predict

In [31]:
print('Start predicting...')
# Score the hold-out rows using the booster's best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print(y_pred)
print(y_test)

# Naive reference model: predict the training-set mean for every hold-out row.
baseline = [np.mean(y_train)] * len(y_test)

# Root of the mean squared error for the model, then for the naive baseline.
rmse_model = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse_model)
rmse_baseline = mean_squared_error(baseline, y_test) ** 0.5
print('The rmse of baseline is:', rmse_baseline)

Start predicting...
[ 160.46000968   90.9784461   458.32850491  221.24756678  213.6460343
  438.13907425   26.85982057  287.6866179   204.09359427  491.97877468
  128.15078647  131.60888832   40.67174245  -16.31587309  423.15508441
   56.8447592    94.29284608   -4.57766604  422.72380683   91.35714189
   -1.41634664  292.92853837  414.45237911  183.52680257]
[  48.    0.  608.   77.  312.  379.    0.  426.    0.  669.  169.   47.
   47.   27.  521.  116.  127.    0.  157.   17.    0.  262.  350.  105.]
The rmse of prediction is: 108.488003085
The rmse of baseline is: 202.519381167


## Validate

In [None]:
# Feature names must come from exactly the same column selection that produced
# X_train (all columns except the two 'attributes.AANTAL*' ones). Skipping the
# first name would pair every importance with the *next* feature's name and
# silently drop the last importance.
feature_names = train_set.drop(
    ['attributes.AANTAL_SUMMED', 'attributes.AANTAL'], axis=1
).columns
importances = gbm.feature_importance()
assert len(importances) == len(feature_names), (
    "feature_importance() returned {} values for {} feature columns".format(
        len(importances), len(feature_names)
    )
)

# Most important feature first.
zipped = sorted(zip(importances, feature_names), key=lambda t: t[0], reverse=True)
for importance, column in zipped:
    print("Feature {} has an importance weight of {}".format(column, importance))

## Future predictions pipeline

In [115]:
# 2016 neighbourhood table plus the true and predicted 2016 labels. The two
# label files have no header row and use their first column as the index.
buurtinfo2016 = pd.read_csv('../data/demo/buurtinfo2016.csv')
label_read_options = dict(index_col=0, header=None)
true_labels = pd.read_csv('../data/demo/true_labels.csv', **label_read_options)
predictions = pd.read_csv('../data/demo/pred_labels.csv', **label_read_options)

# Keep only the weight rows whose neighbourhood code occurs in the 2016 table,
# then give them the 2016 table's index so the two frames align row-by-row.
# NOTE(review): this assumes both frames end up with the same number of rows in
# the same neighbourhood order -- confirm against the data.
present_in_2016 = weights['WijkenEnBuurten'].isin(buurtinfo2016['WijkenEnBuurten'].values)
weights = weights[present_in_2016]
weights.index = buurtinfo2016.index

In [116]:
def fix_negatives(y_pred, true_2016):
    """Floor every prediction at the matching 2016 value.

    Despite the name, this is not limited to negative numbers: any prediction
    that is smaller than the first element of the corresponding ``true_2016``
    row is replaced by that element; all other predictions pass through.

    Parameters
    ----------
    y_pred : iterable of numbers
        Predictions, in row order.
    true_2016 : indexable of indexables
        ``true_2016[i][0]`` is the lower bound applied to the i-th prediction.

    Returns
    -------
    list
        One value per prediction, in the same order.
    """
    return [
        true_2016[row][0] if predicted < true_2016[row][0] else predicted
        for row, predicted in enumerate(y_pred)
    ]

In [119]:
def pipeline(weights, buurtinfo2016, true_2016, pred_2016, model=None, n_years=13):
    """Forecast panel counts for the years after 2016 and write one CSV per year.

    For each year offset ``i`` in ``1..n_years``:

    1. ``weights`` is multiplied by ``i`` and added to ``buurtinfo2016``
       (missing values on either side count as 0).
    2. The model predicts on every column of that sum from the third onward.
    3. The 2016 residual (first column of ``true_2016 - pred_2016``) is added.
    4. Negative results are set to 0, then ``fix_negatives`` floors every value
       at the true 2016 value.
    5. The result is written to ``../data/demo/predictions_<year>.csv`` with the
       columns ``WijkenEnBuurten``, ``year`` and ``predicted_nr_panels``.

    Parameters
    ----------
    weights : pandas.DataFrame
        Per-row yearly increments; must contain a ``WijkenEnBuurten`` column and
        share its index with ``buurtinfo2016``.
    buurtinfo2016 : pandas.DataFrame
        2016 feature table; must contain a ``WijkenEnBuurten`` column.
    true_2016, pred_2016 : pandas.DataFrame
        True and predicted 2016 labels; only the first value of each row is used.
    model : object with ``predict``, optional
        Fitted model. Defaults to the notebook-level ``gbm`` booster.
    n_years : int, optional
        Number of years after 2016 to forecast (default 13, i.e. 2017-2029).
    """
    if model is None:
        # Fall back to the booster trained earlier in the notebook.
        model = gbm

    # Loop-invariant: the 2016 residual used to correct every year's forecast,
    # and the true 2016 values used as a lower bound.
    true_values = true_2016.values
    diff = np.array([row[0] for row in true_values - pred_2016.values])

    for i in range(1, n_years + 1):
        year = 2016 + i

        # Multiplication already yields a new frame, so `weights` is untouched.
        weights_only = weights * i
        # Zero the identifier column so the addition leaves the codes in
        # buurtinfo2016 unchanged.
        weights_only['WijkenEnBuurten'] = 0
        buurtinfo2016_multiplied = buurtinfo2016.add(weights_only, fill_value=0)

        # The first two columns are not model inputs.
        feature_columns = buurtinfo2016_multiplied.columns[2:]
        y_pred = model.predict(buurtinfo2016_multiplied[feature_columns].values) + diff
        y_pred[y_pred < 0] = 0
        y_pred = fix_negatives(y_pred, true_values)

        df_predicted = buurtinfo2016_multiplied[['WijkenEnBuurten']].copy()
        df_predicted['year'] = year
        df_predicted['predicted_nr_panels'] = y_pred
        df_predicted.to_csv('../data/demo/predictions_{}.csv'.format(year))
        print("Predicted {} and wrote to file".format(year))

In [120]:
# Generate and write the per-year forecast files from the 2016 inputs.
pipeline(
    weights=weights,
    buurtinfo2016=buurtinfo2016,
    true_2016=true_labels,
    pred_2016=predictions,
)

Predicted 2017 and wrote to file
Predicted 2018 and wrote to file
Predicted 2019 and wrote to file
Predicted 2020 and wrote to file
Predicted 2021 and wrote to file
Predicted 2022 and wrote to file
Predicted 2023 and wrote to file
Predicted 2024 and wrote to file
Predicted 2025 and wrote to file
Predicted 2026 and wrote to file
Predicted 2027 and wrote to file
Predicted 2028 and wrote to file
Predicted 2029 and wrote to file
