## Imports

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Raise pandas' displayed-column limit to 999 so wide frames are shown in full.
pd.options.display.max_columns = 999

## Load Data

In [None]:
# Training table; its columns are trimmed and the rows split in the
# preprocessing cell below.
train_data = pd.read_csv('../data/demo/train_data.csv')
# Weights table keyed by 'WijkenEnBuurten'; it is filtered against
# buurtinfo2016 and passed to pipeline() further down.
weights = pd.read_csv('../data/demo/buurt_weights.csv')

## Preprocess

In [None]:
# Remove the columns that are not model inputs. The labels are collected
# before any column is dropped, so the positional "first column" refers to
# the frame exactly as it was loaded.
labels_to_remove = (
    "WijkenEnBuurten",
    "SoortRegio",
    train_data.columns[0],
    "attributes.TELJAAR",
)
trimmed = train_data
for label in labels_to_remove:
    trimmed = trimmed.drop(columns=label)
train_data = trimmed

# Random split: each row lands in the training set with probability 0.9.
# No seed is set in this cell, so the split differs from run to run.
msk = np.random.rand(len(train_data)) < 0.9
train_set, test_set = train_data[msk], train_data[~msk]

# 'attributes.AANTAL_SUMMED' is the regression target; 'attributes.AANTAL'
# is removed from the features as well.
target_column = 'attributes.AANTAL_SUMMED'
excluded_column = 'attributes.AANTAL'

train_features = train_set.drop(columns=target_column).drop(columns=excluded_column)
test_features = test_set.drop(columns=target_column).drop(columns=excluded_column)

y_train = train_set[target_column].values
X_train = train_features.values
y_test = test_set[target_column].values
X_test = test_features.values

# The evaluation set references the training set so LightGBM reuses its binning.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

## Train

In [None]:
# Hyper-parameters for the gradient-boosted regression model.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the order of the reported metrics stable
    # between runs.
    'metric': ['l2', 'rmse'],
    'num_leaves': 127,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Start training...')
# train
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of `lgb.train` was removed in LightGBM 4.0,
# while the callback is accepted by both older and newer releases. Training
# stops once the validation metrics have not improved for 10 rounds, and
# `gbm.best_iteration` then holds the best round.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

## Predict

In [None]:
print('Start predicting...')
# Score the held-out rows using the trees up to the best early-stopping round.
best_round = gbm.best_iteration
y_pred = gbm.predict(X_test, num_iteration=best_round)
for values in (y_pred, y_test):
    print(values)

# Constant baseline: predict the training-set mean for every held-out row.
train_mean = np.mean(y_train)
baseline = [train_mean] * len(y_test)

# Report the root-mean-squared error of the model and of the baseline.
model_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', model_rmse)
baseline_rmse = mean_squared_error(baseline, y_test) ** 0.5
print('The rmse of baseline is:', baseline_rmse)

## Validate

In [None]:
# Pair each importance score with the name of the feature it belongs to and
# print them from most to least important.
# The model was trained on every column left after removing the two
# 'attributes.AANTAL*' columns, so the name list must cover all of them.
# Slicing it with [1:] (as was done before) shifted every name by one and
# silently dropped the last feature.
feature_names = train_set.drop('attributes.AANTAL_SUMMED', axis=1).drop('attributes.AANTAL', axis=1).columns
zipped = zip(gbm.feature_importance(), feature_names)
zipped = sorted(zipped, key=lambda t: t[0], reverse=True)
for importance, column in zipped:
    print("Feature {} has an importance weight of {}".format(column,importance))

## Future predictions pipeline

In [None]:
# Inputs for the projection: the 2016 feature table plus the 2016 true and
# predicted labels (both read without a header row, first column as index).
buurtinfo2016 = pd.read_csv('../data/demo/buurtinfo2016.csv')
true_labels = pd.read_csv('../data/demo/true_labels.csv', header=None, index_col=0)
predictions = pd.read_csv('../data/demo/pred_labels.csv', header=None, index_col=0)

# Keep only the weight rows whose 'WijkenEnBuurten' value occurs in the 2016 table.
buurten_2016 = buurtinfo2016['WijkenEnBuurten'].values
is_in_2016 = weights['WijkenEnBuurten'].isin(buurten_2016)
weights = weights[is_in_2016]
# NOTE(review): the index is copied positionally, which assumes the filtered
# weights have the same number of rows, in the same order, as buurtinfo2016
# -- confirm against the data.
weights.index = buurtinfo2016.index

In [None]:
def fix_negatives(y_pred, true_2016):
    """Floor every prediction at the matching 2016 value.

    Despite the name, this does not test for negative numbers: a prediction
    is replaced whenever it is smaller than the first element of the
    corresponding ``true_2016`` row.

    Args:
        y_pred: Sequence of predicted values.
        true_2016: Indexable of rows; ``true_2016[i][0]`` is the lower bound
            for ``y_pred[i]``.

    Returns:
        A new list, the same length as ``y_pred``, holding for each position
        the prediction or the 2016 value, whichever is larger.
    """
    return [
        true_2016[idx][0] if predicted < true_2016[idx][0] else predicted
        for idx, predicted in enumerate(y_pred)
    ]

In [None]:
def pipeline(weights, buurtinfo2016, true_2016, pred_2016, n_years=13):
    """Predict values for the years after 2016 and write one CSV per year.

    For every year offset ``i`` in ``1..n_years`` the weights are scaled by
    ``i`` and added to the 2016 feature table, the module-level ``gbm`` model
    predicts from the result, and the 2016 residual (true minus predicted) is
    added back. Predictions are then clipped at zero and floored at the 2016
    true value via ``fix_negatives`` before being written to
    ``../data/demo/predictions_<year>.csv``.

    Args:
        weights: DataFrame of per-row yearly increments; must contain a
            'WijkenEnBuurten' column and share its index with
            ``buurtinfo2016``.
        buurtinfo2016: DataFrame with the 2016 features. All columns from the
            third one onward are fed to the model.
        true_2016: DataFrame whose first column holds the 2016 true values.
        pred_2016: DataFrame whose first column holds the 2016 predictions.
        n_years: Number of years after 2016 to predict. Defaults to 13
            (2017 through 2029), the horizon that used to be hard-coded.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # The 2016 residual and the 2016 floor do not depend on the year, so they
    # are computed once instead of on every iteration.
    true_values = true_2016.values
    diff = (true_values - pred_2016.values)[:, 0]

    for i in range(1, n_years + 1):
        # Multiplying already yields a new frame, so no explicit copy is needed.
        weights_only = weights * i
        # Zero the identifier column so the addition below does not alter it.
        weights_only['WijkenEnBuurten'] = 0
        buurtinfo2016_multiplied = buurtinfo2016.add(weights_only, fill_value=0)

        feature_columns = buurtinfo2016_multiplied.columns[2:]
        y_pred = gbm.predict(buurtinfo2016_multiplied[feature_columns].values)

        # Correct by the error the model made on 2016.
        y_pred = y_pred + diff
        df_predicted = buurtinfo2016_multiplied[['WijkenEnBuurten']].copy()
        df_predicted['year'] = 2016 + i
        y_pred[y_pred < 0] = 0
        y_pred = fix_negatives(y_pred, true_values)
        df_predicted['predicted_nr_panels'] = y_pred
        df_predicted.to_csv('../data/demo/predictions_{}.csv'.format(2016 + i))
        print("Predicted {} and wrote to file".format(2016+i))

In [None]:
# Run the projection with the filtered weights and the 2016 inputs loaded
# above; this writes one predictions_<year>.csv file per predicted year.
pipeline(weights, buurtinfo2016, true_labels, predictions)