## Gradient Boosting
Notebook that trains an XGBoost regressor on image features. The best preforming ones are Histogram + fourier transform

In [1]:
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import pickle

In [2]:
scored_features = pd.read_csv("../../data/scored_features_psd1d_windowed5.csv", index_col="Id")

query_features = pd.read_csv("../../data/query_features_psd1d_windowed5.csv", index_col="Id")

X = scored_features.drop(columns=['Actual'])
y = pd.DataFrame(scored_features['Actual'])

X_test = query_features.drop(columns=['Actual'])
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [3]:
X

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,742,743,744,745,746,747,748,749,750,751
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002900,998221,847,280,160,83,61,52,25,21,9,...,4.626796e+07,4.642812e+07,4.593790e+07,4.585595e+07,4.578204e+07,4.603637e+07,4.560498e+07,4.559972e+07,4.549583e+07,4.549273e+07
1003620,997074,1457,438,220,134,87,68,57,32,29,...,3.623885e+07,3.612533e+07,3.608879e+07,3.599915e+07,3.579921e+07,3.587868e+07,3.583183e+07,3.580298e+07,3.604408e+07,3.617877e+07
1005208,3,997571,1318,360,161,99,60,42,33,25,...,1.144406e+08,1.142740e+08,1.120636e+08,1.111863e+08,1.102576e+08,1.104660e+08,1.090938e+08,1.084936e+08,1.069432e+08,1.065354e+08
1005849,1,28,997343,1189,443,242,143,99,78,58,...,1.659868e+08,1.680895e+08,1.669631e+08,1.676491e+08,1.684652e+08,1.703079e+08,1.699725e+08,1.703015e+08,1.701955e+08,1.707331e+08
1006237,996531,2199,480,212,114,79,50,34,16,16,...,3.151079e+06,3.166774e+06,3.152987e+06,3.167793e+06,3.173603e+06,3.188755e+06,3.181852e+06,3.182310e+06,3.152251e+06,3.151417e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997019,995952,1952,586,303,187,145,96,67,64,51,...,2.051379e+06,2.041626e+06,2.016874e+06,1.996301e+06,1.975905e+06,1.994323e+06,1.993819e+06,1.986311e+06,2.009180e+06,2.004598e+06
9997923,997741,1228,346,157,88,78,45,33,34,25,...,1.150899e+08,1.154802e+08,1.141978e+08,1.144072e+08,1.143112e+08,1.150610e+08,1.145714e+08,1.144504e+08,1.134603e+08,1.136776e+08
9998327,1,4,996439,2257,391,202,107,71,63,39,...,3.963732e+08,3.980647e+08,3.927951e+08,3.943544e+08,3.929636e+08,3.954161e+08,3.943769e+08,3.951336e+08,3.914972e+08,3.949623e+08
9998462,998428,816,227,118,57,42,26,27,16,21,...,2.059148e+08,2.083334e+08,2.071618e+08,2.087574e+08,2.100891e+08,2.131048e+08,2.137442e+08,2.155414e+08,2.150978e+08,2.168884e+08


In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.85)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [6]:
hyperparams = {
    "clf__n_estimators": [200, 300, 400],
    "clf__max_depth": [3, 6, 9],
    "clf__colsample_bytree": [0.7],
    "clf__reg_lambda": [2],
    "clf__reg_alpha" : [2],
    "clf__learning_rate": [0.1],
    "clf__objective": ["reg:squarederror"]
}

fast_hyperparams = {
    "clf__n_estimators": [300],
    "clf__max_depth": [9],
    "clf__colsample_bytree": [0.7],
    "clf__reg_lambda": [2],
    "clf__reg_alpha" : [2],
    "clf__learning_rate": [0.1],
    "clf__objective": ["reg:squarederror"]
}

pipeline = Pipeline([('transformer', StandardScaler()),
                         ('clf', XGBRegressor(n_jobs=2))])

search = GridSearchCV(pipeline, fast_hyperparams, cv=5, verbose=10, n_jobs=4, refit=True, scoring=scorer)
search.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.6min
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:  1.1min remaining:   43.7s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


KeyboardInterrupt: 

In [24]:
search.__dict__

{'scoring': make_scorer(mean_absolute_error, greater_is_better=False),
 'estimator': Pipeline(memory=None,
          steps=[('transformer',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('clf',
                  XGBRegressor(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0,
                               importance_type='gain', learning_rate=0.1,
                               max_delta_step=0, max_depth=3, min_child_weight=1,
                               missing=None, n_estimators=100, n_jobs=2,
                               nthread=None, objective='reg:linear',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, seed=None, silent=None,
                               subsample=1, verbosity=1))],
          verbose=False),
 'n_jobs': 4,
 'iid': 'depre

In [236]:
test_predictions = search.predict(X_test)
output = pd.DataFrame(columns=["Predicted"])
output["Id"] = X_test.index
output["Predicted"] = test_predictions
output.set_index("Id", inplace=True)
output.to_csv("res.csv")

In [25]:
with open("random_forest.pkl", "wb") as f:
    pickle.dump(search.best_estimator_, f)