In [125]:
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [192]:
# Load the histogram tables for the scored (labelled) and query (unlabelled) sets.
scored_hist = pd.read_csv("../data/scored_histograms.csv", index_col="Id")
query_hist = pd.read_csv("../data/query_histograms.csv", index_col="Id")

# Load the second feature tables and tag every column with "_f" so the names
# cannot collide with the histogram columns when the frames are joined.
# The suffix is derived from each frame's own columns: the previous version
# read them from `scored_hog` / `query_hog`, which are not defined anywhere in
# this notebook and made the cell fail on a fresh kernel.
scored_features = pd.read_csv("../data/scored_features.csv", index_col="Id").add_suffix("_f")
query_features = pd.read_csv("../data/query_features.csv", index_col="Id").add_suffix("_f")

# Training matrix: histogram columns (minus the CSV row counter and the target)
# joined with the extra features on the shared "Id" index.
X = scored_hist.drop(columns=['Unnamed: 0', 'Actual'])
X = X.join(scored_features)
# Target kept as a one-column DataFrame, as before.
y = pd.DataFrame(scored_hist['Actual'])

# Query matrix, assembled the same way as X.
X_test = query_hist.drop(columns=['Unnamed: 0', 'Actual'])
X_test = X_test.join(query_features)

# MAE scorer; greater_is_better=False makes scikit-learn negate the value so
# that "higher is better" still holds during model selection.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [112]:
# Hold out 15% of the scored rows for validation. The split is seeded so the
# same rows land in each part on every run and validation numbers are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.85, random_state=42)

In [114]:
# Standardise the hold-out split with statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# The scaled query matrix gets its own name. X_test must stay an unscaled
# DataFrame: the grid-search pipeline applies its own StandardScaler at
# predict time, and the submission cell reads the ids from X_test.index.
X_test_scaled = scaler.transform(X_test)

In [193]:
# Search space for the XGBoost step. The "clf__" prefix routes each entry to
# the pipeline step named 'clf'. 4 * 2 * 2 * 2 * 2 = 64 candidates in total.
hyperparams = dict(
    clf__n_estimators=[50, 100, 500, 750],
    clf__max_depth=[6, 9],
    clf__colsample_bytree=[0.7, 1],
    clf__reg_lambda=[1, 2],
    clf__reg_alpha=[2, 3],
    clf__learning_rate=[0.1],
    clf__objective=["reg:squarederror"],
)

# Single-candidate grid, kept as a quick alternative to the full search above
# (not used by the search below).
fast_hyperparams = dict(
    clf__n_estimators=[500],
    clf__max_depth=[9],
    clf__colsample_bytree=[1],
    clf__reg_lambda=[1],
    clf__reg_alpha=[2],
    clf__learning_rate=[0.1],
    clf__objective=["reg:squarederror"],
)

# Scaling lives inside the pipeline, so it is re-fitted on the training part
# of every cross-validation fold.
pipeline = Pipeline(
    steps=[
        ('transformer', StandardScaler()),
        ('clf', XGBRegressor()),
    ]
)

# Exhaustive 5-fold search scored with (negated) MAE; refit=True retrains the
# best configuration on all of X, y so `search` can be used for prediction.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparams,
    scoring=scorer,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=10,
)
search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 29

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('transformer',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        XGBRegressor(base_score=0.5,
                                                     booster='gbtree',
                                                     colsample_bylevel=1,
                                                     colsample_bynode=1,
                                                     colsample_bytree=1,
                                                     gamma=0,
                                                     importance_type='gain',
                                                     learning_rate=0.1,
                                     

In [194]:
# Show every instance attribute of the fitted search object for inspection.
vars(search)

{'scoring': make_scorer(mean_absolute_error, greater_is_better=False),
 'estimator': Pipeline(memory=None,
          steps=[('transformer',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('clf',
                  XGBRegressor(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0,
                               importance_type='gain', learning_rate=0.1,
                               max_delta_step=0, max_depth=3, min_child_weight=1,
                               missing=None, n_estimators=100, n_jobs=1,
                               nthread=None, objective='reg:linear',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, seed=None, silent=None,
                               subsample=1, verbosity=1))],
          verbose=False),
 'n_jobs': -1,
 'iid': 'depr

In [195]:
# Predict the query rows with the refitted best pipeline.
test_predictions = search.predict(X_test)

# One row per query id, indexed by "Id", with a single "Predicted" column.
output = pd.DataFrame(
    {"Id": X_test.index, "Predicted": test_predictions}
).set_index("Id")

# Write the submission file next to the notebook.
output.to_csv("res.csv")