In [1]:
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [10]:
# Load the 400-bin feature tables; the "Id" column becomes the row index.
scored_features = pd.read_csv("../data/scored_features_400_bins.csv", index_col="Id")
query_features = pd.read_csv("../data/query_features_400_bins.csv", index_col="Id")

# Training matrix: every scored column except the target.
X = scored_features.drop(columns=['Actual'])
# Target is kept as a single-column DataFrame (not a Series).
y = pd.DataFrame(scored_features['Actual'])

# Query matrix. errors='ignore' keeps this working whether or not the query
# file carries an 'Actual' column; without it a missing column raises KeyError.
X_test = query_features.drop(columns=['Actual'], errors='ignore')

# Optional (currently disabled) histogram down-sampling: merge the bins into
# `num_buckets` groups. num_buckets should divide 256. No live statement in
# this cell reads num_buckets.
num_buckets = 32
# X = X.groupby(np.tile(np.arange(num_buckets), 256//num_buckets), axis=1).sum()
# X_test = X_test.groupby(np.tile(np.arange(num_buckets), 256//num_buckets), axis=1).sum()

# MAE wrapped as a scorer. greater_is_better=False makes sklearn negate the
# value, so reported search scores are negative MAE (closer to 0 is better).
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [112]:
# Hold out 15% of the scored rows for validation. The fixed random_state makes
# the split identical on every run, so validation numbers are comparable.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.85, random_state=42)

In [114]:
# Standardize features for models that are fit outside the sklearn Pipeline.
# The scaler is fit on the training split only, then applied to the validation
# and query matrices.
#
# Results are stored under new *_scaled names instead of overwriting
# X_train / X_val / X_test:
#   * the GridSearchCV pipeline applies its own StandardScaler, so passing it
#     an already-scaled X_test would scale the data twice;
#   * scaler.transform returns a NumPy array, which has no .index, while the
#     submission cell reads X_test.index;
#   * leaving the inputs untouched makes this cell safe to re-run.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Settings that are identical in both grids; only the number of trees and the
# tree depth are actually searched.
_fixed_xgb_params = {
    "clf__colsample_bytree": [0.7],
    "clf__reg_lambda": [2],
    "clf__reg_alpha": [2],
    "clf__learning_rate": [0.1],
    "clf__objective": ["reg:squarederror"],
}

# Full grid: 3 tree counts x 3 depths.
hyperparams = {
    "clf__n_estimators": [200, 300, 400],
    "clf__max_depth": [3, 6, 9],
    **_fixed_xgb_params,
}

# Single-candidate grid for a quick run.
fast_hyperparams = {
    "clf__n_estimators": [300],
    "clf__max_depth": [9],
    **_fixed_xgb_params,
}

# The "clf__" prefix in the grids above addresses the 'clf' step defined here.
pipeline = Pipeline(steps=[
    ('transformer', StandardScaler()),
    ('clf', XGBRegressor(n_jobs=2)),
])

# 5-fold search over the fast grid, scored with `scorer`; refit=True retrains
# the best candidate on all of X so `search` can be used for prediction.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=fast_hyperparams,
    scoring=scorer,
    cv=5,
    n_jobs=4,
    refit=True,
    verbose=10,
)
search.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:  1.8min remaining:  2.7min
[Parallel(n_jobs=4)]: Done   3 out of   5 | elapsed:  1.8min remaining:  1.2min
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  2.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('transformer',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        XGBRegressor(base_score=0.5,
                                                     booster='gbtree',
                                                     colsample_bylevel=1,
                                                     colsample_bynode=1,
                                                     colsample_bytree=1,
                                                     gamma=0,
                                                     importance_type='gain',
                                                     learning_rate=0.1,
                                     

In [12]:
# Show every attribute stored on the fitted search object.
vars(search)

{'scoring': make_scorer(mean_absolute_error, greater_is_better=False),
 'estimator': Pipeline(memory=None,
          steps=[('transformer',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('clf',
                  XGBRegressor(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0,
                               importance_type='gain', learning_rate=0.1,
                               max_delta_step=0, max_depth=3, min_child_weight=1,
                               missing=None, n_estimators=100, n_jobs=2,
                               nthread=None, objective='reg:linear',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, seed=None, silent=None,
                               subsample=1, verbosity=1))],
          verbose=False),
 'n_jobs': 4,
 'iid': 'depre

In [236]:
# Build the submission file from the raw query features.
#
# The query matrix is re-derived from `query_features` rather than read from
# X_test: the scaling cell may have replaced X_test with a standardized NumPy
# array, which has no .index and would be scaled a second time by the
# StandardScaler inside the fitted pipeline.
X_query = query_features.drop(columns=['Actual'], errors='ignore')
test_predictions = search.predict(X_query)

# One "Predicted" column, indexed by the query Ids.
output = pd.DataFrame(
    {"Predicted": test_predictions},
    index=pd.Index(X_query.index, name="Id"),
)
output.to_csv("res.csv")