In [53]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.metrics.pairwise import cosine_distances


In [54]:
# Training embeddings, listed in the order in which their columns are joined.
embedding_files = (
    'data/wavelet_embedding/X_train.npy',
    'data/fft_embedding/X_train.npy',
    'data/signal_embedding/X_train.npy',
)
wavelet_features, fft_features, signal_features = map(np.load, embedding_files)

# Join the embeddings along axis 1.
# NOTE(review): presumably axis 0 indexes the series and axis 1 the features,
# so every array must have the same number of rows -- confirm.
features = np.concatenate(
    (wavelet_features, fft_features, signal_features),
    axis=1,
)

# Targets come from the 'surface' column of the training label file.
label_table = pd.read_csv('data/y_train.csv')
labels = label_table['surface'].values

In [55]:
# Grid-search a gradient-boosting classifier over the number of trees and the
# number of features considered per split, scoring every candidate by 5-fold
# cross-validated accuracy.
cv = GridSearchCV(
    # When max_features is smaller than the number of columns, each split draws
    # a random subset of features. The estimator is seeded so that the CV
    # scores, the selected parameters and the refit model are reproducible
    # from one run to the next.
    GradientBoostingClassifier(random_state=0),
    {
        'n_estimators': [200, 300, 400, 500, 600],
        'max_features': [2, 3, 4],
    },
    scoring='accuracy',
    cv=5,
    n_jobs=-1,   # evaluate candidates in parallel on every available core
    refit=True,  # retrain the best candidate on the full data -> best_estimator_
)
cv.fit(features, labels)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400, 500, 600], 'max_features': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [57]:
# Display the parameter combination selected by the search object bound to `cv`.
cv.best_params_

{'max_features': 2, 'n_estimators': 300}

In [50]:
# Model refit by the search on the winning parameter combination.
best_model = cv.best_estimator_

# Test embeddings are joined in the same order as the training embeddings
# (wavelet, fft, signal) so the columns line up with what the model was fit on.
test_features = np.concatenate(
    (
        np.load('data/wavelet_embedding/X_test.npy'),
        np.load('data/fft_embedding/X_test.npy'),
        np.load('data/signal_embedding/X_test.npy'),
    ),
    axis=1,
)
predictions = best_model.predict(test_features)

# One predicted surface per row; the positional index is written out as the
# 'series_id' column of the CSV.
submission = pd.DataFrame({'surface': predictions}).rename_axis('series_id')
submission.to_csv('data/ensemble_embedding/y_test.csv')

In [22]:
from pipelines.preprocess import fetch_training
from sklearn.feature_selection import SelectFromModel
from joblib import load

In [28]:
# Restore the saved artifact and use its first element as the already-fitted
# model behind the selector (prefit=True), keeping at most 800 features.
# NOTE(review): joblib.load unpickles the file, so it must only be pointed at
# trusted artifacts; the '.list' file is presumably a list of fitted random
# forests -- confirm.
saved_models = load('models/random_forest/model-0.8193.list')
feature_selector = SelectFromModel(
    saved_models[0],
    max_features=800,
    prefit=True,
)

# Rebinds `features` and `labels` to whatever the project pipeline returns.
features, labels = fetch_training()

In [32]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate

# Cross-validate an XGBoost classifier on the columns kept by `feature_selector`,
# using 5 stratified folds.
# NOTE(review): the estimator asks for 4 threads while cross_validate fans the
# folds out over every core (n_jobs=-1); confirm this does not oversubscribe
# the machine.
booster = XGBClassifier(max_depth=5, n_estimators=200, n_jobs=4)
selected_features = feature_selector.transform(features)
folds = StratifiedKFold(n_splits=5)

cv = cross_validate(
    booster,
    selected_features,
    labels,
    cv=folds,
    n_jobs=-1,
    return_estimator=True,  # keep the model fitted on each fold in the result
)

KeyboardInterrupt: 