In [3]:
from pprint import pprint

import sklearn.datasets
import sklearn.metrics

import autosklearn.regression

In [14]:
def unique_cols(df):
    """Flag the columns of ``df`` that hold a single repeated value.

    Every row is compared element-wise against the first row; a column is
    flagged when all of its entries match the first row's entry.

    Returns a boolean array with one entry per column.
    """
    values = df.to_numpy()  # df.values (pandas<0.24)
    first_row = values[0]
    matches_first = first_row == values
    return matches_first.all(axis=0)


def create_supervised_dataset(df, target, feats, n_in=1, n_out=1):
    """Turn a time-indexed frame into a lagged supervised-learning matrix.

    Columns are laid out as: the ``feats`` columns shifted by ``n_in`` rows,
    then by ``n_in - 1`` rows, ... down to 1 row, followed by the ``target``
    column at the current row and at the next ``n_out - 1`` rows.

    Args:
        df: source DataFrame.
        target: label of the column to predict.
        feats: list of column labels used as lagged inputs.
        n_in: number of past steps to include as inputs.
        n_out: number of target steps (current row onward) to include.

    Returns:
        A 2-D NumPy array; rows containing any NaN (introduced by the
        shifting, or already present in the data) are dropped.
    """
    n_vars = len(feats)

    # Input sequence (t-n_in, ..., t-1), oldest lag first.
    lags = range(n_in, 0, -1)
    frames = [df[feats].shift(lag) for lag in lags]
    labels = ['var%d(t-%d)' % (idx + 1, lag) for lag in lags for idx in range(n_vars)]

    # Forecast sequence (t, t+1, ..., t+n_out-1) of the single target column.
    for step in range(n_out):
        frames.append(df[target].shift(-step))
        labels.append('var1(t)' if step == 0 else 'var1(t+%d)' % step)

    # Stitch everything side by side and discard incomplete rows.
    supervised = pd.concat(frames, axis=1)
    supervised.columns = labels
    supervised = supervised.dropna()
    return supervised.values

In [15]:
from settings import INPUT_FILE
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the input CSV with its 'ts' column as the index, then parse that
# index into datetimes.
df = pd.read_csv(INPUT_FILE, index_col='ts')
# (disabled) df = df.drop('Unnamed: 0', 1)
df.index = pd.to_datetime(df.index)

# Keep only the columns that unique_cols does NOT flag, i.e. drop the ones
# whose rows all equal the first row.
df_2 = df.loc[:, ~unique_cols(df)]

# (disabled) Average window:
# df_2 = df.groupby(np.arange(len(df))//60).mean()

# Rescale the remaining columns with MinMaxScaler and wrap the resulting
# array back into a DataFrame carrying the original index and column labels.
scaler = MinMaxScaler()
d = scaler.fit_transform(df_2)
scaled_df = pd.DataFrame(data=d, index=df_2.index, columns=df_2.columns)

In [4]:
from settings import CORR_GROUP

for k in CORR_GROUP:
    # Build the lagged sample matrix: 15 past steps of the group's features
    # as inputs, the target column k at the current step as the last column.
    values = create_supervised_dataset(scaled_df, k, CORR_GROUP[k], n_in=15, n_out=1)
    len_values = values.shape[0]

    # Ordered (unshuffled) split: the first 70% of the rows are used for
    # training and the remaining ~30% are held out.
    n_train_seconds = int(0.7 * len_values)
    n_cv_seconds = len_values
    train = values[:n_train_seconds, :]
    cv = values[n_train_seconds:n_cv_seconds, :]

    # Split into inputs (all but the last column) and output (last column,
    # kept 2-D via the -1: slice).
    train_X, train_y = train[:, :-1], train[:, -1:]
    test_X, test_y = cv[:, :-1], cv[:, -1:]

    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        tmp_folder='./tmp/autosklearn_regression_example_tmp',
    )
    # Label the run after the target being modelled rather than the
    # unrelated 'diabetes' name left over from the library example.
    automl.fit(train_X, train_y, dataset_name=str(k))

    # Only the first group is fitted; `automl` is reassigned on every
    # iteration, so later cells would only ever see one model anyway.
    # NOTE(review): presumably a debugging shortcut — confirm before removing.
    break



In [5]:
# Configure an auto-sklearn regressor with a 120 s overall budget and a 30 s
# cap per candidate run, then fit it.
# NOTE(review): X_train / y_train are not assigned in any cell visible in
# this notebook (the loop cell names its split train_X / train_y) — confirm
# where they come from before relying on a fresh-kernel run.
automl = autosklearn.regression.AutoSklearnRegressor(
    tmp_folder='./tmp/autosklearn_regression_example_tmp',
    per_run_time_limit=30,
    time_left_for_this_task=120,
)
automl.fit(X_train, y_train, dataset_name='diabetes')

  self.metafeatures = self.metafeatures.append(metafeatures)
  self.algorithm_runs[metric].append(runs)


AutoSklearnRegressor(per_run_time_limit=30, time_left_for_this_task=120,
                     tmp_folder='./tmp/autosklearn_regression_example_tmp')

In [6]:
# Print the leaderboard table returned by the fitted automl object as plain text.
print(automl.leaderboard())


          rank  ensemble_weight               type      cost   duration
model_id                                                               
25           1             0.46                sgd  0.436679   0.740504
6            2             0.32     ard_regression  0.455042   0.839090
27           3             0.14     ard_regression  0.462249   0.732829
11           4             0.02      random_forest  0.507400  11.670796
7            5             0.06  gradient_boosting  0.518673   1.535592


In [7]:
# Pretty-print the mapping returned by show_models() with 4-space indentation.
pprint(automl.show_models(), indent=4)

{   6: {   'cost': 0.4550418898836528,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fdb3f7e7310>,
           'ensemble_weight': 0.32,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fdb3fa7e970>,
           'model_id': 6,
           'rank': 2,
           'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7fdb3fa7ecd0>,
           'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788, alpha_2=2.2118001735899097e-07,
              copy_X=False, lambda_1=1.2037591637980971e-06,
              lambda_2=4.358378124977852e-09,
              threshold_lambda=1136.5286041327277, tol=0.021944240404849075)},
    7: {   'cost': 0.5186726734789994,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fdb3f7eb1c0>,
           'ensemble_

In [13]:
# Report the root-mean-squared error on both splits. mean_squared_error with
# squared=False returns RMSE, not R2, so the labels say RMSE.
train_predictions = automl.predict(X_train)
print("Train RMSE:", sklearn.metrics.mean_squared_error(y_train, train_predictions, squared=False))
test_predictions = automl.predict(X_test)
print("Test RMSE:", sklearn.metrics.mean_squared_error(y_test, test_predictions, squared=False))

Train R2 score: 49.97336721874017
Test R2 score: 56.15867945482282
