In [1]:
import featuretools as ft
import flaml
import numpy as np
import pandas as pd
from flaml.automl.ml import sklearn_metric_loss_score

In [2]:
df = pd.read_csv("wind-turbine/train.csv")
numerical_columns = df.describe().columns.values
categorical_columns = ["turbine_status", "cloud_level"]

In [3]:
def clean_outliers(frame, feature):
    column_data = frame[feature]
    column_data = column_data[~np.isnan(column_data)]

    mean, std = np.mean(column_data), np.std(column_data)

    lower_bound = mean - std * 3
    upper_bound = mean + std * 3
    frame.loc[((frame[feature] < lower_bound) | (frame[feature] > upper_bound))] = np.nan
    return frame


def clean_data(frame, is_test=False, skip_impute=False):
    for feature in categorical_columns:
        frame[feature] = pd.Series(frame[feature], dtype="category")
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    for f in numerical_columns:
        if is_test and f == "windmill_generated_power(kW/h)":
            pass
        frame.loc[frame[f] == -99.0, f] = np.nan
        frame.loc[frame[f] == 99.0, f] = np.nan
        frame.loc[frame[f] == -999.0, f] = np.nan
        frame.loc[frame[f] == 999.0, f] = np.nan

        frame = clean_outliers(frame, f)

    frame.loc[frame["wind_speed(m/s)"] < 0, "wind_speed(m/s)"] = 0
    frame.loc[frame["wind_speed(m/s)"] > 113, "wind_speed(m/s)"] = 113

    frame.loc[frame["blade_length(m)"] < 0, "blade_length(m)"] = 0
    frame.loc[frame["windmill_height(m)"] < 0, "windmill_height(m)"] = 0
    frame.loc[frame["resistance(ohm)"] < 0, "resistance(ohm)"] = 0

    if skip_impute:
        for f in frame.columns:
            if (is_test and f == "windmill_generated_power(kW/h)") or f == "tracking_id":
                pass
            if f in numerical_columns:
                frame[f].fillna(frame[f].median(), inplace=True)
            else:
                frame[f].fillna(frame[f].mode()[0], inplace=True)
    frame.drop_duplicates(subset="tracking_id", keep="last", inplace=True)
    frame.drop(frame[frame["tracking_id"].isnull()].index, inplace=True)
    return frame

In [4]:
df = clean_data(df)

In [5]:
es = ft.EntitySet(id="wind-turbine")
es = es.add_dataframe(
    dataframe_name="wind-turbine",
    dataframe=df,
    index="tracking_id"
)
feature_matrix, feature_defs = ft.dfs(
    entityset=es, target_dataframe_name="wind-turbine",
    trans_primitives=["day", "year", "month", "weekday"],
    max_depth=1)
feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)

In [6]:
X = feature_matrix_enc.drop(columns=["windmill_generated_power(kW/h)", "generator_temperature(°C)"], axis=1)
y = feature_matrix_enc["windmill_generated_power(kW/h)"]

## AutoML

In [8]:
automl = flaml.AutoML()
automl.fit(X, y, task="regression", time_budget=60)

[flaml.automl.logger: 05-23 11:57:29] {1693} INFO - task = regression
[flaml.automl.logger: 05-23 11:57:29] {1700} INFO - Data split method: uniform
[flaml.automl.logger: 05-23 11:57:29] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 05-23 11:57:29] {1801} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 05-23 11:57:29] {1911} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 05-23 11:57:29] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-23 11:57:29] {2347} INFO - Estimated sufficient time budget=675s. Estimated necessary time budget=5s.
[flaml.automl.logger: 05-23 11:57:29] {2394} INFO -  at 0.3s,	estimator lgbm's best error=0.6249,	best estimator lgbm's best error=0.6249
[flaml.automl.logger: 05-23 11:57:29] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-23 11:57:29] {2394} INFO -  at 0.4s,	estimator lgbm's best error=0.6249,	best estim

In [9]:
y_pred = automl.predict(X)
print(f"r2: {1 - sklearn_metric_loss_score('r2', y_pred, y)}")
print(f"MSE: {sklearn_metric_loss_score('mse', y_pred, y)}")

r2: 0.9878605489721696
MSE: 0.08090827806554425


In [10]:
print(automl.best_config)
print(automl.best_config_per_estimator)
print(automl.time_to_find_best_model)

{'n_estimators': 85, 'max_features': 0.4977274222126191, 'max_leaves': 1083}
{'lgbm': {'n_estimators': 248, 'num_leaves': 4, 'min_child_samples': 3, 'learning_rate': 0.3239297756196601, 'log_max_bin': 9, 'colsample_bytree': 0.7330450035989674, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.02411339190861214}, 'rf': {'n_estimators': 85, 'max_features': 0.4977274222126191, 'max_leaves': 1083}, 'xgboost': {'n_estimators': 43, 'max_leaves': 50, 'min_child_weight': 31.942115732268565, 'learning_rate': 0.09485720113896808, 'subsample': 0.8895588746662894, 'colsample_bylevel': 0.847756342161632, 'colsample_bytree': 0.8136549849411188, 'reg_alpha': 0.019387994312089204, 'reg_lambda': 0.08649036623112866}, 'extra_tree': {'n_estimators': 147, 'max_features': 0.5744573026671392, 'max_leaves': 3932}, 'xgb_limitdepth': {'n_estimators': 10, 'max_depth': 7, 'min_child_weight': 1.4414106781003007, 'learning_rate': 0.36537736318193215, 'subsample': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.94681

## Customizing fit

In [11]:
automl = flaml.AutoML()
custom_hp = {
    "learning_rate": {
        "domain": flaml.tune.loguniform(0.0001, 0.05)
    }
}
automl.fit(X, y, task="regression", time_budget=120,
           metric="mse",
           estimator_list=["lgbm", "xgboost", "rf"],
           custom_hp={
               "lgbm": custom_hp
           },
           hpo_method="bs")


## Zero-Shot AutoML

In [12]:
from flaml.default import LGBMRegressor

zs_model = LGBMRegressor()
zs_model.fit(X, y)

In [13]:
y_pred = zs_model.predict(X)
print(f"r2: {1 - sklearn_metric_loss_score('r2', y_pred, y)}")
print(f"MSE: {sklearn_metric_loss_score('mse', y_pred, y)}")

r2: 0.9999520158503908
MSE: 0.0003198097599648468


## Scikit-learn pipelines

In [28]:
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

set_config(display="diagram")

In [30]:
df = pd.read_csv("wind-turbine/train.csv")
numerical_columns = df.describe().columns.values
categorical_columns = ["turbine_status", "cloud_level"]

In [31]:
df = clean_data(df, skip_impute=True)

In [29]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        es = ft.EntitySet(id="wind-turbine")
        es = es.add_dataframe(
            dataframe_name="wind-turbine",
            dataframe=df,
            index="tracking_id"
        )

        feature_matrix, feature_defs = ft.dfs(
            entityset=es, target_dataframe_name="wind-turbine",
            trans_primitives=["day", "year", "month", "weekday"],
            max_depth=1)
        feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)
        return feature_matrix_enc

In [34]:
automl_pipeline = Pipeline([
    ("custom", CustomTransformer()),
    ("automl", flaml.AutoML())
])
automl_pipeline

In [35]:
X = feature_matrix_enc.drop(columns=["windmill_generated_power(kW/h)", "generator_temperature(°C)"], axis=1)
y = feature_matrix_enc["windmill_generated_power(kW/h)"]

In [36]:
automl_settings = {
    "task": "regression", "time_budget": 5
}
pipeline_settings = {
    f"automl__{key}": value for key, value in automl_settings.items()
}

In [37]:
automl_pipeline.fit(X, y, **pipeline_settings)

[flaml.automl.logger: 05-23 13:30:43] {1693} INFO - task = regression
[flaml.automl.logger: 05-23 13:30:43] {1700} INFO - Data split method: uniform
[flaml.automl.logger: 05-23 13:30:43] {1703} INFO - Evaluation method: holdout
[flaml.automl.logger: 05-23 13:30:43] {1801} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 05-23 13:30:43] {1911} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 05-23 13:30:43] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-23 13:30:43] {2347} INFO - Estimated sufficient time budget=466s. Estimated necessary time budget=3s.
[flaml.automl.logger: 05-23 13:30:43] {2394} INFO -  at 0.3s,	estimator lgbm's best error=0.4744,	best estimator lgbm's best error=0.4744
[flaml.automl.logger: 05-23 13:30:43] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-23 13:30:43] {2394} INFO -  at 0.3s,	estimator lgbm's best error=0.4744,	best estim

In [39]:
automl = automl_pipeline.steps[1][1]
# Get the best config and best learner
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1 - automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 99, 'num_leaves': 4, 'min_child_samples': 3, 'learning_rate': 1.0, 'log_max_bin': 10, 'colsample_bytree': 0.7656467580403348, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.006290083866428512}
Best accuracy on validation data: 0.9998
Training duration of best run: 0.1274 s
