In [None]:
# Report which Python interpreter the kernel is running on.
import sys
print(f'Interpreter dir: {sys.executable}')
import os
import warnings
# Silence every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings("ignore")
# When the kernel starts inside a `notebooks` directory, move one level up so
# that relative paths used later (e.g. "data/processed/clean_data.csv") are
# resolved from the parent directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')
    
print(f'Working dir: {os.getcwd()}')
# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2


In [None]:
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
import numpy as np
from fbprophet import Prophet
from sklearn.preprocessing import StandardScaler

In [None]:
from bayes_opt import BayesianOptimization
# Example of Bayesian hyper-parameter optimization, in case you want to use it to search for parameters
def bayes_parameter_opt_lgb(X, y,
                            init_round=15,
                            opt_round=25,
                            n_folds=5,
                            random_seed=6,
                            n_estimators=10000,
                            learning_rate=0.02,
                            output_process=False):
    """Search LightGBM hyper-parameters with Bayesian optimization.

    Every candidate is scored with the highest mean AUC that ``lgb.cv``
    reports for it.

    Args:
        X: Features handed to ``lgb.Dataset``.
        y: Labels handed to ``lgb.Dataset``.
        init_round: Passed to ``maximize`` as ``init_points``.
        opt_round: Passed to ``maximize`` as ``n_iter``.
        n_folds: Number of cross-validation folds.
        random_seed: Seed forwarded to ``lgb.cv``.
        n_estimators: Used as ``num_iterations``.
        learning_rate: Learning rate used for every candidate.
        output_process: When ``True``, ``points_to_csv`` is called on the
            optimizer to write "bayes_opt_result.csv".

    Returns:
        The ``BayesianOptimization`` object after ``maximize`` has run
        (the whole optimizer, not only the best parameter set).
    """
    dataset = lgb.Dataset(data=X, label=y)

    def _clip_fraction(value):
        # Keep a sampled fraction inside [0, 1].
        return max(min(value, 1), 0)

    def lgb_eval(num_leaves, feature_fraction,
                 bagging_fraction, max_depth,
                 lambda_l1, lambda_l2,
                 min_split_gain,
                 min_child_weight):
        """Objective: best cross-validated mean AUC for one candidate."""
        cv_params = {
            'application': 'binary',
            'num_iterations': n_estimators,
            'learning_rate': learning_rate,
            'early_stopping_round': 100,
            'metric': 'binary',
            # The optimizer samples floats; these two must be integers.
            "num_leaves": int(round(num_leaves)),
            'feature_fraction': _clip_fraction(feature_fraction),
            'bagging_fraction': _clip_fraction(bagging_fraction),
            'max_depth': int(round(max_depth)),
            # Regularization terms are floored at zero.
            'lambda_l1': max(lambda_l1, 0),
            'lambda_l2': max(lambda_l2, 0),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
            "is_unbalance": True,
        }
        fold_scores = lgb.cv(cv_params, dataset,
                             nfold=n_folds,
                             seed=random_seed,
                             stratified=True,
                             verbose_eval=200,
                             metrics=['auc'])
        return max(fold_scores['auc-mean'])

    # Bounds explored for each hyper-parameter.
    search_space = {
        'num_leaves': (24, 45),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.8, 1),
        'max_depth': (5, 8.99),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 3),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (5, 50),
    }
    optimizer = BayesianOptimization(lgb_eval, search_space, random_state=0)
    optimizer.maximize(init_points=init_round, n_iter=opt_round)

    # Optionally dump the optimization trace.
    if output_process == True:
        optimizer.points_to_csv("bayes_opt_result.csv")

    return optimizer



### Load data with outliers removed, as in the previous version

In [None]:
# The clean data already has outliers (std > 2) removed.
raw = pd.read_csv("data/processed/clean_data.csv", index_col=["Timestamp"], parse_dates=["Timestamp"])
# Work on a renamed copy so `raw` stays untouched: "is_leakage" becomes "target".
df = raw.rename(columns={"is_leakage":"target"})
last_leakage_period = "6H" # Create target using rolling window of 6 hours
# Replace the target with its maximum over the rolling 6-hour window.
df["target"] = df[["target"]].rolling(last_leakage_period).max().copy()


In [None]:
# Standardize time series
# NOTE(review): the scaler is fit on the whole frame, before the train/test split
# performed further down, so held-out rows contribute to the fitted statistics.
# Consider fitting on the training rows only.
# The `[:, 1:]` slice skips the first column, presumably "target"; confirm the
# column order of clean_data.csv.
scaler = StandardScaler().fit(df.values[:, 1:])
df.iloc[:, 1:] = scaler.transform(df.values[:, 1:])
df.head()

In [None]:
# Remove 2020 to avoid the coronavirus effect
# Keeps the rows whose timestamp is strictly earlier than that of the
# 14000th-from-last row.
# NOTE(review): that this position matches the start of 2020 is an assumption; confirm.
df_2019 = df[df.index < df.index[-14000]]
df_2019.shape

In [None]:
def split_datasets(df, test_examples=25000):
    """Split a frame chronologically into train, test and validation parts.

    The cut-off is the index value of the ``test_examples``-th row from the
    end. Rows before it form the training set; rows at or after it form the
    hold-out, whose first half becomes the test set and whose remainder
    becomes the validation set.

    Args:
        df: DataFrame to split; expected to be sorted by its index.
        test_examples: Number of trailing rows reserved for test + validation.

    Returns:
        Tuple ``(df_train, df_test, df_val)``. Note the order: the test set
        comes before the validation set.

    Raises:
        IndexError: If ``df`` has fewer than ``test_examples`` rows.
    """
    cutoff = df.index[-test_examples]
    df_train = df[df.index < cutoff]
    # `>=` keeps the row sitting exactly on the cut-off; a strict `>` would
    # drop it from both the training set and the hold-out.
    holdout = df[df.index >= cutoff]
    half = test_examples // 2
    df_test = holdout.iloc[:half].copy()
    df_val = holdout.iloc[half:].copy()
    return df_train, df_test, df_val

In [None]:
# Size of the hold-out; split_datasets turns half of it into the test set and
# the rest into the validation set.
test_examples = 25000
# Pass the value explicitly so that editing `test_examples` actually changes the split.
df_train, df_test, df_val = split_datasets(df_2019, test_examples=test_examples)
df_train.head()

## Extract prophet features

In [None]:
# Every non-zero feature that Prophet provides.
cols = [
    "ds",
    'trend',
    'yhat_lower', 'yhat_upper',
    'trend_lower', 'trend_upper',
    'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
    'daily', 'daily_lower', 'daily_upper',
    'weekly', 'weekly_lower', 'weekly_upper',
    'yearly', 'yearly_lower', 'yearly_upper',
    'yhat',
]
# This reduced subset gives pretty much the same metrics as the full list above.
small_cols = [
    "ds",
    'trend',
    'additive_terms',
    'daily',
    'weekly',
    'yearly',
    'yhat',
]
def extract_prophet_features(df, column, model, cols):
    """Run Prophet on one column and return the model with its predictions.

    Args:
        df: Frame whose index is named "Timestamp".
        column: Name of the column to model.
        model: Either a dict of ``Prophet`` keyword arguments, in which case a
            new model is built and fit on ``df``, or an existing model
            object, which is used as-is without refitting.
        cols: Prediction columns to keep; must contain "ds".

    Returns:
        Tuple ``(model, features)`` where ``features`` holds the selected
        prediction columns indexed by "ds".
    """
    # Prophet expects the timestamp in "ds" and the observed value in "y".
    prophet_frame = (
        df[[column]]
        .reset_index()
        .rename(columns={"Timestamp": "ds", column: "y"})
    )
    if isinstance(model, dict):
        fitted = Prophet(**model).fit(prophet_frame)
    else:
        fitted = model
    forecast = fitted.predict(prophet_frame)
    return fitted, forecast[cols].set_index("ds")
    

In [None]:
# Prophet can be fine-tuned to get better forecasts
prophet_params = {
    "yearly_seasonality": True,
    "weekly_seasonality": True,
    "daily_seasonality": True,
}

In [None]:
# Fit one Prophet model per series on the training split, keeping the compact
# feature list (`small_cols`). Passing the parameter dict triggers a fresh fit.
column = "PressureBar"
model_pres, press_train = extract_prophet_features(df_train, column=column, model=prophet_params, cols=small_cols)
column = "m3Volume"
model_vol, volume_train = extract_prophet_features(df_train, column=column, model=prophet_params, cols=small_cols)

In [None]:
# Use the models fit on the training set to extract Prophet features for the test set.
# This is a conservative assumption, as the Prophet models could be continuously retrained
# in production to provide more accurate forecasts.
# Passing an already-fitted model (not a dict) means no refit happens here.
column = "PressureBar"
_, press_test = extract_prophet_features(df_test, column=column, model=model_pres, cols=small_cols)
column = "m3Volume"
_, volume_test = extract_prophet_features(df_test, column=column, model=model_vol, cols=small_cols)

## Add rolling statistics (Optional)

In [None]:
def add_rolling_means(df, periods):
    """Add rolling-mean versions of every column for each requested period.

    For each period ``p`` and each column ``c`` a new column named ``c_p`` is
    appended holding the rolling mean of ``c`` over ``p``. The new columns
    are attached by position rather than by merging on the index, so rows are
    never multiplied when the index contains repeated timestamps.

    Args:
        df: Input frame; it is not modified.
        periods: Iterable of window specifications accepted by
            ``DataFrame.rolling`` (e.g. "1H", "6H").

    Returns:
        A new DataFrame with the original columns followed by the rolling
        means, grouped by period in the order given.
    """
    df_c = df.copy()
    for period in periods:
        rolled = df.rolling(period).mean()
        # Iterate over the rolled frame's own columns in case rolling dropped
        # any column it could not aggregate.
        for position, name in enumerate(rolled.columns):
            # `.values` bypasses index alignment: rows map one-to-one by position.
            df_c["%s_%s" % (name, period)] = rolled.iloc[:, position].values
    return df_c

In [None]:
# Rolling-mean windows applied to the Prophet features of both splits.
# NOTE: the following cell rebinds these four `*_feats_*` names to the raw
# Prophet features, so on a top-to-bottom run the rolling means are discarded.
periods = ["1H", "2H", "6H", "12H", "24H"]
press_feats_train = add_rolling_means(press_train, periods)
vol_feats_train = add_rolling_means(volume_train, periods)
press_feats_test = add_rolling_means(press_test, periods)
vol_feats_test = add_rolling_means(volume_test, periods)

In [None]:
# Use the plain Prophet features, replacing the rolling-mean versions computed
# in the previous cell. Skip this cell to keep the rolling statistics.
press_feats_train = press_train
vol_feats_train = volume_train
press_feats_test = press_test
vol_feats_test = volume_test

## Create train and test sets

In [None]:
# Join the training rows with the pressure and volume features on their
# indexes; columns that clash with ones already present get a suffix.
train_features = (
    df_train
    .merge(press_feats_train,
           how="inner",
           left_index=True,
           right_index=True,
           suffixes=('', "_press"))
    .merge(vol_feats_train,
           how="inner",
           left_index=True,
           right_index=True,
           suffixes=('', "_vol"))
)

In [None]:
# Join the test rows with the pressure and volume features on their indexes;
# columns that clash with ones already present get a suffix.
test_features = (
    df_test
    .merge(press_feats_test,
           how="inner",
           left_index=True,
           right_index=True,
           suffixes=('', "_press"))
    .merge(vol_feats_test,
           how="inner",
           left_index=True,
           right_index=True,
           suffixes=('', "_vol"))
)

In [None]:
# Separate the label column from the model inputs for both splits.
train_y = train_features["target"]
train_x = train_features.drop(columns="target")
test_y = test_features["target"]
test_x = test_features.drop(columns="target")

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix,classification_report
def print_report(model):
    """Print classification reports and confusion matrices for a fitted model.

    Predictions are made on the notebook-level ``train_x`` and ``test_x`` and
    compared with ``train_y`` and ``test_y``; labels and predictions are cast
    to ``int`` before scoring.

    Args:
        model: Fitted estimator exposing ``predict``.
    """
    pred_train = model.predict(train_x).astype(int)
    pred_test = model.predict(test_x).astype(int)
    true_train = train_y.values.astype(int)
    true_test = test_y.values.astype(int)

    # (heading, scoring function, labels, predictions), in print order.
    sections = (
        ("TRAIN SET", classification_report, true_train, pred_train),
        ("\nTEST SET", classification_report, true_test, pred_test),
        ("\nTRAIN SET", confusion_matrix, true_train, pred_train),
        ("\nTEST DATSET", confusion_matrix, true_test, pred_test),
    )
    for heading, score, truth, predicted in sections:
        print(heading)
        print(score(truth, predicted))
    

In [None]:
import imblearn
from sklearn.ensemble import BaggingClassifier
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
# Worker count shared by the resamplers and estimators below.
# NOTE(review): 64 is hard-coded; confirm it suits the machine running the notebook.
n_jobs = 64
# Baseline: NearMiss (version 2) under-sampling followed by an elastic-net
# logistic regression with balanced class weights.
pipeline = make_pipeline(NearMiss(version=2, n_jobs=n_jobs),
                         LogisticRegression(max_iter=500,
                                            C=0.1,
                                            class_weight='balanced',
                                            n_jobs=n_jobs,
                                            penalty='elasticnet',
                                            solver="saga",
                                            l1_ratio=0.2))

In [None]:
# Fit the baseline pipeline on plain arrays, with the labels cast to int.
pipeline.fit(train_x.values, train_y.values.astype(int))

In [None]:
# Train/test classification reports and confusion matrices for the baseline.
print_report(pipeline)

In [None]:
# Fixed LightGBM hyper-parameters; presumably the output of a search such as
# bayes_parameter_opt_lgb (confirm the provenance).
params = {#'pos_bagging_fraction':0.4,
          #"bagging_fraction":0.5,
   'feature_fraction': 0.3721514930979355,
   'lambda_l1': 3.0138168803582195,
   'lambda_l2': 1.6346495489906907,
   'max_depth': None,
   'min_child_weight': 1.065235087999525,
   'min_split_gain': 0.04432113391500656,
   'num_leaves': 42}
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; no bagging_freq is set here, so confirm the intent.
gbm_2 = lgb.LGBMClassifier(objective='binary',metric='binary',
                           n_estimators=50,
                           bagging_fraction=0.5,
                           scale_pos_weight=1000, # tweaking this has a direct effect on prec/recall tradeoff
                           is_unbalance=False, **params)

# `gbm_2` is rebound here: from the bare classifier to a pipeline that
# resamples with SMOTEENN before fitting that classifier.
gbm_2 = make_pipeline(imblearn.combine.SMOTEENN(n_jobs=n_jobs),
                      gbm_2)

In [None]:

# Fit the SMOTEENN + LightGBM pipeline on plain arrays, with the labels cast to int.
gbm_2.fit(train_x.values, train_y.values.astype(int))

In [None]:
# Train/test classification reports and confusion matrices for the LightGBM pipeline.
print_report(gbm_2)

In [None]:
from collections import defaultdict
import ray
import tqdm
def get_cum_metrics(y_true, y_pred):
    """Collect class-"1" metrics over growing prefixes of the predictions.

    For every ``i`` from 1 up to ``len(y_true) - 1`` a classification report
    is computed on the first ``i`` items. When the report has a "1" section,
    each of its entries is appended to the list kept under the same name.

    Args:
        y_true: Sliceable sequence of true labels.
        y_pred: Sliceable sequence of predicted labels, aligned with ``y_true``.

    Returns:
        ``defaultdict(list)`` mapping each metric name to its list of values,
        one per prefix in which class "1" was reported.
    """
    # Import the progress-bar helper explicitly: a bare `import tqdm` does not
    # guarantee that the notebook-aware submodule is loaded as an attribute.
    from tqdm.auto import trange

    metrics = defaultdict(list)
    # The number of prefixes is derived from the labels passed in.
    for i in trange(1, len(y_true)):
        report = classification_report(y_true[:i], y_pred[:i], output_dict=True)
        for name, value in report.get("1", {}).items():
            metrics[name].append(value)
    return metrics



@ray.remote
def calculate_metrics(i, y_true, y_pred):
    """Ray task: classification report (as a dict) for the first ``i`` items."""
    truth_prefix, predicted_prefix = y_true[:i], y_pred[:i]
    return classification_report(truth_prefix, predicted_prefix, output_dict=True)


# Predict once, then score every prefix of the test set in parallel with Ray.
y_test_pred = gbm_2.predict(test_x)
ray.init(ignore_reinit_error=True)
# Put both arrays in Ray's object store once. Passing them by value would
# serialize the same data again for every submitted task.
y_true_ref = ray.put(test_y.astype(int))
y_pred_ref = ray.put(y_test_pred.astype(int))
proc_ids = [calculate_metrics.remote(i, y_true_ref, y_pred_ref) for i in range(1, len(test_y))]
results = ray.get(proc_ids)
results[0].keys()




In [None]:
# Keep only the prefixes whose report contains class "1" and tabulate its metrics.
cum_mets = pd.DataFrame.from_records([r["1"] for r in results if "1" in r])
# Plot the first three metric columns (presumably precision, recall and f1-score; confirm).
cum_mets.iloc[:, :3].plot()