In [1]:
import warnings

import pandas as pd
import numpy as np

from functions.loading import load_data

from functions.preprocessing import outliers_preprocess
from functions.training_pipeline import training_pipeline
from functions.models import xgboost_model, catboost_model, lgbm_model

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None


In [2]:
# --- Paths (all relative to the notebook's working directory) ---------------
path_rawdata = 'data/raw_data/'
path_models = 'models/proprietary_data/'
path_Benchmark = 'Benchmark/'
path_results = 'results/proprietary_data/'
path_intermediary = 'data/intermediary_data/proprietary_data/'
# Derived from path_results (single assignment) so the plot folder cannot
# drift apart from the results folder; value is 'results/proprietary_data/plot/'.
path_plot = path_results + 'plot/'

# --- Targets and candidate models --------------------------------------------
targets = ["CF1_log", "CF2_log", "CF3_log", "CF123_log"]
models = {
    # "xgboost": xgboost_model,
    "catboost": catboost_model,
    "lgbm": lgbm_model,
}

# --- Parameters read by training_pipeline -------------------------------------
training_parameters = {
    # Forwarded to every model builder.
    "seed": 0,
    "n_iter": 10,
    # Forwarded to custom_train_split.
    "extended_features": [
        "Revenue_log",
        "EMP_log",
        "Asset_log",
        "NPPE_log",
        "CapEx_log",
        "Age",
        "CapInten",
        "GMAR",
        "Leverage",
        "Price",
        "FuelIntensity",
        "FiscalYear",
        "ENEConsume_log",
        "ENEProduce_log",
        "INTAN_log",
        "AccuDep_log",
        "COGS_log",
    ],
    "selec_sect": ["GICSSubInd", "GICSInd", "GICSGroup"],
    "fill_grp": "",
    "old_pipe": False,
    # Forwarded to every model builder.
    "cross_val": False,
}

# --- Run options and accumulators handed to training_pipeline ----------------
use_weights = None
companies = True
Summary_Final = []
Summary_Final_train = []  # only referenced by a commented-out argument below
ensemble = []
summary_metrics_detailed = pd.DataFrame()
estimated_scopes = []

In [3]:
# Build the working dataset from the files under `path_rawdata`.
# NOTE(review): `save=True` presumably makes load_data persist its result - confirm in functions.loading.
preprocessed_dataset = load_data(path_rawdata, save=True)

In [4]:
# Expose each merged emission column under its short scope name.
for scope_name in ("CF1", "CF2", "CF3", "CF123"):
    preprocessed_dataset[scope_name] = preprocessed_dataset[f"{scope_name}_merge"]
preprocessed_dataset["CDP_CF2"] = preprocessed_dataset["CDP_CF2_location"]

# Key combining headquarters country and GICS sub-industry (used for the sample weights).
preprocessed_dataset["country_sector"] = (
    preprocessed_dataset["CountryHQ"].astype(str)
    + "_"
    + preprocessed_dataset["GICSSubInd"].astype(str)
)

# Outlier preprocessing, one merged target at a time (author's note: about 50 sec).
threshold_under = 1.5
threshold_over = 2.5
for target in ["CF1_merge", "CF2_merge", "CF3_merge", "CF123_merge"]:
    preprocessed_dataset = outliers_preprocess(
        preprocessed_dataset,
        target,
        threshold_under=threshold_under,
        threshold_over=threshold_over,
    )

In [10]:
# Baseline test: every target, no sample weights (use_weights=None), nothing saved (save=False).
targets = ["CF1_log","CF2_log","CF3_log", "CF123_log"]

best_scores, best_stds, summary_global, summary_metrics_detailed = training_pipeline(
    name_experiment="restriction_CF123_test_base_563232",
    path_Benchmark=path_Benchmark,
    path_results=path_results,
    path_models=path_models,
    path_intermediary=path_intermediary,
    path_plot = path_plot,
    targets=targets,
    models=models,
    Summary_Final=Summary_Final,
    # Summary_Final_train is not passed to the pipeline (argument left disabled):
    # Summary_Final_train=Summary_Final_train,
    ensemble=ensemble,
    summary_metrics_detailed=summary_metrics_detailed,
    estimated_scopes = estimated_scopes,
    preprocessed_dataset=preprocessed_dataset,
    training_parameters=training_parameters,
    open_data=False,
    save=False,
    use_weights=None,
    companies=True,
)

CF1_log
Files not found, constructing them
preprocessing done


Registered model 'catboost' already exists. Creating a new version of this model...
2023/08/04 15:14:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: catboost, version 67
Created version '67' of model 'catboost'.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


Registered model 'lgbm' already exists. Creating a new version of this model...
2023/08/04 15:14:48 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: lgbm, version 67
Created version '67' of model 'lgbm'.


modelisation done
CF2_log
Using pre created preprocessed files
preprocessing done


Registered model 'catboost' already exists. Creating a new version of this model...
2023/08/04 15:15:05 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: catboost, version 68
Created version '68' of model 'catboost'.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


Registered model 'lgbm' already exists. Creating a new version of this model...
2023/08/04 15:15:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: lgbm, version 68
Created version '68' of model 'lgbm'.


modelisation done
CF3_log
Using pre created preprocessed files
preprocessing done


Registered model 'catboost' already exists. Creating a new version of this model...
2023/08/04 15:15:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: catboost, version 69
Created version '69' of model 'catboost'.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


Registered model 'lgbm' already exists. Creating a new version of this model...
2023/08/04 15:15:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: lgbm, version 69
Created version '69' of model 'lgbm'.


modelisation done
CF123_log
Using pre created preprocessed files
preprocessing done


Registered model 'catboost' already exists. Creating a new version of this model...
2023/08/04 15:15:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: catboost, version 70
Created version '70' of model 'catboost'.


You can set `force_col_wise=true` to remove the overhead.


Registered model 'lgbm' already exists. Creating a new version of this model...
2023/08/04 15:15:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: lgbm, version 70
Created version '70' of model 'lgbm'.


modelisation done


In [20]:
companies

True

In [13]:
best_scores
# base 
# [0.5233991620098789, 0.40811509811421237, 0.85109105089482, 0.5229286278975259]

# weights company True
# [0.5210419733631437, 0.4159629294599453, 0.8637620192374258, 0.529039907808258]

# weights company False
# [0.5282566564237224, 0.4276567152521008,  0.8902519108927333, 0.5443683264545418]

# weights company False gradient L1

# weights company False gradient L2

# weights company True gradient L2

[0.5233991620098789, 0.40811509811421237, 0.85109105089482, 0.5229286278975259]

In [6]:
# Weighted test: sample weights enabled (use_weights=True), without the per-company
# term (companies=False), and with the "L1" custom gradient.
# NOTE(review): the output recorded for this cell ends in a CatBoostError about the
# weight value type, so this configuration did not complete in the saved run.
targets = ["CF1_log","CF2_log","CF3_log", "CF123_log"]

best_scores, best_stds, summary_global, summary_metrics_detailed = training_pipeline(
    name_experiment="restriction_CF123_test_gradient_0",
    path_Benchmark=path_Benchmark,
    path_results=path_results,
    path_models=path_models,
    path_intermediary=path_intermediary,
    path_plot = path_plot,
    targets=targets,
    models=models,
    Summary_Final=Summary_Final,
    # Summary_Final_train is not passed to the pipeline (argument left disabled):
    # Summary_Final_train=Summary_Final_train,
    ensemble=ensemble,
    summary_metrics_detailed=summary_metrics_detailed,
    estimated_scopes = estimated_scopes,
    preprocessed_dataset=preprocessed_dataset,
    training_parameters=training_parameters,
    open_data=False,
    save=False,
    use_weights=True,
    companies=False,
    custom_gradient="L1", # options noted by the author: False, "L1", "L2"
)

CF1_log
Using pre created preprocessed files
preprocessing done


CatBoostError: Invalid weight value type=<class 'numpy.ndarray'>: must be 1 dimensional data with int, float or long types.

In [18]:
best_scores

[0.5210419733631437, 0.4159629294599453, 0.8637620192374258, 0.529039907808258]

In [36]:
# target=targets[0]
# scope=target[:-4]
# a = preprocessed_dataset[["FinalEikonID","CDP_"+scope, scope, "country_sector"]]
# weights_creation(a, scope, companies=True)

In [21]:
import mlflow
import numpy as np

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
from functions.preprocessing import custom_train_split
from functions.results import best_model_analysis, metrics, results


def _inverse_frequency(column):
    """Return, for each row, 1 / (number of rows of `column` sharing that row's value)."""
    occurrences = column.map(column.value_counts())
    if occurrences.isna().any():
        # value_counts() ignores missing values, so they have no frequency;
        # fail loudly (as a plain lookup would) rather than emit NaN weights.
        raise KeyError(f"missing value in column {column.name!r}: cannot compute a frequency weight")
    return 1 / occurrences


def weights_creation(df, scope, path_intermediary=None, companies=True):
    """
    Compute one sample weight per row of `df`.

    The final weight is the product of:
      * a reliability weight: 2 where `df[scope]` equals `df["CDP_" + scope]`, 1 elsewhere;
      * the inverse of the number of rows sharing the row's `country_sector`;
      * when `companies` is true, the inverse of the number of rows sharing the
        row's `FinalEikonID`.

    The intermediate columns (`weight_reliability`, `weight_country_sector`,
    `weight_companies` when applicable) and `weight_final` are written onto `df`
    itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `scope`, `"CDP_" + scope`, `country_sector`
        and, when `companies` is true, `FinalEikonID`.
    scope : str
        Name of the emission column to compare with its `CDP_` counterpart.
    path_intermediary : optional
        Unused by this function; kept (now optional) so existing calls keep working.
    companies : bool, default True
        Whether to include the per-company inverse-frequency term.

    Returns
    -------
    pandas.Series
        The `weight_final` column with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If `country_sector` (or `FinalEikonID` when used) contains missing values.
    """
    # A boolean mask is used instead of index labels: selecting by label would
    # also hit rows that merely share an index label with a matching row
    # (duplicate labels can appear after a join) or fail on a length mismatch.
    matches_cdp = df[scope] == df["CDP_" + scope]
    df["weight_reliability"] = np.where(matches_cdp, 2.0, 1.0)

    # Vectorized lookup of the group sizes instead of a row-by-row apply.
    df["weight_country_sector"] = _inverse_frequency(df["country_sector"])

    if companies:
        df["weight_companies"] = _inverse_frequency(df["FinalEikonID"])
        df["weight_final"] = df["weight_reliability"] * df["weight_companies"] * df["weight_country_sector"]
    else:
        df["weight_final"] = df["weight_reliability"] * df["weight_country_sector"]

    return df["weight_final"].reset_index(drop=True)


def training_pipeline(
    name_experiment,
    path_Benchmark,
    path_results,
    path_models,
    path_intermediary,
    path_plot,
    targets,
    models,
    Summary_Final,
    ensemble,
    summary_metrics_detailed,
    estimated_scopes,
    preprocessed_dataset,
    training_parameters,
    open_data=False,
    save=False,
    use_weights=None,
    companies=True,
    custom_gradient=False,
):
    """
    Apply the training pipeline to the input targets, models and parameters.

    For every target: split the data with `custom_train_split`, optionally
    build sample weights, fit every model builder of `models` inside its own
    MLflow run, log its test metrics and keep the score of the best model
    (lowest RMSE as returned by `metrics`).

    Parameters
    ----------
    name_experiment : str
        Suffix of the MLflow experiment name (`"Models_<name_experiment>"`).
    path_Benchmark, path_intermediary :
        Forwarded to `custom_train_split` (`path_intermediary` also to
        `weights_creation` and `best_model_analysis`).
    path_results :
        Forwarded to `results` when `save` is true.
    path_models, path_plot :
        Forwarded to `best_model_analysis` when `save` is true.
    targets : iterable of str
        Target names; the last four characters are dropped to obtain the scope
        column name (e.g. "CF1_log" -> "CF1").
    models : dict
        Maps a model name to a builder called as
        `builder(X_train, y_train, cross_val=..., n_jobs=-1, verbose=0,
        n_iter=..., seed=..., weights=..., custom_gradient=...)`; the returned
        object must expose `predict`.
    Summary_Final :
        Forwarded to `metrics` and `results`.
    ensemble : list
        Every fitted model is appended to this list (the caller's list is mutated).
    summary_metrics_detailed, estimated_scopes :
        Forwarded to, and replaced by the outputs of, `best_model_analysis`
        when `save` is true.
    preprocessed_dataset : pandas.DataFrame
        Full dataset; also the source of the columns used for the weights.
    training_parameters : dict
        Must provide "extended_features", "selec_sect", "fill_grp", "old_pipe",
        "cross_val", "seed" and "n_iter".
    open_data : bool
        Forwarded to `custom_train_split` and `best_model_analysis`.
    save : bool
        When true, run `best_model_analysis` and `results` for each target.
    use_weights :
        When truthy, sample weights from `weights_creation` are passed to the builders.
    companies : bool
        Forwarded to `weights_creation`.
    custom_gradient :
        Forwarded to the model builders.

    Returns
    -------
    tuple
        `(best_scores, best_stds, summary_global, summary_metrics_detailed)`;
        the two lists hold one entry per target, `summary_global` is the last
        value returned by `metrics` (None when `targets` is empty).

    Raises
    ------
    ValueError
        If `models` is empty.
    """
    if not models:
        # Fail before any expensive preprocessing; min() on an empty score
        # list would raise the same exception type much later.
        raise ValueError("`models` must contain at least one model builder")

    best_scores = []
    best_stds = []
    summary_global = None  # stays None when `targets` is empty

    # set_experiment creates the experiment when it does not exist yet, so the
    # pipeline can be re-run with the same name (create_experiment raises on
    # an already existing experiment).
    mlflow.set_experiment(f"Models_{name_experiment}")

    seed = training_parameters["seed"]
    n_iter = training_parameters["n_iter"]

    for target in targets:
        print(target)
        test_scores = []
        test_stds = []
        # Models fitted for THIS target only; `ensemble` also holds the models
        # of previous targets (and previous calls), so it cannot be indexed
        # with a per-target position.
        target_models = []
        (
            X_train,
            y_train,
            X_test,
            y_test,
            df_test,
        ) = custom_train_split(
            preprocessed_dataset,
            path_Benchmark,
            path_intermediary,
            target,
            # threshold_under=training_parameters["threshold_under"],
            # threshold_over=training_parameters["threshold_over"],
            extended_features=training_parameters["extended_features"],
            selec_sect=training_parameters["selec_sect"],
            fill_grp=training_parameters["fill_grp"],
            old_pipe=training_parameters["old_pipe"],
            open_data=open_data,
        )
        print("preprocessing done")

        if use_weights:
            scope = target[:-4]
            df_train_merged = X_train.join(
                preprocessed_dataset[["FinalEikonID", "CDP_" + scope, scope, "country_sector"]]
            )
            # `companies` must be passed by keyword: the third positional
            # parameter of weights_creation is `path_intermediary`.
            weights = weights_creation(df_train_merged, scope, path_intermediary, companies=companies)
            df_train_merged.FinalEikonID.to_csv("data/intermediary_data/companies_ids.csv", index=False)
        else:
            weights = None

        for model_name, model in models.items():
            with mlflow.start_run():
                model_i = model(
                    X_train,
                    y_train,
                    cross_val=training_parameters["cross_val"],
                    n_jobs=-1,
                    verbose=0,
                    n_iter=n_iter,
                    seed=seed,
                    weights=weights,
                    custom_gradient=custom_gradient,
                )
                y_pred = model_i.predict(X_test)

                summary_global, rmse, std = metrics(y_test, y_pred, Summary_Final, target, model_name)

                # RMSE is derived from the MSE instead of `squared=False`,
                # which recent scikit-learn releases no longer accept.
                mse = mean_squared_error(y_test, y_pred)
                mlflow.log_metric("mae", mean_absolute_error(y_test, y_pred))
                mlflow.log_metric("rmse", float(np.sqrt(mse)))
                mlflow.log_metric("mse", mse)
                mlflow.log_metric("r2", r2_score(y_test, y_pred))
                mlflow.log_metric("mape", mean_absolute_percentage_error(y_test, y_pred))
                mlflow.log_param("target", target)
                mlflow.log_param("model", model_name)
                # Log the fitted estimator, not the builder function.
                mlflow.sklearn.log_model(model_i, "models", registered_model_name=model_name)

                ensemble.append(model_i)
                target_models.append(model_i)
                test_scores.append(rmse)
                test_stds.append(std)

        best_model_index = test_scores.index(min(test_scores))
        best_scores.append(test_scores[best_model_index])
        best_stds.append(test_stds[best_model_index])
        print("modelisation done")

        if save:
            best_model = target_models[best_model_index]
            summary_metrics_detailed, estimated_scopes, lst = best_model_analysis(
                best_model,
                X_test,
                X_train,
                y_test,
                df_test,
                target,
                path_plot,
                preprocessed_dataset,
                path_intermediary,
                summary_metrics_detailed,
                estimated_scopes,
                training_parameters,
                open_data,
                path_models,
            )

            results(estimated_scopes, path_results, summary_metrics_detailed, Summary_Final, lst)

    return best_scores, best_stds, summary_global, summary_metrics_detailed


In [7]:
# Settings for the manual (cell-level) training loop that follows.
custom_gradient = "L1"
use_weights = True

# Fresh accumulators: one best score / std per target.
best_scores, best_stds = [], []

# Model builders to compare ("xgboost": xgboost_model is left out).
models = {"catboost": catboost_model, "lgbm": lgbm_model}

In [23]:
# Manual, cell-level version of the per-target training loop: split the data,
# optionally build sample weights, fit every model builder inside its own
# MLflow run and keep the best (lowest RMSE) score per target.
for target in targets:
    print(target)
    test_scores = []
    test_stds = []
    (
        X_train,
        y_train,
        X_test,
        y_test,
        df_test,
    ) = custom_train_split(
        preprocessed_dataset,
        path_Benchmark,
        path_intermediary,
        target,
        extended_features=training_parameters["extended_features"],
        selec_sect=training_parameters["selec_sect"],
        fill_grp=training_parameters["fill_grp"],
        old_pipe=training_parameters["old_pipe"],
        open_data=False,
    )
    print("preprocessing done")

    if use_weights:
        scope = target[:-4]
        df_train_merged = X_train.join(
            preprocessed_dataset[["FinalEikonID", "CDP_" + scope, scope, "country_sector"]]
        )
        # `companies` must be passed by keyword: the third positional
        # parameter of weights_creation is `path_intermediary`.
        weights = weights_creation(df_train_merged, scope, path_intermediary, companies=companies)
        df_train_merged.FinalEikonID.to_csv("data/intermediary_data/companies_ids.csv", index=False)
    else:
        # Without this branch `weights` is undefined on a fresh kernel, or
        # silently reuses the weights of a previous target / run.
        weights = None

    seed = training_parameters["seed"]
    n_iter = training_parameters["n_iter"]
    for model_name, model in models.items():
        with mlflow.start_run():
            model_i = model(
                X_train,
                y_train,
                cross_val=training_parameters["cross_val"],
                n_jobs=-1,
                verbose=0,
                n_iter=n_iter,
                seed=seed,
                weights=weights,
                custom_gradient=custom_gradient,
            )
            y_pred = model_i.predict(X_test)

            summary_global, rmse, std = metrics(y_test, y_pred, Summary_Final, target, model_name)

            # RMSE is derived from the MSE instead of `squared=False`,
            # which recent scikit-learn releases no longer accept.
            mse = mean_squared_error(y_test, y_pred)
            mlflow.log_metric("mae", mean_absolute_error(y_test, y_pred))
            mlflow.log_metric("rmse", float(np.sqrt(mse)))
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2", r2_score(y_test, y_pred))
            mlflow.log_metric("mape", mean_absolute_percentage_error(y_test, y_pred))
            mlflow.log_param("target", target)
            mlflow.log_param("model", model_name)
            # Log the fitted estimator, not the builder function.
            mlflow.sklearn.log_model(model_i, "models", registered_model_name=model_name)
            ensemble.append(model_i)
            test_scores.append(rmse)
            test_stds.append(std)

    best_index = test_scores.index(min(test_scores))
    best_scores.append(test_scores[best_index])
    best_stds.append(test_stds[best_index])
    print("modelisation done")


CF1_log
Using pre created preprocessed files
preprocessing done


ValueError: If using all scalar values, you must pass an index

In [20]:
# companies_ids = pd.read_csv("data/intermediary_data/companies_ids.csv")
companies_ids

Unnamed: 0,FinalEikonID
0,0812.HK
1,IPO.L
2,IPO.L
3,AGNC.OQ
4,AGNC.OQ
...,...
22304,FWONA.OQ
22305,FWONA.OQ
22306,FWONA.OQ
22307,MAHAa.ST
