# Imports

In [1]:
import warnings

import pandas as pd

from functions.loading import load_data

from functions.training_pipeline import training_pipeline
from functions.models import catboost_model

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

# Parameter definitions

In [2]:
# Training parameters: switches forwarded unchanged to training_pipeline
# (restricted_features is also forwarded to apply_model_on_raw_data).
restricted_features = False
save = True
customized_model = True

# Model registry handed to the pipeline: label -> object imported from functions.models.
models = {"catboost": catboost_model}

# Names passed as `targets` to the pipeline; each one is also a column of the loaded dataset.
targets = ["cf1", "cf2", "cf3", "cf123"]

# Results containers, all created empty.
# summary_final, estimated_scopes and summary_metrics_detailed are passed to training_pipeline.
summary_final = []
estimated_scopes = []
summary_metrics_detailed = pd.DataFrame()

# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
Summary_Final_train = []

# Data loading

In [3]:
# Dataset handed to the training pipeline as `preprocessed_dataset`.
# filter_outliers is left at load_data's default; save=True is requested.
# NOTE(review): keep this call first — presumably save=True writes the file that
# later load_data calls read; confirm in functions.loading.
preprocessed_dataset = load_data(save=True)
# Second load with outlier filtering disabled and nothing saved; handed to the
# training pipeline as `predict_dataset`.
predict_dataset = load_data(filter_outliers=False, save=False)

File not found, constructing it


In [4]:
# Inspect the column names of preprocessed_dataset.
preprocessed_dataset.columns

Index(['company_id', 'company_name', 'isin', 'country_hq', 'gics_sector',
       'gics_group', 'gics_ind', 'gics_sub_ind', 'gics_name', 'fiscal_year',
       'ref_cf1', 'ref_cf2', 'ref_cf3', 'ref_cf123', 'revenue', 'ebitda',
       'ebit', 'capex', 'gppe', 'nppe', 'accu_dep', 'intan', 'cogs', 'gmar',
       'asset', 'lt_debt', 'employees', 'energy_produced', 'energy_consumed',
       'cdp_cf1', 'cdp_cf2', 'cdp_cf3', 'cdp_cf123', 'cf1', 'cf2', 'cf3',
       'cf123', 'ticker', 'lei', 'country_code', 'co2_law', 'co2_scheme',
       'co2_status', 'co2_coverage', 'start_year', 'status', 'price_index',
       'year', 'fuel_intensity', 'country', 'income_group', 'region', 'age',
       'cap_inten', 'leverage', 'intensity_cf1', 'intensity_cf2',
       'intensity_cf3', 'intensity_cf123'],
      dtype='object')

In [5]:
# Count the distinct company_id values in preprocessed_dataset.
preprocessed_dataset["company_id"].nunique()

47185

# Training pipeline 

In [None]:
# Run the training pipeline with the parameters, containers and datasets defined above.
# It returns four values; summary_metrics_detailed is rebound to the returned object.
# NOTE(review): summary_final and estimated_scopes are passed in but not reassigned from
# the return value (the third result is named summary_global), so presumably they are
# filled in place — confirm in functions.training_pipeline.
# NOTE(review): this cell has no execution count (In [None]) although later cells show
# its outputs; re-run with Restart & Run All before sharing.
best_scores, best_stds, summary_global, summary_metrics_detailed = training_pipeline(
    targets=targets,
    models=models,
    summary_final=summary_final,
    summary_metrics_detailed=summary_metrics_detailed,
    estimated_scopes=estimated_scopes,
    preprocessed_dataset=preprocessed_dataset,
    predict_dataset=predict_dataset,
    restricted_features=restricted_features,
    save=save,
    customized_model=customized_model,
)

In [7]:
# First value returned by training_pipeline.
# NOTE(review): four values for four targets — presumably one score per entry of
# `targets`, in that order; confirm the ordering and which metric this is.
best_scores

[0.5159522342875701,
 0.4249486997368801,
 0.8797295366181928,
 0.5355338542112806]

In [8]:
# Second value returned by training_pipeline.
# NOTE(review): presumably the standard deviation matching each entry of best_scores,
# in the same order — confirm against functions.training_pipeline.
best_stds

[0.09570897043567152,
 0.07097603476619248,
 0.07225981433916971,
 0.05061696541726524]

# Model application

In [None]:
from functions.apply_model import apply_model_on_raw_data

In [10]:
# Load the data again without outlier filtering and without saving
# (same load_data arguments as predict_dataset in the data-loading cell).
raw_dataset = load_data(filter_outliers=False, save=False)

# Apply the model to the freshly loaded data; nothing is saved by this call.
estimations = apply_model_on_raw_data(
    raw_dataset,
    save=False,
    restricted_features=restricted_features,
)

# Show the result as the cell output.
estimations