# Imports

In [9]:
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import plotly

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet


import logging

# Definir o nível de log para WARNING
optuna.logging.set_verbosity(optuna.logging.WARNING)
#import seaborn as sns
#from matplotlib import pyplot
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Helper Functions

In [10]:
def calculate_metrics(y_true, y_pred, model_name):
    # Calcular R2
    r2 = r2_score(y_true, y_pred)
    
    # Calcular MSE
    mse = mean_squared_error(y_true, y_pred)
    
    # Calcular RMSE
    rmse = np.sqrt(mse)
    
    # Calcular MAE
    mae = mean_absolute_error(y_true, y_pred)
    
    # Calcular MAPE
    y_true_arr, y_pred_arr = np.array(y_true), np.array(y_pred)
    mape = np.mean(np.abs((y_true_arr - y_pred_arr) / y_true_arr)) * 100
    
    return {
        "model_name": model_name,
        "R2": r2,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape
    }


# Data load

In [11]:
# Dados de Treinamento
X_train = pd.read_csv('../../data/Regressao/X_training.csv')
y_train = pd.read_csv('../../data/Regressao/y_training.csv').values.ravel()

# Dados de Test
X_test = pd.read_csv('../../data/Regressao/X_test.csv')
y_test = pd.read_csv('../../data/Regressao/y_test.csv').values.ravel()

# Dados de Validacao
X_val = pd.read_csv('../../data/Regressao/X_validation.csv')
y_val = pd.read_csv('../../data/Regressao/y_val.csv').values.ravel()

# Machine Learning

## Decision Tree Regression

In [12]:
def dt_fine_tuning(trial):
    max_depth = trial.suggest_int('max_depth', 2,100)
    dt_model = DecisionTreeRegressor(max_depth=max_depth)
    dt_model.fit(X_train, y_train)
    y_pred = dt_model.predict(X_val)

    return calculate_metrics(y_val, y_pred, 'Decision Tree')['RMSE']


In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(dt_fine_tuning, n_trials=100)

In [14]:
optuna.visualization.plot_optimization_history(study)

In [15]:
dt_best_params = study.best_params
dt_best_params

{'max_depth': 5}

## Random Forest Regressor

In [16]:
def rf_fine_tuning(trial):
    n_estimators = trial.suggest_int('n_estimators', 10,200)
    max_depth = trial.suggest_int('max_depth', 2,50)
    rf_model = RandomForestRegressor(n_estimators = n_estimators, max_depth=max_depth)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_val)

    return calculate_metrics(y_val, y_pred, 'Random Forest')['RMSE']

In [17]:
study = optuna.create_study(direction='minimize')
study.optimize(rf_fine_tuning, n_trials=10)

In [18]:
optuna.visualization.plot_optimization_history(study)

In [19]:
rf_best_params = study.best_params
rf_best_params

{'n_estimators': 149, 'max_depth': 36}

## Polinomial Regression

In [20]:
# poly = PolynomialFeatures(degree=5)
# poly_features = poly.fit_transform(X_train)
# X_poly_val = poly.transform(X_val)

# model = LinearRegression()
# model.fit(poly_features, y_train)
# y_pred = model.predict(X_poly_val)


In [21]:
def pol_reg_fine_tuning(trial):
    degree = trial.suggest_int('degree', 2,4)

    poly = PolynomialFeatures(degree=degree)
    poly_features = poly.fit_transform(X_train)
    X_poly_val = poly.transform(X_val)

    model = LinearRegression()
    model.fit(poly_features, y_train)
    y_pred = model.predict(X_poly_val)

    return calculate_metrics(y_val, y_pred, 'Polinomial Regression')['RMSE']


In [22]:
study = optuna.create_study(direction='minimize')
study.optimize(pol_reg_fine_tuning, n_trials=5)

In [23]:
optuna.visualization.plot_optimization_history(study)

In [24]:
pol_best_params = study.best_params
pol_best_params

{'degree': 2}

## Lasso

In [25]:
def lasso_fine_tuning(trial):
    #degree = trial.suggest_int('degree', 2,4)
    alpha = trial.suggest_float('alpha', 0.1,5.0)
    #l1_ratio = trial.suggest_int('l1_ratio', 1,10)
    max_iter = trial.suggest_int('max_iter', 500,3000)

    model_lasso = Lasso(alpha = alpha, max_iter = max_iter)
    model_lasso.fit(X_train, y_train)

    y_pred = model_lasso.predict(X_val)

    return calculate_metrics(y_val, y_pred, 'Lasso')['RMSE']


In [26]:
study = optuna.create_study(direction='minimize')
study.optimize(lasso_fine_tuning, n_trials=500)

In [27]:
optuna.visualization.plot_optimization_history(study)

In [28]:
lasso_best_params = study.best_params
lasso_best_params

{'alpha': 0.10004232782398756, 'max_iter': 2559}

## Ridge

In [29]:
def ridge_fine_tuning(trial):
    #degree = trial.suggest_int('degree', 2,4)
    alpha = trial.suggest_float('alpha', 0.1,5.0)
    #l1_ratio = trial.suggest_int('l1_ratio', 1,10)
    max_iter = trial.suggest_int('max_iter', 500,3000)

    model_ridge = Ridge(alpha = alpha, max_iter = max_iter)
    model_ridge.fit(X_train, y_train)

    y_pred = model_ridge.predict(X_val)

    return calculate_metrics(y_val, y_pred, 'Lasso')['RMSE']


In [30]:
study = optuna.create_study(direction='minimize')
study.optimize(ridge_fine_tuning, n_trials=500)

In [31]:
optuna.visualization.plot_optimization_history(study)

In [32]:
ridge_best_params = study.best_params
ridge_best_params

{'alpha': 4.999979427501249, 'max_iter': 1379}

## Elastic Net

In [33]:
def elasticnet_fine_tuning(trial):
    #degree = trial.suggest_int('degree', 2,4)
    alpha = trial.suggest_float('alpha', 0.1,5.0)
    l1_ratio = trial.suggest_int('l1_ratio', 0,1)
    max_iter = trial.suggest_int('max_iter', 500,3000)

    model_elasticnet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter)
    model_elasticnet.fit(X_train, y_train)

    y_pred = model_elasticnet.predict(X_val)

    return calculate_metrics(y_val, y_pred, 'Lasso')['RMSE']


In [34]:
study = optuna.create_study(direction='minimize')
study.optimize(elasticnet_fine_tuning, n_trials=10)

In [35]:
optuna.visualization.plot_optimization_history(study)

In [36]:
elastic_best_params = study.best_params
elastic_best_params

{'alpha': 1.3228068169885963, 'l1_ratio': 1, 'max_iter': 2450}

# All Results

In [37]:
model_elasticnet = ElasticNet(alpha=elastic_best_params['alpha'], l1_ratio=elastic_best_params['l1_ratio'], max_iter=elastic_best_params['max_iter'])
model_ridge = Ridge(alpha = ridge_best_params['alpha'], max_iter = ridge_best_params['max_iter'])
model_lasso = Lasso(alpha = lasso_best_params['alpha'], max_iter = lasso_best_params['max_iter'])
rf_model = RandomForestRegressor(n_estimators = rf_best_params['n_estimators'], max_depth=rf_best_params['max_depth'])
dt_model = DecisionTreeRegressor(max_depth=dt_best_params['max_depth'])



In [38]:
model_elasticnet.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_lasso.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)

In [39]:
model_list = [model_elasticnet, model_ridge, model_lasso, rf_model, dt_model]
model_names = ['ElasticNet', 'Ridge', 'Lasso', 'Random Forest', 'Decision Tree']

# Nos dados de Treinamento
train_dataset = pd.DataFrame()
test_dataset = pd.DataFrame()
val_dataset = pd.DataFrame()

In [40]:
for model, model_name in zip(model_list, model_names):
    y_pred = model.predict(X_train)
    results = calculate_metrics(y_train, y_pred, model_name)
    train_dataset = pd.concat([train_dataset, pd.DataFrame([results])])
train_dataset['Dataset'] = 'Train'
train_dataset

Unnamed: 0,model_name,R2,MSE,RMSE,MAE,MAPE,Dataset
0,ElasticNet,0.005832,475.224652,21.799648,17.315087,873.812735,Train
0,Ridge,0.046045,456.002667,21.354219,16.998596,865.427546,Train
0,Lasso,0.041215,458.311099,21.408202,17.046806,866.796443,Train
0,Random Forest,0.903966,45.905463,6.775357,4.837067,260.748776,Train
0,Decision Tree,0.113523,423.747268,20.585122,16.368766,786.953603,Train


In [41]:
# Nos dados de Validation
for model, model_name in zip(model_list, model_names):
    print(model_name)
    y_pred = model.predict(np.array(X_val))
    results = calculate_metrics(y_val, y_pred, model_name)
    val_dataset = pd.concat([val_dataset, pd.DataFrame([results])])
val_dataset['Dataset'] = 'Validation'
val_dataset

ElasticNet
Ridge
Lasso
Random Forest
Decision Tree



X does not have valid feature names, but ElasticNet was fitted with feature names


X does not have valid feature names, but Ridge was fitted with feature names


X does not have valid feature names, but Lasso was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names



Unnamed: 0,model_name,R2,MSE,RMSE,MAE,MAPE,Dataset
0,ElasticNet,0.006162,474.569322,21.784612,17.282933,869.079029,Validation
0,Ridge,0.039936,458.441514,21.411247,17.038532,868.191846,Validation
0,Lasso,0.037193,459.751563,21.441818,17.047462,868.689804,Validation
0,Random Forest,0.331565,319.185493,17.865763,13.016816,704.813992,Validation
0,Decision Tree,0.063559,447.161319,21.146189,16.843452,839.577848,Validation


In [42]:
for model, model_name in zip(model_list, model_names):
    print(model_name)
    y_pred = model.predict(np.array(X_test))
    results = calculate_metrics(y_test, y_pred, model_name)
    test_dataset = pd.concat([test_dataset, pd.DataFrame([results])])
test_dataset['Dataset'] = 'Test'
test_dataset

ElasticNet
Ridge
Lasso
Random Forest



X does not have valid feature names, but ElasticNet was fitted with feature names


X does not have valid feature names, but Ridge was fitted with feature names


X does not have valid feature names, but Lasso was fitted with feature names


X does not have valid feature names, but RandomForestRegressor was fitted with feature names



Decision Tree



X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names



Unnamed: 0,model_name,R2,MSE,RMSE,MAE,MAPE,Dataset
0,ElasticNet,0.00597,483.994404,21.999873,17.489541,874.224401,Test
0,Ridge,0.052265,461.45297,21.481456,17.128819,852.645702,Test
0,Lasso,0.044723,465.125274,21.566763,17.175636,859.296661,Test
0,Random Forest,0.351922,315.549949,17.763726,13.012314,652.351427,Test
0,Decision Tree,0.072181,451.755789,21.254547,17.010757,783.395225,Test


# Join Results

In [43]:
full_results = pd.concat([train_dataset, val_dataset, test_dataset])
full_results

Unnamed: 0,model_name,R2,MSE,RMSE,MAE,MAPE,Dataset
0,ElasticNet,0.005832,475.224652,21.799648,17.315087,873.812735,Train
0,Ridge,0.046045,456.002667,21.354219,16.998596,865.427546,Train
0,Lasso,0.041215,458.311099,21.408202,17.046806,866.796443,Train
0,Random Forest,0.903966,45.905463,6.775357,4.837067,260.748776,Train
0,Decision Tree,0.113523,423.747268,20.585122,16.368766,786.953603,Train
0,ElasticNet,0.006162,474.569322,21.784612,17.282933,869.079029,Validation
0,Ridge,0.039936,458.441514,21.411247,17.038532,868.191846,Validation
0,Lasso,0.037193,459.751563,21.441818,17.047462,868.689804,Validation
0,Random Forest,0.331565,319.185493,17.865763,13.016816,704.813992,Validation
0,Decision Tree,0.063559,447.161319,21.146189,16.843452,839.577848,Validation


## Comparing Performance

In [44]:
list_datasets = np.unique(full_results['Dataset'])
for i in list_datasets:
    subset = full_results.loc[full_results['Dataset'] == i]

In [47]:
subset

Unnamed: 0,model_name,R2,MSE,RMSE,MAE,MAPE,Dataset
0,ElasticNet,0.006162,474.569322,21.784612,17.282933,869.079029,Validation
0,Ridge,0.039936,458.441514,21.411247,17.038532,868.191846,Validation
0,Lasso,0.037193,459.751563,21.441818,17.047462,868.689804,Validation
0,Random Forest,0.331565,319.185493,17.865763,13.016816,704.813992,Validation
0,Decision Tree,0.063559,447.161319,21.146189,16.843452,839.577848,Validation


In [48]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_metrics_comparison_plotly(full_results, metrics=["R2", "MSE", "RMSE", "MAE", "MAPE"]):
    fig = make_subplots(rows=2, cols=2, subplot_titles=metrics, shared_yaxes=True)

    for idx, metric in enumerate(metrics, 1):
        for dataset in full_results["Dataset"].unique():
            subset = full_results[full_results["Dataset"] == dataset]
            fig.add_trace(
                go.Bar(x=subset["model_name"], y=subset[metric], name=f"{dataset} - {metric}"),
                row=(idx-1)//2 + 1,
                col=(idx-1)%2 + 1
            )

    fig.update_layout(title_text="Comparison of Metrics Among Models", barmode='group')
    fig.show()

# Use the function
plot_metrics_comparison_plotly(full_results)


Exception: The (row, col) pair sent is out of range. Use Figure.print_grid to view the subplot grid. 

In [None]:
!pip install seaborn