# ElasticNet

### 1. Introdução ao ElasticNet

### 2. 

In [1]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from optuna import create_study
from optuna.pruners import HyperbandPruner
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../dataset_processing/xtb_dataset.csv')
df

Unnamed: 0,Dipole,E_HOMO,E_LUMO,gap_HOMO-LUMO,ZPE,H,U,U0,G,Delta
0,0.727,-10.6203,-2.7950,-7.8253,0.154214,-27.917739,-27.918683,-28.080950,-27.959630,-12.397982
1,1.905,-10.4336,-2.2949,-8.1387,0.199866,-28.104404,-28.105348,-28.313524,-28.146070,-12.165407
2,4.475,-10.5746,-6.9494,-3.6252,0.120064,-27.272560,-27.273504,-27.400818,-27.312535,-13.078113
3,2.100,-9.8173,-5.5314,-4.2859,0.177677,-27.068399,-27.069343,-27.254803,-27.108720,-13.224129
4,0.872,-9.9722,-6.6172,-3.3550,0.133244,-24.148876,-24.149821,-24.292155,-24.192788,-16.186776
...,...,...,...,...,...,...,...,...,...,...
129152,4.710,-11.3840,-7.8335,-3.5505,0.121817,-26.808903,-26.809847,-26.939965,-26.851229,-13.538966
129153,4.996,-10.8793,-7.1097,-3.7696,0.141742,-28.278349,-28.279293,-28.428099,-28.317523,-12.050832
129154,3.973,-10.9497,-7.7346,-3.2151,0.124894,-24.819326,-24.820270,-24.952019,-24.858199,-15.526912
129155,0.918,-10.2869,-1.8611,-8.4258,0.195977,-30.002576,-30.003520,-30.208931,-30.047056,-10.270000


In [3]:
X = df.drop("Delta", axis=1)
y = df["Delta"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=27)

In [4]:
def inst_elasticnet(trial):
    parametros = {
        'alpha': trial.suggest_float('alpha', 1e-5, 1e3, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0, 1),
        'fit_intercept': trial.suggest_categorical("fit_intercept", [True, False]),
        'positive': trial.suggest_categorical("positive", [False, True]),
        'max_iter': trial.suggest_int('max_iter', 500, 20000),
        'tol': trial.suggest_float('tol', 1e-8, 1e-3, log=True),
        'selection': trial.suggest_categorical('selection', ['cyclic', 'random'])
    }

    normalizar = trial.suggest_categorical('normalizar', [None, 'stds' 'pca', 'limiar', 'rfe'])
    if normalizar == 'stds':
        modelo = make_pipeline(StandardScaler(), ElasticNet(**parametros))
    elif normalizar == "pca":
        # Definindo o número de componentes a serem mantidas
        components = trial.suggest_int("pca_components", 2, 50)
        modelo = make_pipeline(
            StandardScaler(),
            PCA(components),
            ElasticNet(**parametros))
    elif normalizar == "limiar":
        # Definindo o Limiar a ser utilizado
        threshold = trial.suggest_float("variance_threshold", 0, 0.1)
        modelo = make_pipeline(
            StandardScaler(),
            VarianceThreshold(threshold),
            ElasticNet(**parametros)
        )
    elif normalizar == "rfe": 
        # Criando o modelo para parametro do RFE
        estimator = RandomForestRegressor()
        # Definindo o número de atributos a serem mantidos
        n_features_to_select = trial.suggest_int("rfe_features", 2, 50)
        modelo = make_pipeline(
            StandardScaler(),
            RFE(estimator=estimator, n_features_to_select=n_features_to_select),
            ElasticNet(**parametros)
        )
    else:
        modelo = ElasticNet(**parametros)

    return modelo

In [5]:
def funcao_objetivo(trial, X, y, num_folds, instanciador):
    modelo = instanciador(trial)

    metricas = -cross_val_score(
        modelo,
        X,
        y,
        scoring='neg_root_mean_squared_error',
        cv=num_folds,
        n_jobs=-1
    )
    return metricas.mean()


In [6]:
def rodar_optuna(nome_estudo, X, y, instanciador, num_folds=10, n_trials=100):
    study = create_study(
        direction='minimize',
        study_name=nome_estudo,
        storage=f'sqlite:///{nome_estudo}.db',
        load_if_exists=True,
        pruner=HyperbandPruner(min_resource=1, max_resource=100, reduction_factor=3)
    )

    def objetivo_parcial(trial):
        return funcao_objetivo(trial, X, y, num_folds, instanciador)

    study.optimize(objetivo_parcial, n_trials=n_trials)
    return study


In [7]:
study_en = rodar_optuna('elasticnet_xtb', X_train, y_train, inst_elasticnet)

[I 2025-10-25 16:46:07,455] A new study created in RDB with name: elasticnet_xtb
[I 2025-10-25 16:46:11,278] Trial 0 finished with value: 1.928424063704068 and parameters: {'alpha': 0.0015084426797705019, 'l1_ratio': 0.7766894898882084, 'fit_intercept': True, 'positive': True, 'max_iter': 18599, 'tol': 1.1201421196237564e-07, 'selection': 'random', 'normalizar': 'stdspca'}. Best is trial 0 with value: 1.928424063704068.
[I 2025-10-25 16:46:13,336] Trial 1 finished with value: 2.0309682675706826 and parameters: {'alpha': 0.006608637717050191, 'l1_ratio': 0.7257734930607228, 'fit_intercept': True, 'positive': True, 'max_iter': 4545, 'tol': 0.00040650799839273656, 'selection': 'cyclic', 'normalizar': 'stdspca'}. Best is trial 0 with value: 1.928424063704068.
[I 2025-10-25 16:46:13,571] Trial 2 finished with value: 0.1021674647212935 and parameters: {'alpha': 0.5721709648601488, 'l1_ratio': 0.17275760728374123, 'fit_intercept': True, 'positive': False, 'max_iter': 4132, 'tol': 3.2605701473

In [None]:
0.0002077568156952986