In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

df = pd.read_csv(r'C:\Users\fabri\Desktop\MarcoPatierno_DepositoCorsoPython\env\Giorno 19 29-04\esercitazione_house\kc_house_data.csv')

# Conversione data in datetime
df['date'] = df['date'].str.replace('T000000', '', regex=False)

df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
df["date"].head()

# Sostituzione dei valori di yr_renovated
df['yr_renovated'] = pd.cut(df['yr_renovated'], bins=[-1, 0, 1980, 2000, float('inf')], labels=[0, 1, 2, 3])

# Visualizza le prime righe per verificare la sostituzione
df['yr_renovated'].head()

X = df.drop(columns=["price", "date", "id"]) #, "lat", "long", "zipcode"
y = df["price"]

scaler = StandardScaler()   
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

In [2]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def elimina_variabili_vif_pvalue(X_train, y_train,
                                 vif_threshold=10.0,
                                 pvalue_threshold=0.05):
    # Se non è già un DataFrame, lo trasformo
    if not isinstance(X_train, pd.DataFrame):
        X_current = pd.DataFrame(
            X_train,
            columns=[f"x{i}" for i in range(X_train.shape[1])]
        )
    else:
        X_current = X_train.copy()

    while True:
        # Aggiungo costante (ora X_current è DataFrame)
        X_const = sm.add_constant(X_current)
        model   = sm.OLS(y_train, X_const).fit()

        # Prendo i p-value come Series e tolgo la costante
        pvalues = pd.Series(model.pvalues, index=X_const.columns)
        pvalues = pvalues.drop('const')

        # Calcolo il VIF
        vif = pd.DataFrame({
            "Feature": X_current.columns,
            "VIF": [
                variance_inflation_factor(X_current.values, i)
                for i in range(X_current.shape[1])
            ]
        })

        # Unisco VIF e p-value
        stats = vif.copy()
        stats["p-value"] = pvalues.values

        # Seleziono candidati con VIF e p-value alti
        candidates = stats[
            (stats["VIF"] > vif_threshold) &
            (stats["p-value"] > pvalue_threshold)
        ]
        if candidates.empty:
            print("\nNessuna variabile da eliminare. Selezione completata.")
            break

        # Rimuovo la feature peggiore
        worst = candidates.sort_values(by="VIF", ascending=False).iloc[0]
        feat = worst["Feature"]
        print(f"Rimuovo '{feat}' (VIF={worst['VIF']:.2f}, p-value={worst['p-value']:.4f})")

        X_current = X_current.drop(columns=[feat])

    print("\nFeature finali selezionate:")
    print(X_current.columns.tolist())
    return X_current


In [3]:
X_current = elimina_variabili_vif_pvalue(X_train, y_train, vif_threshold=10.0, pvalue_threshold=0.05)

  vif = 1. / (1. - r_squared_i)



Nessuna variabile da eliminare. Selezione completata.

Feature finali selezionate:
['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17']


In [11]:
import xgboost as xgb
from itertools import product

# --- Assumiamo che X_current e y_train siano già definiti ---
# dtrain = DMatrix dei tuoi dati
dtrain = xgb.DMatrix(X_current, label=y_train)

# --- Griglia di tutti gli iperparametri da esplorare ---
param_grid = {
    'alpha':          [0, 0.1, 0.3, 0.5, 0.7, 1],
    'lambda':         [0, 0.1, 0.3, 0.5, 0.7, 1],
    'max_depth':      [3, 4, 5, 6],
    'eta':            [0.01, 0.05, 0.1, 0.2],
    'subsample':      [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree':[0.7, 0.8, 0.9, 1.0]
}

best_params = {}
best_rmse = float("inf")

print("Stage unico: ottimizzazione completa su X_current, y_train")

# --- Ciclo su ogni combinazione di iperparametri ---
for alpha, lambda_val, max_depth, eta, subsample, colsample in product(
        param_grid['alpha'],
        param_grid['lambda'],
        param_grid['max_depth'],
        param_grid['eta'],
        param_grid['subsample'],
        param_grid['colsample_bytree']
    ):
    # Costruisco i parametri per questa iterazione
    params = {
        'objective':        'reg:squarederror',
        'alpha':            alpha,
        'lambda':           lambda_val,
        'max_depth':        max_depth,
        'eta':               eta,
        'subsample':        subsample,
        'colsample_bytree': colsample
    }

    # Eseguo la cross‐validation con early stopping
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=100,
        nfold=5,
        metrics="rmse",
        early_stopping_rounds=10,
        seed=42,
        verbose_eval=False
    )
    # Estraggo il valore minimo di test-rmse-mean
    mean_rmse = cv_results['test-rmse-mean'].min()

    # Stampo lo stato corrente (opzionale ma utile per monitorare)
    print(f"α={alpha:.1f}, λ={lambda_val:.1f}, depth={max_depth}, η={eta:.2f}, "
          f"sub={subsample}, col={colsample} → RMSE: {mean_rmse:.4f}")

    # Aggiorno il miglior risultato trovato
    if mean_rmse < best_rmse:
        best_rmse = mean_rmse
        best_params = {
            'alpha':            alpha,
            'lambda':           lambda_val,
            'max_depth':        max_depth,
            'eta':               eta,
            'subsample':        subsample,
            'colsample_bytree': colsample
        }

# --- Output dei migliori iperparametri ---
print("\nMigliori parametri trovati:")
for k, v in best_params.items():
    print(f"  • {k}: {v}")
print(f"Con RMSE = {best_rmse:.4f}")


Stage unico: ottimizzazione completa su X_current, y_train
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.7, col=0.7 → RMSE: 60.3890
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.7, col=0.8 → RMSE: 60.3273
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.7, col=0.9 → RMSE: 60.0812
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.7, col=1.0 → RMSE: 59.8904
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.8, col=0.7 → RMSE: 60.4859
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.8, col=0.8 → RMSE: 60.2801
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.8, col=0.9 → RMSE: 60.1921
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.8, col=1.0 → RMSE: 60.1197
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.9, col=0.7 → RMSE: 60.5695
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.9, col=0.8 → RMSE: 60.3906
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.9, col=0.9 → RMSE: 60.2182
α=0.0, λ=0.0, depth=3, η=0.01, sub=0.9, col=1.0 → RMSE: 60.1731
α=0.0, λ=0.0, depth=3, η=0.01, sub=1.0, col=0.7 → RMSE: 60.8071
α=0.0, λ=0.0, depth=3, η=0.01, sub=1.0, col=0.8 → RMSE: 60.7164
α=0.0, λ=0.0, depth=3, η=0.01, sub=1.0, col=0

KeyboardInterrupt: 