In [25]:
import pandas as pd

df = pd.read_csv('../datasets/processed_data.csv')

In [26]:
from sklearn.model_selection import train_test_split
# Separazione in features e target
X = df.drop(columns=['prezzo'])
y = df['prezzo']

# Suddivisione in training e test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print('Valori mancanti train set: ' + str(X_train.isnull().sum().sum()))
print('Valori mancanti test set: ' + str(X_test.isnull().sum().sum()))

Valori mancanti train set: 80459
Valori mancanti test set: 34073


In [27]:
def data_preparation_train(X_train, y_train):
    calculate_modes_and_means(X_train)
    print("Shape before data_imputation:", X_train.shape[0])
    print("Shape of y_train:", y_train.shape[0])
    X_train = data_imputation(X_train)
    print("Shape after data_imputation:", X_train.shape[0])
    X_train = feature_engineer(X_train)
    print("Shape after feature_engineer:", X_train.shape[0])
    y_train = y_train[X_train.index]
    X_train = encode_train(X_train, y_train)
    X_train = normalize_train(X_train)
    #X_train = remove_outliers(X_train)
    X_train = select_features_train(X_train)

    return X_train

In [28]:
def data_preparation_test(X_test):
    X_test = data_imputation(X_test)
    print("Shape before data_imputation:", X_test.shape[0])
    X_test = data_imputation(X_test)
    print("Shape after data_imputation:", X_test.shape[0])
    X_test = feature_engineer(X_test)
    print("Shape after feature_engineer:", X_test.shape[0])
    X_test = encode_test(X_test)
    X_test = normalize_test(X_test)
    X_test = select_features_test(X_test)

    return X_test

In [29]:
group_modes = {}
mean_condizione_by_anno = {}
mean_chilometraggio_by_anno = {}
mean_chilometraggio_by_eta = {}
mean_condizione_by_eta = {}
overall_modes = {}
overall_condizione_mean = None
overall_chilometraggio_mean = None

In [30]:
def calculate_modes_and_means(X_train):
    global group_modes, mean_condizione_by_anno, mean_chilometraggio_by_anno, mean_chilometraggio_by_eta, mean_condizione_by_eta, overall_modes, overall_condizione_mean, overall_chilometraggio_mean

    X = X_train.copy()
    group_modes['carrozzeria'] = X.groupby('modello')['carrozzeria'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
    group_modes['trasmissione'] = X.groupby('modello')['trasmissione'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
    group_modes['colorazione'] = X.groupby('marca')['colorazione'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
    group_modes['colore interni'] = X.groupby('marca')['colore interni'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
    group_modes['modello'] = X.groupby('marca')['modello'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
    group_modes['marca'] = X.groupby('modello')['marca'].agg(
        lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()

    # Calcola le medie per anno
    mean_condizione_by_anno = X.groupby('anno produzione')['condizione'].mean().to_dict()
    mean_chilometraggio_by_anno = X.groupby('anno produzione')['chilometraggio'].mean().to_dict()

    mean_condizione_by_eta = {
        2015 - anno: media for anno, media in mean_condizione_by_anno.items()
    }

    mean_chilometraggio_by_eta = {
        2015 - anno: media for anno, media in mean_chilometraggio_by_anno.items()
    }

    # Calcola le mode complessive e le medie dopo imputazione
    overall_modes = {
        'carrozzeria': X['carrozzeria'].mode()[0],
        'trasmissione': X['trasmissione'].mode()[0],
        'colorazione': X['colorazione'].mode()[0],
        'colore interni': X['colore interni'].mode()[0],
        'modello': X['modello'].mode()[0],
        'marca': X['marca'].mode()[0]
    }

    # Calcola le medie complessive dopo imputazione per anno
    temp_condizione = X['condizione'].fillna(X['anno produzione'].map(mean_condizione_by_anno))
    overall_condizione_mean = temp_condizione.mean()

    temp_chilometraggio = X['chilometraggio'].fillna(X['anno produzione'].map(mean_chilometraggio_by_anno))
    overall_chilometraggio_mean = temp_chilometraggio.mean()

In [31]:
def data_imputation(X_train):
    global group_modes, mean_condizione_by_anno, mean_chilometraggio_by_anno, mean_chilometraggio_by_eta, mean_condizione_by_eta, overall_modes, overall_condizione_mean, overall_chilometraggio_mean

    X = X_train.copy()

    # Drop delle righe dove sia "Marca" che "Modello" sono null
    X = X.dropna(subset=["marca", "modello"], how="all")


    if 'trasmissione' in X.columns:
        X['trasmissione'] = X['trasmissione'].fillna(X['modello'].map(group_modes['trasmissione'])).fillna(
            overall_modes['trasmissione'])

    # Applica l'imputazione basata sui gruppi
    X['marca'] = X['marca'].fillna(X['marca'].map(group_modes['marca'])).fillna(overall_modes['marca'])
    X['modello'] = X['modello'].fillna(X['modello'].map(group_modes['modello'])).fillna(overall_modes['modello'])

    X['carrozzeria'] = X['carrozzeria'].fillna(X['modello'].map(group_modes['carrozzeria'])).fillna(overall_modes['carrozzeria'])

    X['colorazione'] = X['colorazione'].fillna(X['marca'].map(group_modes['colorazione'])).fillna(overall_modes['colorazione'])
    X['colore interni'] = X['colore interni'].fillna(X['marca'].map(group_modes['colore interni'])).fillna(overall_modes['colore interni'])

    X['allestimento'] = X['allestimento'].fillna('base')

    if('anno produzione' not in X.columns):
        # Imputa condizione e chilometraggio
        X['condizione'] = X['condizione'].fillna(X['età'].map(mean_condizione_by_eta)).fillna(overall_condizione_mean)
        X['chilometraggio'] = X['chilometraggio'].fillna(X['età'].map(mean_chilometraggio_by_eta)).fillna(overall_chilometraggio_mean)
    else:
        X['condizione'] = X['condizione'].fillna(X['anno produzione'].map(mean_condizione_by_anno)).fillna(
            overall_condizione_mean)
        X['chilometraggio'] = X['chilometraggio'].fillna(X['anno produzione'].map(mean_chilometraggio_by_anno)).fillna(
            overall_chilometraggio_mean)

    print("Duplicati prima eventuale drop")
    print(X.duplicated().sum().sum())
    X.drop_duplicates( inplace=True)
    print("Duplicati dopo eventuale drop")
    print(X.duplicated().sum().sum())

    return X

In [32]:
def feature_engineer(data):
    anno_rif = 2015
    data['età'] = anno_rif - data['anno produzione']
    data.drop(columns=['anno produzione'], inplace=True)

    return data

In [33]:
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

label_encoder = LabelEncoder()
catboost_encoder = ce.TargetEncoder(smoothing=15)
marca_means = None
global_mean = None
known_makes = None
known_models = None
cat_cols = None

In [34]:
def encode_train(X_train, y_train):
    global marca_means, global_mean, known_makes, known_models, cat_cols, label_encoder, catboost_encoder

    if 'trasmissione' in X_train.columns:
        #Codifica 'trasmissione' con LabelEncoder
        X_train['trasmissione'] = label_encoder.fit_transform(X_train['trasmissione'])

    # Colonne categoriche da codificare con CatBoostEncoder
    cat_cols = [col for col in X_train.select_dtypes(include=['object', 'category']).columns if col != 'trasmissione']

    # Fit CatBoostEncoder
    encoded = catboost_encoder.fit_transform(X_train[cat_cols], y_train)

    # Prepara sostituzioni per modelli non visti
    marca_means = X_train.join(y_train.rename('target')).groupby('marca')['target'].mean()
    global_mean = y_train.mean()
    known_makes = X_train['marca'].unique()
    known_models = X_train['modello'].unique()

    X_train[cat_cols] = encoded
    return X_train

In [35]:
def encode_test(X_test):
    global marca_means, global_mean, known_makes, known_models, cat_cols, label_encoder, catboost_encoder

    X = X_test.copy()

    if 'trasmissione' in X.columns:
        # Applica LabelEncoder a 'trasmissione'
        X['trasmissione'] = label_encoder.transform(X['trasmissione'])

    # Applica CatBoostEncoder alle altre colonne categoriche
    encoded = catboost_encoder.transform(X[cat_cols])

    # Gestisci modelli non presenti nel training set
    new_models_mask = ~X['modello'].isin(known_models)
    if new_models_mask.any():
        replacements = X.loc[new_models_mask, 'marca'].map(marca_means).fillna(global_mean)
        encoded.loc[new_models_mask, 'modello'] = replacements.values

    X[cat_cols] = encoded
    return X

In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_cols = None

In [37]:
def normalize_train(X_train):
    global scaler, numeric_cols

    numeric_cols = X_train.select_dtypes(include=['number']).columns
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

    return X_train

In [38]:
def normalize_test(X_test):
    global scaler, numeric_cols

    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

    return X_test

In [39]:
# def remove_outliers(X_train):
#     threshold = 3
#     numeric_cols = X_train.select_dtypes(include=['number']).columns
#     z_scores = (X_train[numeric_cols] - X_train[numeric_cols].mean()) / X_train[numeric_cols].std()
#     mask = (z_scores.abs() <= threshold).all(axis=1)
#     return X_train[mask]

In [40]:
threshold_variance = 0.1
features_to_keep = None

In [41]:
def select_features_train(X_train):
    global threshold_variance, features_to_keep
    variances = X_train.var()
    features_to_keep = variances[variances > threshold_variance].index.tolist()

    return X_train[features_to_keep]

In [42]:
def select_features_test(X_test):
    global features_to_keep

    return X_test[features_to_keep]

In [43]:
X_train = data_preparation_train(X_train, y_train)
y_train = y_train.loc[X_train.index]
X_test = data_preparation_test(X_test)
y_test = y_test.loc[X_test.index]

Shape before data_imputation: 376399
Shape of y_train: 376399
Duplicati prima eventuale drop
119
Duplicati dopo eventuale drop
0
Shape after data_imputation: 376280
Shape after feature_engineer: 376280
Duplicati prima eventuale drop
25
Duplicati dopo eventuale drop
0
Shape before data_imputation: 161290
Duplicati prima eventuale drop
0
Duplicati dopo eventuale drop
0
Shape after data_imputation: 161290
Shape after feature_engineer: 161290


In [44]:
X_train

Unnamed: 0,marca,modello,allestimento,carrozzeria,trasmissione,condizione,chilometraggio,colorazione,colore interni,età
370378,0.198867,-0.243119,-0.012623,-0.714301,-0.177937,0.616314,-0.816936,1.145138,0.785960,-1.000578
229362,1.609838,1.385847,-1.200588,1.128978,-0.177937,1.664729,-1.039917,-1.104849,0.785960,-0.732552
387435,-0.376516,-0.421358,-0.045251,-0.714301,-0.177937,-0.432101,-0.613041,0.081862,0.785960,-1.000578
285284,-0.511207,-0.096200,-1.200588,-0.714301,-0.177937,-0.432101,1.079015,-0.960529,0.785960,1.143635
145008,1.581548,0.838703,-0.045251,-0.714301,-0.177937,-1.480515,-0.937793,-0.960529,0.785960,-0.464525
...,...,...,...,...,...,...,...,...,...,...
110268,-0.423442,-0.261969,-0.045251,0.959498,-0.177937,-1.480515,1.125937,-0.960529,-1.165360,0.607582
259178,1.609838,-1.140939,2.194834,-0.714301,-0.177937,0.616314,-0.209167,-0.960529,0.785960,1.411662
365838,1.563948,0.727657,-0.012623,-0.714301,-0.177937,0.616314,-0.522702,0.707748,3.020166,-0.464525
131932,1.609838,0.968844,2.194834,-0.714301,-0.177937,0.616314,-0.851036,1.145138,0.785960,-0.732552


In [45]:
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

def random_forest_evaluation_depth(data, target_column='prezzo'):
    """
    Valuta l'effetto della profondità degli alberi (max_depth) su RandomForestRegressor.
    Traccia MAE e RMSE per training e validazione in funzione della profondità.

    Parametri:
      - data: DataFrame contenente i dati.
      - target_column: Nome della colonna target (default 'prezzo').
    """
    train_mae, val_mae = [], []
    train_rmse, val_rmse = [], []
    depths = range(1, 40)

    for depth in depths:
        # Inizializzazione e training del modello per la profondità corrente
        model = RandomForestRegressor(
            max_depth=depth,
            n_estimators=500,
            min_samples_split=10,
            min_samples_leaf=4,
            max_features='sqrt',
            n_jobs=-1,
            random_state=20,
            bootstrap=True,
            criterion="squared_error"
        )
        model.fit(X_train, y_train)

        # Predizioni
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calcolo delle metriche
        train_mae.append(mean_absolute_error(y_train, y_train_pred))
        val_mae.append(mean_absolute_error(y_test, y_test_pred))

        train_rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        val_rmse.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))

    # Tracciamento dei risultati per MAE
    plt.figure(figsize=(10, 6))
    plt.plot(depths, train_mae, label='Training MAE', marker='o', linestyle='--', color='blue')
    plt.plot(depths, val_mae, label='Validation MAE', marker='o', linestyle='-', color='red')
    plt.xlabel('Tree Depth')
    plt.ylabel('Mean Absolute Error (MAE)')
    plt.title('Training & Validation MAE vs. Tree Depth')
    plt.legend()
    plt.grid(True)
    plt.savefig("../diagrams/regression/mae_vs_depth.jpg")
    plt.show()

    # Tracciamento dei risultati per RMSE
    plt.figure(figsize=(10, 6))
    plt.plot(depths, train_rmse, label='Training RMSE', marker='o', linestyle='--', color='blue')
    plt.plot(depths, val_rmse, label='Validation RMSE', marker='o', linestyle='-', color='red')
    plt.xlabel('Tree Depth')
    plt.ylabel('Root Mean Squared Error (RMSE)')
    plt.title('Training & Validation RMSE vs. Tree Depth')
    plt.legend()
    plt.grid(True)
    plt.savefig("../diagrams/regression/rmse_vs_depth.jpg")
    plt.show()

# Esegui la funzione
random_forest_evaluation_depth(df)


KeyboardInterrupt: 

In [29]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    max_depth=25,
    n_estimators=500,
    min_samples_split=10,
    max_features='sqrt',
    min_samples_leaf=4,
    n_jobs=-1,
    random_state=42,
    bootstrap=True
)

model.fit(X_train, y_train)

In [30]:
from sklearn.metrics import *
import math

# Calcolo delle metriche
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

rmse_train = math.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = math.sqrt(mean_squared_error(y_test, y_test_pred))

mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)

# Creazione di un DataFrame con etichette chiare
metrics = pd.DataFrame({
    "MAE": [mae_train, mae_test],
    "RMSE": [rmse_train, rmse_test],
    "MAPE": [mape_train, mape_test],
}, index=["Train", "Test"])

metrics

Unnamed: 0,MAE,RMSE,MAPE
Train,1209.178208,1975.232011,0.132934
Test,1438.835982,2399.818098,0.161805
