In [None]:
import pandas as pd

# Input/output locations and the label shown in every figure title.
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them when running elsewhere.
input_filename = "C:\\Users\\Pc\\Downloads\\FR_CO2_2023.csv"
output_filename = "C:\\Users\\Pc\\Downloads\\CLEAN_FR_CO2_2023.csv"
# Spelled with the letter O ("CO2"), consistent with the file names above.
titre_graph = "FR_CO2_2023"

# Load the raw dataset.
df = pd.read_csv(input_filename)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Initial list of columns queued for deletion.
col_to_del = [
    'MMS',
    'Enedc (g/km)',
    'W (mm)',
    'At1 (mm)',
    'At2 (mm)',
    'Ernedc (g/km)',
    'De',
    'Vf',
]

In [None]:
# Columns to drop because they are too sparsely filled
# (to be confirmed; IT and Erwltp (g/km) could possibly be included as well).
col_to_del += ['RLFI']

In [None]:
# Columns to drop because they duplicate information held elsewhere.
col_to_del += ['Mp', 'Mh', 'Man', 'Cr', 'm (kg)', 'Fm']

In [None]:
# Columns to drop because they are not relevant
# (administrative fields or information about the dataset itself).
# 'Cr' is not repeated here: it is already queued with the redundant-information columns.
col_to_del.extend(('ID', 'Status', 'r', 'year', 'Tan', 'Va', 'Ve', 'Ct'))

In [None]:
# Remove the rows describing electric vehicles, and queue the column that
# only concerns those vehicles for deletion.
col_to_del.append('Electric range (km)')
df = df.loc[df['Ft'].ne('electric')]

In [None]:
# Drop every column queued in col_to_del.
df = df.drop(columns=col_to_del)

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.loc[~df.duplicated()]

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Step 1: exclude the rare fuel types
exclusions = ['lpg', 'e85', 'ng', 'hydrogen']
is_rare_fuel = df['Ft'].isin(exclusions)
df_filtered = df[~is_rare_fuel]

# Step 2: remove outliers PER fuel group
def remove_outliers_iqr_per_group(df, group_col, value_col):
    """Drop rows whose ``value_col`` falls outside the 1.5 * IQR fences of their group.

    Quartiles are computed separately for each group of ``group_col``; a row is
    kept when its value lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``
    (bounds included) for its own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    group_col : str
        Column used to form the groups.
    value_col : str
        Numeric column on which the fences are computed.

    Returns
    -------
    pandas.DataFrame
        The kept rows, ordered group by group (in ``groupby`` order), with a
        fresh 0..n-1 index. When no group exists, an empty frame with the same
        columns as ``df`` is returned.
    """
    kept = []
    for _, group in df.groupby(group_col):
        q1 = group[value_col].quantile(0.25)
        q3 = group[value_col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # between() is inclusive on both ends, i.e. lower <= value <= upper.
        kept.append(group[group[value_col].between(lower, upper)])

    if not kept:
        # pd.concat() refuses an empty list; keep the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, axis=0).reset_index(drop=True)

# Apply the per-fuel outlier filter to the emissions column.
df_no_outliers = remove_outliers_iqr_per_group(
    df=df_filtered,
    group_col='Ft',
    value_col='Ewltp (g/km)',
)

# Persist the cleaned dataset.
df_no_outliers.to_csv(output_filename, index=False)

# Step 3: final display, without drawing outlier markers
plt.figure(figsize=(10, 6))
sns.boxplot(x='Ft', y='Ewltp (g/km)', data=df_no_outliers, showfliers=False)
plt.title(f'Émissions selon le type de carburant (sans outliers visibles) — fichier : {titre_graph}')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Engine displacement vs emissions, one regression fit per fuel type.
sns.lmplot(
    x='ec (cm3)',
    y='Ewltp (g/km)',
    hue='Ft',
    data=df_no_outliers,
    aspect=1.5,
)
plt.title(f'Cylindrée vs Émissions par carburant — fichier : {titre_graph}')
plt.show()

In [None]:
# Emissions distribution for each value of the 'ech' column.
plt.figure(figsize=(12, 6))
sns.boxplot(x='ech', y='Ewltp (g/km)', data=df_no_outliers)
plt.title(f'Émissions selon les normes environnementales — fichier : {titre_graph}')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Specific power (power-to-weight ratio).
# Both operands must come from df_no_outliers: its index was reset by the
# outlier filter, so dividing by df['Mt'] would align rows on unrelated
# index labels and pair each power value with another vehicle's mass.
df_no_outliers['power_to_weight'] = df_no_outliers['ep (KW)'] / df_no_outliers['Mt']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_no_outliers, x='power_to_weight', y='Ewltp (g/km)', hue='Ft')
plt.title(f'Puissance spécifique vs Émissions — fichier : {titre_graph}')
plt.show()

In [None]:
# Five highest-emitting and five lowest-emitting rows.
top5_polluants = df_no_outliers.sort_values('Ewltp (g/km)', ascending=False).iloc[:5]
top5_sobres = df_no_outliers.sort_values('Ewltp (g/km)').iloc[:5]

colonnes_top = ['Mk', 'Cn', 'Ewltp (g/km)', 'Ft']
print(titre_graph)
print("Top 5 véhicules les plus émetteurs :")
print(top5_polluants[colonnes_top])
print("\nTop 5 véhicules les plus sobres :")
print(top5_sobres[colonnes_top])

In [None]:
# Mean emissions per make ('Mk'), sorted ascending; plot the ten lowest.
mean_emissions_by_brand = (
    df_no_outliers
    .groupby('Mk')['Ewltp (g/km)']
    .mean()
    .sort_values()
)
mean_emissions_by_brand.iloc[:10].plot(
    kind='barh',
    figsize=(10, 6),
    title=f'Top 10 Marques les plus sobres — fichier : {titre_graph}',
)
plt.xlabel('Ewltp (g/km)')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Strip surrounding whitespace from the column names before selecting by name.
df_no_outliers.columns = df_no_outliers.columns.str.strip()

# Fuel consumption vs emissions, coloured by fuel type.
sns.scatterplot(
    x='Fuel consumption',
    y='Ewltp (g/km)',
    hue='Ft',
    data=df_no_outliers,
)
plt.xlabel('Consommation (L/100km)')
plt.ylabel('Émissions CO2 (g/km)')
plt.title(f'Lien entre consommation et émissions — fichier : {titre_graph}')
plt.show()

In [None]:
# Clean the column names (strip surrounding whitespace), in case it is still needed
df_no_outliers.columns = df_no_outliers.columns.str.strip()

# Add the specific power (power divided by mass)
df_no_outliers['power_to_weight'] = df_no_outliers['ep (KW)'].div(df_no_outliers['Mt'])

# Columns entering the correlation matrix
colonnes_corr = [
    'Ewltp (g/km)',
    'Fuel consumption',
    'ec (cm3)',
    'ep (KW)',
    'Mt',
    'power_to_weight',
]
df_no_outliers_corr = df_no_outliers[colonnes_corr].copy()

# Correlation matrix
correlation_matrix = df_no_outliers_corr.corr()

# Heatmap display
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title(f"Matrice de corrélation des variables techniques — fichier : {titre_graph}")
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Work on a copy of df_no_outliers.
# NOTE: this rebinds the notebook-level name `df`, which until this cell held
# the cleaned raw data; re-running earlier cells afterwards will not behave the same.
df = df_no_outliers.copy()

# Make sure the numeric columns really are numeric (in case strings such as "N.A." remain).
# The power column is spelled 'ep (KW)' (upper-case KW), as in the rest of the notebook;
# with 'ep (kW)' the `if c in df.columns` guard silently skipped it.
num_cols_to_fix = ['Fuel consumption', 'Ewltp (g/km)', 'ec (cm3)', 'ep (KW)', 'Mt']
for c in num_cols_to_fix:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Recreate power_to_weight when it is missing and both inputs are available
if 'power_to_weight' not in df.columns and {'ep (KW)', 'Mt'}.issubset(df.columns):
    df['power_to_weight'] = df['ep (KW)'] / df['Mt']

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error

# --- 1. Data preparation ---
# Uses the notebook-level `df` prepared in a previous cell.
data1 = df[['Fuel consumption','Ewltp (g/km)']].dropna()

X = data1[['Fuel consumption']]
y = data1['Ewltp (g/km)']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- 2. Training ---
lin1 = LinearRegression().fit(X_train, y_train)

# --- 3. Predictions & metrics ---
y_pred = lin1.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("Coef (slope)       :", lin1.coef_[0])
print("Intercept          :", lin1.intercept_)
print("R² (test)          :", round(r2, 4))
print("RMSE (test, g/km)  :", round(rmse, 3))

# --- 4. Visualisation ---
plt.figure(figsize=(8,6))
plt.scatter(X_train, y_train, alpha=0.5, color="blue", label="Train")
plt.scatter(X_test, y_test, alpha=0.5, color="green", label="Test")

# 100 evenly spaced consumption values spanning the data, used to draw the fitted line
conso = X['Fuel consumption']
x_range = pd.DataFrame({'Fuel consumption': np.linspace(conso.min(), conso.max(), 100)})
y_pred_line = lin1.predict(x_range)

plt.plot(x_range, y_pred_line, color="red", linewidth=2, label="Droite de régression")

plt.title("Régression linéaire univariée : CO₂ ~ Consommation")
plt.xlabel("Fuel consumption (L/100km)")
plt.ylabel("Ewltp (g/km)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error

# --- 1. Data preparation ---
target = 'Ewltp (g/km)'
features = ['Fuel consumption', 'ec (cm3)', 'ep (KW)', 'Mt', 'power_to_weight']

# Keep only rows where every feature and the target are present
data2 = df_no_outliers[features + [target]].dropna()

X = data2[features]
y = data2[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- 2. Training ---
lin2 = LinearRegression().fit(X_train, y_train)

# --- 3. Predictions & metrics ---
y_pred = lin2.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("=== Résultats Régression Multivariée ===")
print("Coefficients :", {name: coef for name, coef in zip(features, lin2.coef_)})
print("Intercept    :", lin2.intercept_)
print("R² (test)    :", round(r2, 4))
print("RMSE (g/km)  :", round(rmse, 3))

# --- 4. Visualisation (actual vs predicted on the test set) ---
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, color="blue", alpha=0.5, label="Prédictions")
# Diagonal y = x over the target's range: where perfect predictions would fall
bornes = [y.min(), y.max()]
plt.plot(bornes, bornes, color="red", linestyle="--", linewidth=2, label="Idéal")
plt.title("Régression linéaire multivariée : réel vs prédit")
plt.xlabel("Valeurs réelles (Ewltp g/km)")
plt.ylabel("Valeurs prédites (Ewltp g/km)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error

# --- 1. Numeric + categorical variables ---
target = 'Ewltp (g/km)'
num_features = ['Fuel consumption', 'ec (cm3)', 'ep (KW)', 'Mt', 'power_to_weight']
cat_features = ['Ft', 'ech']   # fuel type + the 'ech' standard column

data3 = df_no_outliers[num_features + cat_features + [target]].dropna()

# One-hot encode the categorical columns (first level of each dropped)
data3_encoded = pd.get_dummies(data3, columns=cat_features, drop_first=True)

y = data3_encoded[target]
X = data3_encoded.drop(columns=[target])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- 2. Training ---
lin3 = LinearRegression().fit(X_train, y_train)

# --- 3. Predictions & metrics ---
y_pred = lin3.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("=== Résultats Régression Multivariée (avec catégorielles) ===")
print("R² (test)    :", round(r2, 4))
print("RMSE (g/km)  :", round(rmse, 3))

# Coefficients, ordered by decreasing absolute value
coefs = pd.DataFrame({"Variable": X.columns, "Coefficient": lin3.coef_})
coefs = coefs.sort_values(by="Coefficient", key=abs, ascending=False)

print("\nCoefficients les plus influents :")
print(coefs.head(15))

# --- 4. Visualisation ---
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred, color="blue", alpha=0.5, label="Prédictions")
# Diagonal y = x over the target's range: where perfect predictions would fall
bornes = [y.min(), y.max()]
plt.plot(bornes, bornes, color="red", linestyle="--", linewidth=2, label="Idéal")
plt.title("Régression multivariée (numériques + catégorielles)")
plt.xlabel("Valeurs réelles (Ewltp g/km)")
plt.ylabel("Valeurs prédites (Ewltp g/km)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# --- 1) Select the useful columns (avoids Country, Mk, VFN, dates, etc.) ---
target = 'Ewltp (g/km)'
num_features = ['Fuel consumption', 'ec (cm3)', 'ep (KW)', 'Mt', 'power_to_weight']
cat_features = ['Ft', 'ech']   # only these two categorical columns are encoded

# Coerce to numeric in case numeric values are still stored as strings
dfm = df_no_outliers[num_features + cat_features + [target]].copy()
for c in num_features + [target]:
    dfm[c] = pd.to_numeric(dfm[c], errors='coerce')

# Drop rows that are incomplete on any of these columns
dfm = dfm.dropna(subset=[target] + num_features + cat_features)

# --- 2) One-hot encoding of the selected categorical columns ---
df_encoded = pd.get_dummies(dfm, columns=cat_features, drop_first=True)

X = df_encoded.drop(columns=[target])
y = df_encoded[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3) Linear model ---
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

r2_lin = r2_score(y_test, y_pred_lin)
# RMSE computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and rejected by recent scikit-learn releases.
rmse_lin = float(np.sqrt(mean_squared_error(y_test, y_pred_lin)))

# --- 4) Random Forest ---
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = float(np.sqrt(mean_squared_error(y_test, y_pred_rf)))

print("=== Régression Linéaire ===")
print(f"R² (test): {r2_lin:.4f}")
print(f"RMSE (g/km): {rmse_lin:.3f}")

print("\n=== Random Forest ===")
print(f"R² (test): {r2_rf:.4f}")
print(f"RMSE (g/km): {rmse_rf:.3f}")

# --- 5) Side-by-side visualisation (actual vs predicted, both models) ---
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred_lin, alpha=0.4, label=f"Linéaire (R²={r2_lin:.3f})")
plt.scatter(y_test, y_pred_rf,  alpha=0.4, label=f"RandomForest (R²={r2_rf:.3f})")
# Diagonal y = x over the target's range: where perfect predictions would fall
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', label="Idéal")
plt.xlabel("Valeurs réelles (Ewltp g/km)")
plt.ylabel("Valeurs prédites (Ewltp g/km)")
plt.title("Comparaison : Linéaire vs RandomForest")
plt.legend()
plt.tight_layout()
plt.show()

# --- 6) Feature importances (Random Forest) ---
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nTop 15 features (RandomForest):")
print(imp.head(15))

plt.figure(figsize=(8,6))
# Reverse the order so the most important feature is drawn at the top of the barh plot
imp.head(15).iloc[::-1].plot(kind='barh')
plt.title("Importances de variables (RF) – Top 15")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1) Variable selection ---
features_cluster = ['Fuel consumption', 'Mt', 'ep (KW)']
df_cluster = df_no_outliers[features_cluster + ['Ewltp (g/km)']].dropna()

# --- 2) Standardise the data (important for K-Means) ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster[features_cluster])

# --- 3) Choosing the number of clusters (elbow method) ---
inertia = []
K_range = range(2, 10)  # k from 2 to 9 clusters
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.figure(figsize=(6,4))
plt.plot(K_range, inertia, 'o-')
plt.xlabel("Nombre de clusters (k)")
plt.ylabel("Inertie (within-cluster SSE)")
plt.title("Méthode du coude pour choisir k")
plt.show()

# --- 4) Apply K-Means with the chosen k (e.g. 3) ---
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_cluster['Cluster'] = kmeans.fit_predict(X_scaled)

# --- 5) 2D visualisation (fuel consumption vs power) ---
plt.figure(figsize=(7,5))
sns.scatterplot(data=df_cluster,
                x='Fuel consumption', y='ep (KW)',
                hue='Cluster', palette='Set1', alpha=0.6)
plt.title("Clusters de véhicules (conso vs puissance)")
plt.show()

# --- 6) Cluster analysis ---
# Named aggregation is used because a plain dict cannot hold both the mean and
# the count of 'Fuel consumption': a repeated key keeps only the last value, so
# the mean was silently replaced by the count.
cluster_summary = df_cluster.groupby('Cluster').agg(**{
    'Nb_vehicules': ('Fuel consumption', 'count'),
    'Fuel consumption': ('Fuel consumption', 'mean'),
    'Mt': ('Mt', 'mean'),
    'ep (KW)': ('ep (KW)', 'mean'),
    'Ewltp (g/km)': ('Ewltp (g/km)', 'mean'),
})

print("\nRésumé par cluster :")
print(cluster_summary)

In [None]:
# Si besoin: !pip install plotly==5.* scikit-learn

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.express as px

# --- 1) Prepare the data and (re)run KMeans ---
features3d = ['Fuel consumption', 'ep (KW)', 'Mt']
aux_cols   = ['Ewltp (g/km)', 'Ft', 'ech', 'Mk']  # only used for hover information, when available

colonnes_3d = features3d + [c for c in aux_cols if c in df_no_outliers.columns]
df3d = df_no_outliers[colonnes_3d].dropna().copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df3d[features3d])

k = 3  # adjust if another k was chosen
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
# Labels are stored as strings so that plotly treats the cluster as a category
df3d['Cluster'] = kmeans.fit_predict(X_scaled).astype(str)

# --- 2) Interactive 3D scatter ---
hover_cols = [c for c in ['Ewltp (g/km)', 'Ft', 'ech', 'Mk'] if c in df3d.columns]

fig = px.scatter_3d(
    df3d,
    x='Fuel consumption',
    y='ep (KW)',
    z='Mt',
    color='Cluster',
    hover_data=hover_cols,
    opacity=0.7,
    title="Clusters de véhicules (3D) — Consommation vs Puissance vs Masse",
)
fig.update_traces(marker=dict(size=3))
titres_axes = dict(
    xaxis_title="Fuel consumption (L/100km)",
    yaxis_title="Power (kW)",
    zaxis_title="Weight (kg)",
)
fig.update_layout(scene=titres_axes)
fig.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # nécessaire pour l'affichage 3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# --- 1) Data & clustering ---
features3d = ['Fuel consumption', 'ep (KW)', 'Mt']
df3d = df_no_outliers[features3d + ['Ewltp (g/km)']].dropna().copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df3d[features3d])

k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
df3d['Cluster'] = clusters

# (optional) sub-sample for readability when the dataset is very large
sample = min(20000, len(df3d))
if len(df3d) > sample:
    dfp = df3d.sample(sample, random_state=42)
else:
    dfp = df3d

# --- 2) Static 3D plot ---
fig = plt.figure(figsize=(9, 7))
ax = fig.add_subplot(111, projection='3d')

# simple palette, indexed by cluster id (wraps around if there are more clusters than colours)
palette = ['tab:blue', 'tab:green', 'tab:red', 'tab:orange', 'tab:purple']

for c in sorted(dfp['Cluster'].unique()):
    sub = dfp[dfp['Cluster'] == c]
    couleur = palette[c % len(palette)]
    ax.scatter(
        sub['Fuel consumption'], sub['ep (KW)'], sub['Mt'],
        color=couleur, label=f'Cluster {c}', s=8, alpha=0.7
    )

ax.set_title('Clusters de véhicules (3D) — Consommation vs Puissance vs Masse')
ax.set_xlabel('Fuel consumption (L/100km)')
ax.set_ylabel('Power (kW)')
ax.set_zlabel('Weight (kg)')

# viewing angle (adjust as needed)
ax.view_init(elev=22, azim=35)

ax.legend()
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from matplotlib.lines import Line2D

# --- Rebuild a DataFrame from X_scaled ---
# NOTE: relies on `X_scaled` and `kmeans` left in the kernel by a previously
# executed clustering cell. X_scaled is the output of StandardScaler, so the
# values plotted below are standardized scores, not physical units.
X_clustered = pd.DataFrame(
    X_scaled,
    columns=['Fuel consumption', 'ep (kW)', 'Mt']
)
X_clustered['Cluster'] = kmeans.labels_.astype(int)

# Cluster ids and a stable colour per cluster
cluster_ids = sorted(X_clustered['Cluster'].unique())
colors = {cid: col for cid, col in zip(cluster_ids, ['purple', 'green', 'red', 'orange', 'blue'])}

# --- Viewing angles (elevation, azimuth) ---
angles = [
    (20, 45),
    (35, 210),
    (10, 90),
    (40, 300),
]

# --- Plot: one 3D subplot per viewing angle ---
fig = plt.figure(figsize=(18, 12))

for i, (elev, azim) in enumerate(angles, 1):
    ax = fig.add_subplot(2, 2, i, projection='3d')

    # One scatter call per cluster, so that each cluster gets its own legend entry
    for cid in cluster_ids:
        sub = X_clustered[X_clustered['Cluster'] == cid]
        ax.scatter(
            sub['Fuel consumption'], sub['ep (kW)'], sub['Mt'],
            s=20, alpha=0.6, color=colors[cid], label=f"Cluster {cid}"
        )

    # The axes show standardized values, so the labels do not claim physical units
    ax.set_xlabel("Fuel consumption (z-score)")
    ax.set_ylabel("Power (z-score)")
    ax.set_zlabel("Weight (z-score)")
    ax.set_title(f"Vue {i} (elev={elev}, azim={azim})")
    ax.view_init(elev=elev, azim=azim)

    # Categorical legend (no colorbar)
    ax.legend(loc="best", frameon=True)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # nécessaire pour l'affichage 3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# --- 1) Data & clustering ---
features3d = ['Fuel consumption', 'ep (KW)', 'Mt']
df3d = df_no_outliers[features3d + ['Ewltp (g/km)']].dropna().copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df3d[features3d])

k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
df3d['Cluster'] = clusters

# (optional) sub-sample for readability when the dataset is very large
sample = min(20000, len(df3d))
if len(df3d) > sample:
    dfp = df3d.sample(sample, random_state=42)
else:
    dfp = df3d

# --- 2) Static 3D plot ---
fig = plt.figure(figsize=(9, 7))
ax = fig.add_subplot(111, projection='3d')

# simple palette, indexed by cluster id (wraps around if there are more clusters than colours)
palette = ['tab:blue', 'tab:green', 'tab:red', 'tab:orange', 'tab:purple']

for c in sorted(dfp['Cluster'].unique()):
    sub = dfp[dfp['Cluster'] == c]
    couleur = palette[c % len(palette)]
    ax.scatter(
        sub['Fuel consumption'], sub['ep (KW)'], sub['Mt'],
        color=couleur, label=f'Cluster {c}', s=8, alpha=0.7
    )

ax.set_title('Clusters de véhicules (3D) — Consommation vs Puissance vs Masse')
ax.set_xlabel('Fuel consumption (L/100km)')
ax.set_ylabel('Power (kW)')
ax.set_zlabel('Weight (kg)')

# viewing angle (adjust as needed)
ax.view_init(elev=10, azim=90)

ax.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns

# -------- 1) Build one single frame used for both clustering and plotting --------
features = ['Fuel consumption', 'ep (KW)', 'Mt']
dfc = df_no_outliers[features + ['Ewltp (g/km)']].dropna().copy()

# Standardise for KMeans training only (the plots use the unscaled columns of dfc)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(dfc[features])

# Clustering (fixed random seed)
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)

# Attach the labels to the unscaled frame
dfc['Cluster'] = labels.astype(int)

# Stable palette (same cluster -> colour mapping in 2D and 3D)
cluster_ids = sorted(dfc['Cluster'].unique())
palette = dict(zip(cluster_ids, ['tab:blue','tab:green','tab:red','tab:orange','tab:purple']))

# -------- 2) 2D plot (same points, same labels, same colours) --------
plt.figure(figsize=(7,5))
for cid in cluster_ids:
    sub = dfc[dfc['Cluster']==cid]
    plt.scatter(
        sub['Fuel consumption'], sub['ep (KW)'],
        color=palette[cid], label=f"Cluster {cid}", s=12, alpha=0.6
    )
plt.title("Clusters (2D) – conso vs puissance")
plt.xlabel("Fuel consumption (L/100km)")
plt.ylabel("Power (kW)")
plt.legend()
plt.tight_layout()
plt.show()

# -------- 3) 3D plot on the unscaled columns, same colours & labels --------
fig = plt.figure(figsize=(9,7))
ax = fig.add_subplot(111, projection='3d')
for cid in cluster_ids:
    sub = dfc[dfc['Cluster']==cid]
    ax.scatter(
        sub['Fuel consumption'], sub['ep (KW)'], sub['Mt'],
        color=palette[cid], label=f"Cluster {cid}", s=10, alpha=0.6
    )
ax.set_title("Clusters (3D) – conso vs puissance vs masse")
ax.set_xlabel("Fuel consumption (L/100km)")
ax.set_ylabel("Power (kW)")
ax.set_zlabel("Weight (kg)")
ax.view_init(elev=30, azim=210)  # adjust as needed
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# --- 1) Build the frame that will receive the cluster labels ---
# features used for clustering
features = ['Fuel consumption', 'ep (KW)', 'Mt']

# meta columns kept for the analysis (make, fuel, 'ech', emissions) when they exist
meta_cols = [c for c in ['Mk', 'Ft', 'ech', 'Ewltp (g/km)'] if c in df_no_outliers.columns]

# start from a copy holding the unscaled values
dfc = df_no_outliers[features + meta_cols].copy()

# safe numeric conversion (non-numeric entries become NaN)
for c in features + ['Ewltp (g/km)']:
    if c in dfc.columns:
        dfc[c] = pd.to_numeric(dfc[c], errors='coerce')

# single dropna on the features, followed by an index reset; the resulting
# frame is the one reused everywhere below
dfc = dfc.dropna(subset=features).reset_index(drop=True)

# --- 2) Cluster on standardised data, store the labels on the unscaled frame ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(dfc[features])

k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)
dfc['Cluster'] = labels.astype(int)

# --- 3) Quantitative summary + most frequent makes per cluster ---
def top_n(series, n=3):
    """Return the ``n`` most frequent non-missing values of ``series`` as a list."""
    return series.value_counts(dropna=True).index[:n].tolist()

groupers = {col: 'mean' for col in features}
if 'Ewltp (g/km)' in dfc.columns:
    groupers['Ewltp (g/km)'] = 'mean'

summary = dfc.groupby('Cluster').agg(groupers)
summary.insert(0, 'Nb véhicules', dfc.groupby('Cluster').size())

if 'Mk' in dfc.columns:
    summary['Top 3 marques'] = dfc.groupby('Cluster')['Mk'].apply(top_n, n=3)

print("=== Résumé par cluster ===")
print(summary)

# --- 4) 2D PCA to visualise the clusters ---
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

palette = dict(zip(
    sorted(dfc['Cluster'].unique()),
    ['tab:blue','tab:green','tab:red','tab:orange','tab:purple'],
))

plt.figure(figsize=(7,5))
for cid, col in palette.items():
    # dfc has a fresh 0..n-1 index, so this mask lines up with the rows of X_pca
    mask = (dfc['Cluster'] == cid)
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], color=col, label=f'Cluster {cid}', s=10, alpha=0.6)
plt.title('PCA (2D) des clusters')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# ==== data & features ====
dfm = df_no_outliers.copy()
num = ['Fuel consumption', 'ec (cm3)', 'ep (KW)', 'Mt', 'power_to_weight']
cat = [c for c in ['Ft', 'ech'] if c in dfm.columns]
target = 'Ewltp (g/km)'

# safe numeric conversions (non-numeric entries become NaN)
for c in num+[target]:
    if c in dfm.columns:
        dfm[c] = pd.to_numeric(dfm[c], errors='coerce')

dfm = dfm.dropna(subset=num+[target]+cat)

# ==== linear pipeline: scaled numerics + one-hot categoricals ====
num_tr = Pipeline([("scaler", StandardScaler())])
cat_tr = Pipeline([("oh", OneHotEncoder(handle_unknown="ignore"))])
pre = ColumnTransformer([("num", num_tr, num), ("cat", cat_tr, cat)])

X = dfm[num+cat]
y = dfm[target]
# The model is fitted on all rows (no train/test split in this cell)
model = Pipeline([("pre", pre), ("lin", LinearRegression())]).fit(X, y)

# ==== expected CO2 & sobriety ratio (actual / predicted) ====
dfm["CO2_prédit"] = model.predict(X)
dfm["sobriete"] = dfm[target] / dfm["CO2_prédit"]

# Ranking per model (make + commercial name when available, VFN otherwise)
group_key = [c for c in ['Mk','Cn'] if c in dfm.columns] or ['VFN']  # fallback
par_modele = dfm.groupby(group_key)
rank = par_modele[['sobriete', target, 'CO2_prédit']].mean()
rank = rank.assign(N=par_modele.size())
# keep only groups with at least 50 observations, lowest ratio first
rank = rank.query("N >= 50").sort_values('sobriete')

top_sobres   = rank.iloc[:10]
top_energiv  = rank.iloc[-10:]

print("\nTop 10 modèles les PLUS sobres (sobriété < 1):")
print(top_sobres)

print("\nTop 10 modèles les MOINS sobres (sobriété > 1):")
print(top_energiv)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ===================== 1) Prepare the data =====================
dfc = df_no_outliers.copy()

num = ['Fuel consumption', 'ec (cm3)', 'ep (KW)', 'Mt', 'power_to_weight']
cat = [c for c in ['Ft', 'ech'] if c in dfc.columns]
target_cont = 'Ewltp (g/km)'

# safe numeric conversions (non-numeric entries become NaN)
for c in num + [target_cont]:
    if c in dfc.columns:
        dfc[c] = pd.to_numeric(dfc[c], errors='coerce')

dfc = dfc.dropna(subset=num + cat + [target_cont]).copy()

# ===================== 2) Create the CO2 classes =====================
# Simple thresholds at 95 / 120 / 150 g/km.
# The last edge is +inf rather than the observed maximum: pd.cut requires
# strictly increasing edges, so a dataset whose maximum is <= 150 would
# otherwise raise an error.
bins = [0, 95, 120, 150, np.inf]
labels = ['A', 'B', 'C', 'D']
dfc['Classe_CO2'] = pd.cut(dfc[target_cont], bins=bins, labels=labels, include_lowest=True)

# Check the class balance
print("Répartition des classes :\n", dfc['Classe_CO2'].value_counts().sort_index(), "\n")

# ===================== 3) Split & pipeline =====================
X = dfc[num + cat]
y = dfc['Classe_CO2']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

num_tr = Pipeline([
    ("scaler", StandardScaler())
])
cat_tr = Pipeline([
    ("oh", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer([
    ("num", num_tr, num),
    ("cat", cat_tr, cat),
])

# ===================== 4) Models =====================
# 4a) Logistic Regression (baseline)
# `multi_class` is left at its default: "auto" was already the default value
# and passing the parameter explicitly is deprecated in recent scikit-learn.
log_clf = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced"))
])
log_clf.fit(X_train, y_train)
y_pred_log = log_clf.predict(X_test)

# 4b) RandomForestClassifier
rf_clf = Pipeline([
    ("pre", pre),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1
    ))
])
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# ===================== 5) Evaluation =====================
def eval_and_plot(y_true, y_pred, title):
    """Print accuracy, weighted F1 and the classification report, then show the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Reference class labels.
    y_pred : array-like
        Predicted class labels.
    title : str
        Model name used in the printed header and in the figure title.
    """
    acc = accuracy_score(y_true, y_pred)
    f1w = f1_score(y_true, y_pred, average="weighted")
    print(f"\n=== {title} ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"F1 (pondéré) : {f1w:.4f}")
    print(classification_report(y_true, y_pred, digits=3))
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    disp.ax_.set_title(f"Matrice de confusion — {title}")
    plt.tight_layout()
    plt.show()

eval_and_plot(y_test, y_pred_log, "Logistic Regression")
eval_and_plot(y_test, y_pred_rf, "RandomForestClassifier")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# ========= 0) Parameters =========
TARGET = "Fuel consumption"
# Note: the power column is spelled 'ep (KW)' (upper-case KW) in this frame.
num_cols = ["Mt", "ec (cm3)", "ep (KW)", "power_to_weight"]
cat_cols = [col for col in ("Ft", "ech") if col in df_no_outliers.columns]

# ========= 1) Copy & minimal cleaning =========
dfc = df_no_outliers.copy()

# Derive power_to_weight ('ep (KW)' divided by 'Mt') only when the column is
# missing and both inputs are available.
if "power_to_weight" not in dfc.columns and {"ep (KW)", "Mt"} <= set(dfc.columns):
    dfc["power_to_weight"] = (
        pd.to_numeric(dfc["ep (KW)"], errors="coerce")
        / pd.to_numeric(dfc["Mt"], errors="coerce")
    )

# Numeric features that actually exist in the frame.
num_used = [col for col in num_cols if col in dfc.columns]

# Coerce the target and the numeric features; unparseable values become NaN.
for col in [TARGET, *num_used]:
    dfc[col] = pd.to_numeric(dfc[col], errors="coerce")

# Keep only the modelling columns and drop every incomplete row.
all_needed = [TARGET, *num_used, *cat_cols]
dfc = dfc.loc[:, all_needed].dropna().copy()

print("Num features utilisées :", num_used)
print("Cat features utilisées :", cat_cols)
print("Taille après nettoyage :", dfc.shape)

# ========= 2) Split =========
X = dfc[num_used + cat_cols]
y = dfc[TARGET]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ========= 3) Shared preprocessing =========
# Standardise numeric features; one-hot encode categorical ones.
num_tr = Pipeline([("scaler", StandardScaler())])
cat_tr = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])

pre = ColumnTransformer([
    ("num", num_tr, num_used),
    ("cat", cat_tr, cat_cols),
])

# ========= 4) Models =========
lin_pipe = Pipeline([("pre", pre), ("model", LinearRegression())])

rf_estimator = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf_pipe = Pipeline([("pre", pre), ("model", rf_estimator)])

# ========= 5) Training =========
# Fit order is kept: linear model first, then the forest.
for _pipe in (lin_pipe, rf_pipe):
    _pipe.fit(X_train, y_train)

# ========= 6) Evaluation =========
def evaluate(name, pipe):
    """Score a fitted regression pipeline on the hold-out split and plot it.

    Reads the module-level ``X_test`` / ``y_test`` created by the split above.

    Parameters
    ----------
    name : str
        Label used in the printed header and in the figure title.
    pipe : fitted estimator
        Object exposing ``predict``; it must already be fitted.

    Returns
    -------
    tuple
        ``(y_pred, r2, rmse, mae)`` computed on the test split.
    """
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated (scikit-learn 1.4) and removed (1.6),
    # whereas this form gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    print(f"\n=== {name} ===")
    print(f"R²    : {r2:.4f}")
    print(f"RMSE  : {rmse:.3f} L/100km")
    print(f"MAE   : {mae:.3f} L/100km")
    # Scatter of actual vs predicted values
    plt.figure(figsize=(6,5))
    plt.scatter(y_test, y_pred, s=8, alpha=0.5)
    # Diagonal limits cover both the observed and the predicted range.
    lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    plt.plot(lims, lims, 'r--', lw=2, label='Idéal')
    plt.xlabel("Conso réelle (L/100km)")
    plt.ylabel("Conso prédite (L/100km)")
    plt.title(f"{name} — Réel vs Prédit")
    plt.legend()
    plt.tight_layout()
    plt.show()
    return y_pred, r2, rmse, mae

y_pred_lin, r2_lin, rmse_lin, mae_lin = evaluate("Régression linéaire", lin_pipe)
y_pred_rf,  r2_rf,  rmse_rf,  mae_rf  = evaluate("RandomForestRegressor", rf_pipe)

# ========= 7) Importances (RF) =========
# Recover the post-encoding feature names so the importances can be labelled.
fitted_pre = rf_pipe.named_steps["pre"]
try:
    feat_names = fitted_pre.get_feature_names_out()
except Exception:
    # Fallback for older scikit-learn releases: numeric names are unchanged,
    # categorical names come from the fitted one-hot encoder.
    num_names = num_used
    oh = fitted_pre.named_transformers_["cat"].named_steps["onehot"]
    if cat_cols:
        cat_names = oh.get_feature_names_out(cat_cols)
    else:
        cat_names = []
    feat_names = np.r_[num_names, cat_names]

importances = rf_pipe.named_steps["model"].feature_importances_
imp = pd.Series(importances, index=feat_names).sort_values(ascending=False)

top15 = imp.head(15)
print("\nTop 15 features importantes (RF) :")
print(top15)

# Reverse the order so the most important feature ends up at the top of the
# horizontal bar chart.
plt.figure(figsize=(7, 6))
top15.iloc[::-1].plot(kind="barh")
plt.title("Importances des variables (RandomForest) — Top 15")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# -------- 1) Data & columns --------
TARGET = "Fuel consumption"
num_cols = ["Mt", "ec (cm3)", "ep (KW)", "power_to_weight"]
cat_cols = [col for col in ("Ft", "ech") if col in df_no_outliers.columns]

dfc = df_no_outliers.copy()

# Create power_to_weight ('ep (KW)' / 'Mt') when it is absent and both
# source columns exist.
inputs_present = all(col in dfc.columns for col in ("ep (KW)", "Mt"))
if "power_to_weight" not in dfc.columns and inputs_present:
    power_kw = pd.to_numeric(dfc["ep (KW)"], errors="coerce")
    mt_values = pd.to_numeric(dfc["Mt"], errors="coerce")
    dfc["power_to_weight"] = power_kw / mt_values

# Robust numeric conversion: unparseable values become NaN; columns that are
# not in the frame are skipped.
for col in [TARGET, *num_cols]:
    if col not in dfc.columns:
        continue
    dfc[col] = pd.to_numeric(dfc[col], errors="coerce")

# Keep only the useful columns and drop missing values once.
use_num = [col for col in num_cols if col in dfc.columns]
dfc = dfc.loc[:, [TARGET, *use_num, *cat_cols]].dropna().copy()

X = dfc[use_num + cat_cols]
y = dfc[TARGET]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------- 2) Shared preprocessing --------
pre = ColumnTransformer([
    ("num", StandardScaler(), use_num),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# -------- 3) Models to compare (each wrapped in a Pipeline) --------
# Every regressor gets the same preprocessing step in front of it; the dict
# order (linear, forest, boosting) is preserved.
regressors = {
    "Linéaire": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}
models = {
    label: Pipeline(steps=[("pre", pre), ("model", regressor)])
    for label, regressor in regressors.items()
}

# Fit every pipeline, collect its test metrics and overlay its predictions
# on a single actual-vs-predicted scatter plot.
results = []
plt.figure(figsize=(7,6))

for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results.append([name, r2, rmse, mae])

    plt.scatter(y_test, y_pred, s=8, alpha=0.5, label=f"{name} (R²={r2:.3f})")

# Ideal diagonal (prediction == actual value) across the observed target range.
# Two end points are enough for a straight line, so this cell no longer needs
# numpy (which it never imported) and the former unused `lims` expression,
# which iterated over an empty list, is gone.
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'r--', label="Idéal")
plt.xlabel("Conso réelle (L/100km)")
plt.ylabel("Conso prédite (L/100km)")
plt.title("Comparaison des modèles (prétraitement identique)")
plt.legend()
plt.tight_layout()
plt.show()

# -------- 4) Summary table --------
# One row per model, best R² first.
df_results = pd.DataFrame(
    results,
    columns=["Modèle", "R²", "RMSE (L/100km)", "MAE (L/100km)"],
)
df_results = df_results.sort_values(by="R²", ascending=False)
print(df_results.to_string(index=False))

In [None]:
!pip install shap
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt

# Assumes the RandomForest pipeline has already been fitted in an earlier cell:
# rf_pipe = Pipeline([("pre", pre), ("model", RandomForestRegressor(...))])
# rf_pipe.fit(X_train, y_train)

# 1) Split the pipeline into its preprocessor and its model
preproc, rf = (rf_pipe.named_steps[step] for step in ("pre", "model"))

# 2) Turn X_test into the numeric matrix the model was trained on
X_test_trans = preproc.transform(X_test)

# 3) Recover the post-encoding feature names (for readable plots)
try:
    feature_names = preproc.get_feature_names_out()
except Exception:
    # Fallback for older scikit-learn releases: assemble the names from the
    # fitted sub-transformers.
    fitted_parts = preproc.named_transformers_
    num_names = fitted_parts["num"].get_feature_names_out()
    oh = fitted_parts["cat"].named_steps["onehot"]
    if oh is not None:
        cat_names = oh.get_feature_names_out(preproc.transformers_[1][2])
    else:
        cat_names = []
    feature_names = np.r_[num_names, cat_names]

# Densify the matrix when it exposes `toarray`, then label its columns.
if hasattr(X_test_trans, "toarray"):
    X_test_dense = X_test_trans.toarray()
else:
    X_test_dense = X_test_trans
X_test_trans_df = pd.DataFrame(X_test_dense, columns=feature_names)

# 4) SHAP explainer for tree-based models
explainer = shap.TreeExplainer(rf)

# Tip: sample at most 2000 rows for the plots when there are many observations
X_sample = X_test_trans_df.sample(n=min(2000, X_test_trans_df.shape[0]), random_state=42)
shap_values = explainer.shap_values(X_sample)

# 5) Global importance (bar plot), then
# 6) beeswarm (distribution of impacts, colour = feature value).
# Both use summary_plot with show=False so a title can be added before display.
for plot_options, plot_title in (
    ({"plot_type": "bar"}, "SHAP – Importance globale des variables (Random Forest)"),
    ({}, "SHAP – Distribution des contributions par variable"),
):
    shap.summary_plot(shap_values, X_sample, show=False, **plot_options)
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()

# 7) Local explanation of a single observation
i = 0  # change the index to explain another car
row_i = X_sample.iloc[[i]]
sv_i = explainer.shap_values(row_i)
shap.force_plot(explainer.expected_value, sv_i, row_i, matplotlib=True)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# ========= 1) Dataframe: technical specs only =========
df_tech = df_no_outliers.copy()  # cleaned frame produced earlier

# Create power_to_weight ('ep (KW)' / 'Mt') when it is absent
if "power_to_weight" not in df_tech.columns and {"ep (KW)", "Mt"} <= set(df_tech.columns):
    df_tech["power_to_weight"] = pd.to_numeric(df_tech["ep (KW)"], errors="coerce").div(
        pd.to_numeric(df_tech["Mt"], errors="coerce")
    )

target = "Fuel consumption"
num_cols = [
    col
    for col in ("Mt", "ec (cm3)", "ep (KW)", "power_to_weight")
    if col in df_tech.columns
]

# Robust numeric conversion: unparseable values become NaN
for col in (*num_cols, target):
    if col in df_tech.columns:
        df_tech[col] = pd.to_numeric(df_tech[col], errors="coerce")

# Only rows whose TARGET is NaN are removed here; missing feature values are
# left in place and imputed later inside the pipelines.
df_tech = df_tech[df_tech[target].notna()].reset_index(drop=True)

X = df_tech[num_cols]
y = df_tech[target]

print("Colonnes utilisées :", num_cols)
print("NaN restants dans X (seront imputés):\n", X.isna().sum(), "\n")

# ========= 2) Split =========
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ========= 3) Preprocessing =========
# Linear model: median imputation followed by standardisation
lin_numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
pre_lin = ColumnTransformer(
    transformers=[("num", lin_numeric, num_cols)],
    remainder="drop",
)

# Random forest: median imputation only (no scaling step)
rf_numeric = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
pre_rf = ColumnTransformer(
    transformers=[("num", rf_numeric, num_cols)],
    remainder="drop",
)

# ========= 4) Models =========
lin_pipe = Pipeline(steps=[("pre", pre_lin), ("model", LinearRegression())])

rf_pipe = Pipeline(steps=[
    ("pre", pre_rf),
    ("model", RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)),
])

# ========= 5) Training + evaluation =========
def eval_model(name, pipe):
    """Fit ``pipe`` on the training split and score it on the test split.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    name : str
        Label printed in front of the metrics.
    pipe : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(y_pred, r2, rmse, mae)`` computed on the test split.
    """
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    # Same three metrics, evaluated in the same order: R², RMSE, MAE.
    r2, rmse, mae = (
        metric(y_test, predictions)
        for metric in (r2_score, root_mean_squared_error, mean_absolute_error)
    )
    print(f"{name} -> R²={r2:.3f} | RMSE={rmse:.3f} L/100km | MAE={mae:.3f} L/100km")
    return predictions, r2, rmse, mae


y_pred_lin, r2_lin, rmse_lin, mae_lin = eval_model(name="Régression linéaire (tech only)", pipe=lin_pipe)
y_pred_rf, r2_rf, rmse_rf, mae_rf = eval_model(name="RandomForest (tech only)", pipe=rf_pipe)

# ========= 6) Comparative scatter =========
plt.figure(figsize=(7, 6))
for preds, legend in (
    (y_pred_lin, f"Linéaire (R²={r2_lin:.3f})"),
    (y_pred_rf, f"RandomForest (R²={r2_rf:.3f})"),
):
    plt.scatter(y_test, preds, s=8, alpha=0.5, label=legend)

# Diagonal limits span the actual values and both sets of predictions.
series_to_cover = (y_test, y_pred_lin, y_pred_rf)
lims = [
    min(values.min() for values in series_to_cover),
    max(values.max() for values in series_to_cover),
]
plt.plot(lims, lims, "r--", label="Idéal")
plt.xlabel("Conso réelle (L/100km)")
plt.ylabel("Conso prédite (L/100km)")
plt.title("Conso prédite — Specs techniques uniquement (imputation médiane)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
!pip install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# =============== 1) Data: technical specs only ===============
dfx = df_no_outliers.copy()

# Add power_to_weight ('ep (KW)' / 'Mt') when it is absent
if "power_to_weight" not in dfx.columns and all(
    col in dfx.columns for col in ("ep (KW)", "Mt")
):
    power_kw = pd.to_numeric(dfx["ep (KW)"], errors="coerce")
    mt_values = pd.to_numeric(dfx["Mt"], errors="coerce")
    dfx["power_to_weight"] = power_kw / mt_values

target = "Fuel consumption"
num_cols = [
    col
    for col in ("Mt", "ec (cm3)", "ep (KW)", "power_to_weight")
    if col in dfx.columns
]

# Numeric conversion: unparseable values become NaN
for col in [*num_cols, target]:
    dfx[col] = pd.to_numeric(dfx[col], errors="coerce")

# Keep only rows with a known target (features are imputed in the pipelines)
dfx = dfx.dropna(subset=[target]).reset_index(drop=True)

X, y = dfx[num_cols], dfx[target]

print("Features utilisées :", num_cols, "| Taille :", X.shape)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shared preprocessing for the tree models: median imputation
preproc = ColumnTransformer(
    transformers=[("num", SimpleImputer(strategy="median"), num_cols)],
    remainder="drop",
)

# =============== 2) Models ===============
# Each model is wrapped in a Pipeline that starts with the shared imputer.
models = {
    "RandomForest": Pipeline([
        ("pre", preproc),
        ("model", RandomForestRegressor(
            n_estimators=500, max_depth=None,
            random_state=42, n_jobs=-1
        ))
    ]),
    "GradientBoosting": Pipeline([
        ("pre", preproc),
        ("model", GradientBoostingRegressor(
            n_estimators=500, learning_rate=0.05,
            max_depth=3, subsample=0.9, random_state=42
        ))
    ])
}

# XGBoost is optional: it is added only when the package can be imported.
# Only the import is guarded, so a mistake in the model definition below is
# not silently reported as "package missing".
try:
    from xgboost import XGBRegressor
except Exception as exc:  # kept broad, as before: any import-time failure disables XGBoost
    print("⚠️ XGBoost non disponible. Pour l’installer :  pip install xgboost")
    # Show the real cause instead of discarding it.
    print(f"   Cause : {type(exc).__name__}: {exc}")
    has_xgb = False
else:
    models["XGBoost"] = Pipeline([
        ("pre", preproc),
        ("model", XGBRegressor(
            n_estimators=800, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            objective="reg:squarederror", random_state=42, n_jobs=-1,
            tree_method="hist"  # fast histogram-based tree construction
        ))
    ])
    has_xgb = True

# =============== 3) Training, evaluation, plots ===============
# Fit every pipeline, collect its test metrics and overlay its predictions
# on a single actual-vs-predicted scatter plot.
results = []
plt.figure(figsize=(7,6))

for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    r2   = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    results.append([name, r2, rmse, mae])

    plt.scatter(y_test, y_pred, s=8, alpha=0.5, label=f"{name} (R²={r2:.3f})")

# Ideal diagonal (prediction == actual value) across the observed target range.
# The previous `lims` expression mixed RMSE values with target values and was
# never used; the limits are now simply the target range and drive the line.
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'r--', label="Idéal")

plt.xlabel("Conso réelle (L/100km)")
plt.ylabel("Conso prédite (L/100km)")
plt.title("Comparaison RF vs GradientBoosting vs XGBoost (spécs techniques)")
plt.legend()
plt.tight_layout()
plt.show()

# Summary table: one row per model, best R² first
df_res = pd.DataFrame(results, columns=["Modèle","R²","RMSE (L/100km)","MAE (L/100km)"]).sort_values("R²", ascending=False)
print(df_res.to_string(index=False))

# =============== 4) Feature importances of the best model ===============
# df_res is sorted by R² in descending order, so its first row is the best model.
best_name = df_res["Modèle"].iloc[0]
best_pipe = models[best_name]
best_model = best_pipe.named_steps["model"]

# Test features after the preprocessing (imputation) step
Xt = best_pipe.named_steps["pre"].transform(X_test)
feat_names = num_cols  # no categorical encoding here, so the names are unchanged

# Plot the importances only when the model exposes them natively
if not hasattr(best_model, "feature_importances_"):
    print(f"(Pas d'importances natives pour {best_name})")
else:
    importances = pd.Series(
        best_model.feature_importances_,
        index=feat_names,
    ).sort_values(ascending=True)
    plt.figure(figsize=(6, 4))
    importances.plot(kind="barh")
    plt.title(f"Importances — {best_name}")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.show()