# Modélisation - Prédiction de Salaires

Régression linéaire pour prédire les salaires des employés.


## 1. Préparation des Données


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Silence warnings of every category for the rest of the session.
# NOTE(review): this keeps cell output compact but also hides any warning
# raised by the libraries used below.
warnings.filterwarnings('ignore')

# Load the raw dataset, then report its shape and the total number of
# missing cells summed over all columns.
df = pd.read_csv('data/employee_salaries.csv')
dataset_shape = df.shape
missing_cells = df.isnull().sum().sum()
print(f"Dataset : {dataset_shape}")
print(f"Valeurs manquantes : {missing_cells}")

# Trailing expression: shows the first rows when run as a notebook cell.
df.head()


In [None]:
# Impute missing values with each column's median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection `df[col]`: that chained in-place call
# acts on an intermediate object and leaves `df` untouched once pandas
# Copy-on-Write is active.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed further down.
for col in ('nb_projets', 'evaluation_performance'):
    df[col] = df[col].fillna(df[col].median())

# One-hot encode the categorical variables; drop_first=True removes the first
# level of each variable.
categorical_cols = [
    'niveau_education',
    'departement',
    'poste',
    'ville',
    'secteur',
    'taille_entreprise',
]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"Dataset encodé : {df_encoded.shape}")
# Trailing expression: shows the first encoded rows when run as a notebook cell.
df_encoded.head()


In [None]:
# Target column; every other column of the encoded frame is used as a feature.
target_col = 'salaire_annuel'
y = df_encoded[target_col]
X = df_encoded.drop(columns=target_col)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only and
# then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train : {X_train.shape}")
print(f"Test : {X_test.shape}")


## 2. Baseline - DummyRegressor


In [None]:
# Baseline model: DummyRegressor(strategy='mean') always predicts the mean of
# the training target, giving a reference point for the real models.
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train_scaled, y_train)
y_pred_dummy = dummy.predict(X_test_scaled)

# Test-set metrics (RMSE is the square root of the mean squared error).
rmse_dummy = np.sqrt(mean_squared_error(y_test, y_pred_dummy))
mae_dummy = mean_absolute_error(y_test, y_pred_dummy)
r2_dummy = r2_score(y_test, y_pred_dummy)

# Report. The euro sign and the superscript two are written as proper Unicode
# characters; they were previously mis-encoded in the output.
print("=" * 50)
print("BASELINE - DummyRegressor")
print("=" * 50)
print(f"RMSE : {rmse_dummy:,.2f} €")
print(f"MAE  : {mae_dummy:,.2f} €")
print(f"R²   : {r2_dummy:.4f}")


## 3. Régression Linéaire Simple


In [None]:
# Ordinary least-squares linear regression on the standardised features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Test-set metrics (RMSE is the square root of the mean squared error).
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Report. Accented capitals, the euro sign and the superscript two are written
# as proper Unicode characters; they were previously mis-encoded in the output.
print("=" * 50)
print("RÉGRESSION LINÉAIRE")
print("=" * 50)
print(f"RMSE : {rmse_lr:,.2f} €")
print(f"MAE  : {mae_lr:,.2f} €")
print(f"R²   : {r2_lr:.4f}")


## 4. Ridge Regression (L2)


In [None]:
# Tune the Ridge (L2) regularisation strength with a 5-fold grid search.
# The search maximises negative MSE, i.e. it minimises the MSE.
param_grid_ridge = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
ridge = Ridge()
grid_ridge = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error')
grid_ridge.fit(X_train_scaled, y_train)

# Best estimator found by the search, used to predict on the test set.
best_ridge = grid_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_test_scaled)

# Test-set metrics (RMSE is the square root of the mean squared error).
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Report. The euro sign and the superscript two are written as proper Unicode
# characters; they were previously mis-encoded in the output.
print("=" * 50)
print("RIDGE REGRESSION (L2)")
print("=" * 50)
print(f"Meilleur alpha : {grid_ridge.best_params_['alpha']}")
print(f"RMSE : {rmse_ridge:,.2f} €")
print(f"MAE  : {mae_ridge:,.2f} €")
print(f"R²   : {r2_ridge:.4f}")


## 5. Lasso Regression (L1)


In [None]:
# Tune the Lasso (L1) regularisation strength with a 5-fold grid search.
# The search maximises negative MSE, i.e. it minimises the MSE.
# max_iter is raised to 10000 to give the solver more iterations per fit.
param_grid_lasso = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}
lasso = Lasso(max_iter=10000)
grid_lasso = GridSearchCV(lasso, param_grid_lasso, cv=5, scoring='neg_mean_squared_error')
grid_lasso.fit(X_train_scaled, y_train)

# Best estimator found by the search, used to predict on the test set.
best_lasso = grid_lasso.best_estimator_
y_pred_lasso = best_lasso.predict(X_test_scaled)

# Test-set metrics (RMSE is the square root of the mean squared error).
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Report. Accented letters, the euro sign and the superscript two are written
# as proper Unicode characters; they were previously mis-encoded in the output.
print("=" * 50)
print("LASSO REGRESSION (L1)")
print("=" * 50)
print(f"Meilleur alpha : {grid_lasso.best_params_['alpha']}")
print(f"RMSE : {rmse_lasso:,.2f} €")
print(f"MAE  : {mae_lasso:,.2f} €")
print(f"R²   : {r2_lasso:.4f}")

# Number of features kept by the L1 penalty, i.e. with a non-zero coefficient.
n_features_selected = np.sum(best_lasso.coef_ != 0)
print(f"\nFeatures sélectionnées : {n_features_selected}/{len(X.columns)}")


## 6. Comparaison Finale


In [None]:
# Comparison table: one row per model, one column per test-set metric.
# Column names and labels use proper Unicode characters (è, é, ², È); they
# were previously mis-encoded, which garbled the printed table.
resultats = pd.DataFrame({
    'Modèle': ['Baseline (Dummy)', 'Régression Linéaire', 'Ridge (L2)', 'Lasso (L1)'],
    'RMSE': [rmse_dummy, rmse_lr, rmse_ridge, rmse_lasso],
    'MAE': [mae_dummy, mae_lr, mae_ridge, mae_lasso],
    'R²': [r2_dummy, r2_lr, r2_ridge, r2_lasso],
})

print("=" * 80)
print("COMPARAISON DES MODÈLES")
print("=" * 80)
print(resultats.to_string(index=False))
print("=" * 80)

# Best model = row with the highest R² in the table above.
# NOTE(review): the ranking is based on test-set scores.
best_model_idx = resultats['R²'].idxmax()
best_model_name = resultats.loc[best_model_idx, 'Modèle']
best_r2 = resultats.loc[best_model_idx, 'R²']
print(f"\n🏆 Meilleur modèle : {best_model_name} (R² = {best_r2:.4f})")


In [None]:
# Predicted-vs-actual scatter plots, one panel per linear model.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# End points of the y = x reference line (perfect predictions); they depend
# only on y_test, so they are computed once and shared by all panels.
diag_bounds = [y_test.min(), y_test.max()]

# (title label, test-set predictions, test-set R²) for each panel, left to
# right. Labels use proper Unicode characters; they were previously
# mis-encoded.
panels = [
    ('Régression Linéaire', y_pred_lr, r2_lr),
    ('Ridge', y_pred_ridge, r2_ridge),
    ('Lasso', y_pred_lasso, r2_lasso),
]

for panel_ax, (panel_label, panel_pred, panel_r2) in zip(axes, panels):
    panel_ax.scatter(y_test, panel_pred, alpha=0.5)
    panel_ax.plot(diag_bounds, diag_bounds, 'r--', lw=2)
    panel_ax.set_xlabel('Valeurs Réelles')
    panel_ax.set_ylabel('Prédictions')
    panel_ax.set_title(f'{panel_label} (R²={panel_r2:.4f})')

plt.tight_layout()
plt.show()
