# Importer les bibliothèques de base

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress RuntimeWarnings and ConvergenceWarnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)


# Charger les données

In [None]:
df = pd.read_csv("dataset_revenu_marocains.csv") 
df = df.drop(['id','id_transaction', 'date_inscription','couleur_preferee', 'age_mois' ], axis=1)

df.head()

Unnamed: 0,milieu,sexe,age,categorie_age,niveau_education,annees_experience,etat_matrimonial,categorie_socioprofessionnelle,possession_voiture,possession_logement,possession_terrain,personnes_a_charge,secteur_activite,acces_services_financiers,revenu_annuel
0,Urbain,Homme,36,Adulte,Fondamental,17.0,Veuf,Groupe 2,1,1,1,2,Public,Basique,28477.735465
1,Urbain,Homme,45,Adulte,Fondamental,24.0,Marié,Groupe 5,0,0,0,0,Privé informel,Aucun,17503.035465
2,Urbain,Homme,35,Adulte,Secondaire,14.0,Marié,Groupe 2,1,0,1,1,Public,Basique,36705.135465
3,Urbain,Homme,18,Jeune,Secondaire,0.0,Célibataire,Groupe 2,0,0,0,1,Public,Avancé,24200.235465
4,Urbain,Homme,37,Adulte,Supérieur,11.0,Marié,Groupe 2,1,0,0,3,Privé formel,Aucun,39118.935465


# Prétraitement des données

In [3]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, z_thresh=3):
        self.z_thresh = z_thresh

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Only apply on numeric columns
        numeric_cols = X.select_dtypes(include=np.number).columns
        mask = (np.abs((X[numeric_cols] - X[numeric_cols].mean()) / X[numeric_cols].std()) < self.z_thresh).all(axis=1)
        return X[mask]

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()

        # Ensure numeric types
        for col in ['annees_experience', 'age', 'niveau_education']:
            X_transformed[col] = pd.to_numeric(X_transformed[col], errors='coerce')

        # Avoid division by zero or NaNs
        X_transformed['annees_experience'] = X_transformed['annees_experience'].replace(0, np.nan)
        X_transformed['age'] = X_transformed['age'].replace(0, np.nan)

        # Create ratios
        X_transformed['ratio_experience_age'] = X_transformed['annees_experience'] / X_transformed['age']
        X_transformed['ratio_education_experience'] = X_transformed['niveau_education'] / X_transformed['annees_experience']

        return X_transformed


        
# Separate features and target
X_train = df.drop(columns="revenu_annuel")
y_train = df["revenu_annuel"]

# Identify column types
numeric_features     = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()

# Preprocessing for numerical and categorical data
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply outlier remover manually before pipeline
outlier_remover = OutlierRemover()
mask = outlier_remover.transform(X_train).index
X_train_clean = X_train.loc[mask]
y_train_clean = y_train.loc[mask]

# Séparation des données

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_train_clean, y_train_clean, test_size=0.3, random_state=42)

# Créer le modèle

In [5]:
pipeline = make_pipeline(
    FeatureEngineering(),
    preprocessor,
    MLPRegressor(
        hidden_layer_sizes=(100, 100),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate='constant',
        learning_rate_init=0.001,
        max_iter=300,
        random_state=42
    )
)

# Fit the model
pipeline.fit(X_train_clean, y_train_clean)

# Predictions
y_pred = pipeline.predict(X_test)
print('\n############# MLP Regressor - Results:')
print(f"MAE:  {mean_absolute_error(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
print(f"R²:   {r2_score(y_test, y_pred):.4f}")


############# MLP Regressor - Results:
MAE:  2568.45
RMSE: 6033.28
R²:   0.8503


# Enregistrer le modèle avec Joblib

In [6]:
# Save the best model with Joblib
joblib.dump(pipeline, "best_model.joblib")
print("\nModèle enregistré avec succès.")


Modèle enregistré avec succès.


# Charger le modèle

In [7]:
# Load the saved model for future predictions
loaded_model = joblib.load("best_model.joblib")
print("Modèle chargé avec succès.")

# Test a sample prediction
sample_pred = loaded_model.predict(X_test[:1])
print("\nPrédiction pour un exemple :", sample_pred)

Modèle chargé avec succès.

Prédiction pour un exemple : [24125.54487571]
