In [18]:
import   pandas as pd
import   numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go

CHARGEMENT DES DONNÉES PRÉPARÉES

In [19]:
# Locations of the pre-processed splits and the name of the label column.
# Declared once so the paths and the target are not repeated as string literals.
TRAIN_CSV = '../data/processed/processed_train.csv'
TEST_CSV = '../data/processed/processed_test.csv'
TARGET = 'Machine failure'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Fail early, with a readable message, if either split lacks the label column
# or if the two splits do not share the same columns in the same order.
assert TARGET in train_df.columns, f"'{TARGET}' column missing from {TRAIN_CSV}"
assert TARGET in test_df.columns, f"'{TARGET}' column missing from {TEST_CSV}"
assert list(train_df.columns) == list(test_df.columns), (
    "train and test splits must expose the same columns in the same order"
)

# Features are every column except the label; the label is kept as a Series.
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]

X_test = test_df.drop(columns=[TARGET])
y_test = test_df[TARGET]

In [20]:
# Relative frequency of each label in the training target (fractions, not counts).
class_shares = y_train.value_counts(normalize=True)
print(class_shares)

Machine failure
0    0.966125
1    0.033875
Name: proportion, dtype: float64


RANDOM FOREST

In [21]:
# Baseline forest: 200 unrestricted-depth trees, a fixed seed, and
# class_weight='balanced' passed to the estimator.
# `fit` returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=None,
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions on the held-out split.
y_pred = rf.predict(X_test)

# Second column of predict_proba, i.e. the probability of the second entry in
# rf.classes_ (label 1 for a 0/1 target).
y_proba = rf.predict_proba(X_test)[:, 1]

ÉVALUATION

In [22]:
# Per-class precision / recall / F1 on the test split, rendered as plain text.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1932
           1       0.94      0.65      0.77        68

    accuracy                           0.99      2000
   macro avg       0.96      0.82      0.88      2000
weighted avg       0.99      0.99      0.99      2000



MATRICE DE CONFUSION INTERACTIVE

In [23]:
# Pin the class order explicitly so the matrix rows/columns and the tick labels
# below cannot drift apart. In scikit-learn's confusion matrix, rows are the
# true labels and columns are the predicted labels.
class_labels = list(rf.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# String tick labels make both axes categorical (one tick per class) instead of
# a continuous numeric axis, and the axis titles say which side is which.
tick_text = [str(label) for label in class_labels]

fig = px.imshow(
    cm,
    x=tick_text,
    y=tick_text,
    labels=dict(x='Predicted class', y='Actual class', color='Count'),
    text_auto=True,
    color_continuous_scale='Blues',
    title='Confusion Matrix - Random Forest',
)
fig.show()

ROC CURVE

In [24]:
# ROC operating points computed from the predicted scores; the third return
# value (the thresholds) is not needed here.
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Build the figure directly from the ROC trace, then overlay the dashed
# diagonal from (0, 0) to (1, 1) as a reference line.
roc_trace = go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve')
fig = go.Figure(data=[roc_trace])
fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line={'dash': 'dash'})

fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
)
fig.show()

# Area under the ROC curve, computed from the same scores.
print("AUC:", roc_auc_score(y_test, y_proba))

AUC: 0.9760801059554257


FEATURE IMPORTANCE

In [25]:
# One row per input column, paired with the fitted forest's
# feature_importances_ and ranked from most to least important.
importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

fig = px.bar(
    importance,
    x="Importance",
    y="Feature",
    orientation='h',
    title='Feature Importance - Random Forest'
)
# Plotly lays out the categories of a horizontal bar chart from the bottom up,
# so a descending sort would put the top-ranked feature at the bottom of the
# figure. Reversing the y axis makes the ranking read top-down.
fig.update_yaxes(autorange='reversed')
fig.show()

HYPERPARAMETER TUNING (NIVEAU AVANCÉ)

In [26]:
# Candidate hyperparameters: 2 forest sizes x 3 depth limits = 6 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
)

# Template estimator with the same class weighting and seed as the baseline
# forest; GridSearchCV fits it once per combination and fold.
base_forest = RandomForestClassifier(class_weight='balanced', random_state=42)

# 3-fold cross-validated search that ranks candidates by recall,
# using all available cores.
grid = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

Best parameters: {'max_depth': 10, 'n_estimators': 100}
