# Test CODRA

In [19]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline


Get data

In [None]:

file_ps2 = "../data/PS2.txt"
file_fs1 = "../data/FS1.txt"
file_profile = "../data/profile.txt"

Load data and Feature ingeering

In [13]:

# Chemins des fichiers
ps2_path = "../data/PS2.txt"
fs1_path = "../data/FS1.txt"
profile_path = "../data/profile.txt"

# Vérifier existence
for path in [ps2_path, fs1_path, profile_path]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Fichier introuvable : {path}")

# Charger les fichiers capteurs
ps2 = pd.read_csv(ps2_path, sep="\t", header=None)
fs1 = pd.read_csv(fs1_path, sep="\t", header=None)

# Charger le fichier profile
profile = pd.read_csv(profile_path, sep="\t", header=None)
profile.columns = ["cooler_condition", "valve_condition", "pump_leakage", "accumulator_pressure", "stable_flag"]

# Créer des features agrégées pour PS2 et FS1
ps2_features = ps2.apply([np.mean, np.std, np.min, np.max], axis=1)
fs1_features = fs1.apply([np.mean, np.std, np.min, np.max], axis=1)

# Renommer les colonnes
ps2_features.columns = ["ps2_mean", "ps2_std", "ps2_min", "ps2_max"]
fs1_features.columns = ["fs1_mean", "fs1_std", "fs1_min", "fs1_max"]

# Fusionner tout
features = pd.concat([ps2_features, fs1_features, profile], axis=1)

# Ajouter la cible (valve optimale ou non)
features["target"] = (features["valve_condition"] != 100).astype(int)

print(features.head())


     ps2_mean    ps2_std  ps2_min  ps2_max  fs1_mean   fs1_std  fs1_min  \
0  109.466914  47.110581      0.0   156.99  6.709815  3.012914      0.0   
1  109.354890  47.041690      0.0   157.56  6.715315  3.003742      0.0   
2  109.158845  46.988144      0.0   156.97  6.718522  3.011735      0.0   
3  109.064807  46.968307      0.0   156.44  6.720565  3.017658      0.0   
4  108.931434  46.871040      0.0   158.13  6.690308  3.009040      0.0   

   fs1_max  cooler_condition  valve_condition  pump_leakage  \
0   18.710                 3              100             0   
1   18.712                 3              100             0   
2   18.698                 3              100             0   
3   18.896                 3              100             0   
4   18.876                 3              100             0   

   accumulator_pressure  stable_flag  target  
0                   130            1       0  
1                   130            1       0  
2                   130      

Statistiques descriptives

In [21]:
features.describe()

Unnamed: 0,ps2_mean,ps2_std,ps2_min,ps2_max,fs1_mean,fs1_std,fs1_min,fs1_max,cooler_condition,valve_condition,pump_leakage,accumulator_pressure,stable_flag,target
count,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0
mean,109.379906,47.732184,0.0,166.520517,6.198549,3.037542,4.535147e-07,20.130214,41.240816,90.693878,0.669388,107.199546,0.342857,0.489796
std,4.986585,3.271676,0.0,0.977997,1.032883,0.188849,2.129589e-05,0.451934,42.383143,10.681802,0.817233,16.435848,0.474772,0.500009
min,104.406307,45.203186,0.0,155.04,2.018572,2.663675,0.0,18.698,3.0,73.0,0.0,90.0,0.0,0.0
25%,106.962382,46.205814,0.0,166.18,6.39167,2.9545,0.0,19.881,3.0,80.0,0.0,90.0,0.0,0.0
50%,107.730169,46.802086,0.0,166.65,6.576673,2.982333,0.0,20.363,20.0,100.0,0.0,100.0,0.0,0.0
75%,109.421612,47.203633,0.0,167.1,6.657508,3.01794,0.0,20.479,100.0,100.0,1.0,130.0,1.0,1.0
max,131.589089,59.548285,0.0,167.77,6.722707,3.739104,0.001,20.479,100.0,100.0,2.0,130.0,1.0,1.0


Vérification

In [11]:
features.shape

(2205, 14)

In [14]:
features.target.value_counts()

target
0    1125
1    1080
Name: count, dtype: int64

Données train et test

In [27]:
# Prendre les 2000 premiers cycles
features = features.head(2000)
# X = tes features (ex: profile_data)
X = features.drop(columns=[ "valve_condition","target"])
# y = ta cible
y = features["target"]


Définir le pipeline

In [26]:
# Exemple : X = vos features, y = votre cible
# X, y = ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Définition des pipelines
pipelines = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(max_iter=1000))
    ]),
    "Random Forest": Pipeline([
        ('scaler', StandardScaler()),  # Pas nécessaire mais pour homogénéité
        ('model', RandomForestClassifier(n_estimators=100))
    ]),
    "XGBoost": Pipeline([
        ('scaler', StandardScaler()),
        ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ]),
    "LightGBM": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LGBMClassifier())
    ])
}

# Entraînement et évaluation
results = []
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]  # Pour ROC-AUC
    
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_proba)
    })

# Tableau comparatif
df_results = pd.DataFrame(results)
print(df_results)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 758, number of negative: 842
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1433
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473750 -> initscore=-0.105097
[LightGBM] [Info] Start training from score -0.105097
                 Model  Accuracy        F1   ROC-AUC
0  Logistic Regression    0.9050  0.906863  0.986617
1        Random Forest    0.9900  0.989474  0.999762
2              XGBoost    0.9925  0.992126  0.999799
3             LightGBM    0.9900  0.989529  0.999724




In [None]:

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# Supposons que X_train et y_train soient vos données
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Récupérer l'importance des variables
importances = model.feature_importances_
feature_names = X_train.columns

# Créer un DataFrame pour trier
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(importance_df)

# ✅ Affichage graphique
plt.figure(figsize=(8, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.gca().invert_yaxis()  # Pour avoir la plus importante en haut
plt.title("Importance des variables (Random Forest)")
plt.xlabel("Importance")
plt
