IMPORT DES BIBLIOTHÈQUES

In [50]:
# Analyse et manipulation
import pandas as pd
import numpy as np

# Visualisation interactive
import plotly.express as px
import plotly.graph_objects as go

# Pour affichage inline
from IPython.display import display

CHARGEMENT DES DONNÉES

In [51]:
# Location of the raw dataset, relative to the notebook (adjust to your own layout)
data_path = "../data/raw/ai4i2020.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(data_path)

# Preview the first rows with the rich notebook renderer
display(df.head(n=11))

# Column dtypes and non-null counts (printed), then summary statistics
# (left as the last expression so the notebook renders it as the cell output)
df.info()
df.describe()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
5,6,M14865,M,298.1,308.6,1425,41.9,11,0,0,0,0,0,0
6,7,L47186,L,298.1,308.6,1558,42.4,14,0,0,0,0,0,0
7,8,L47187,L,298.1,308.6,1527,40.2,16,0,0,0,0,0,0
8,9,M14868,M,298.3,308.7,1667,28.6,18,0,0,0,0,0,0
9,10,M14869,M,298.5,309.0,1741,28.0,21,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


ANALYSE DES VALEURS MANQUANTES ET TYPES

In [52]:
# Per-column count of missing values, restricted to columns that actually
# contain some, sorted from most to fewest missing entries
missing = (
    df.isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)
display(missing)

# Dtype of every column (last expression, so it is rendered as the cell output)
df.dtypes

Series([], dtype: int64)

UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Machine failure              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64
dtype: object

In [53]:
# Show every column name as a plain Python list
print(list(df.columns))

['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']


VISUALISATION DES VARIABLES CAPTEURS

In [54]:
# Sensor columns explored in this section
sensor_col = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]"
]

# Plotting-only copy in which the binary target is a string label.
# Plotly Express maps a numeric colour column to a continuous colour bar, which
# is misleading for a 0/1 flag; string values give two discrete colours and a
# legend instead. `df` itself is not modified, so numeric analyses such as the
# correlation matrix below keep working on the integer column.
df_plot = df.assign(**{"Machine failure": df["Machine failure"].astype(str)})
# Fixed class order so both coloured figures assign the same colour to each class
failure_order = {"Machine failure": ["0", "1"]}

# Interactive histogram for each sensor
for col in sensor_col:
    fig = px.histogram(df, x=col, nbins=50, title=f'Distribution de {col}')
    fig.show()

# Class balance of the target variable 'Machine failure'
fig = px.pie(df, names='Machine failure', title='Répartition des défaillances')
fig.show()

# Correlation between the sensor columns and the target (interactive heatmap)
corr_matrix = df[sensor_col + ['Machine failure']].corr()
fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale='Viridis', title='Matrice de corrélation capteurs / Machine failure')
fig.show()

# Scatter plot comparing two sensors, coloured by failure class
fig = px.scatter(df_plot, x='Rotational speed [rpm]', y='Torque [Nm]',
                 color='Machine failure', category_orders=failure_order,
                 title='Rotational speed vs Torque')
fig.show()

# Scatter matrix showing every pairwise relation between sensors
fig = px.scatter_matrix(df_plot,
                        dimensions=sensor_col,
                        color='Machine failure',
                        category_orders=failure_order,
                        title='Relations entre capteurs')
# Hide the diagonal panels (each sensor plotted against itself)
fig.update_traces(diagonal_visible=False)
fig.show()