In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importation des données

Après lecture des informations sur les colonnes, on choisit des noms de colonnes pour les données.

In [2]:
from pathlib import Path

# Location of the raw, whitespace-separated weld database file.
data_path = Path(r"../data/welddb.data")

# Column names chosen by hand: the raw file has no header row.
column_names = [
    'C', 'Si', 'Mn', 'S', 'P', 'Ni', 'Cr', 'Mo', 'V', 'Cu', 'Co', 'W',
    'O', 'Ti', 'N', 'Al', 'B', 'Nb', 'Sn', 'As', 'Sb',
    'Current', 'Voltage', 'AC_DC', 'Electrode_polarity', 'Heat_input', 'Interpass_temp',
    'Weld_type', 'PWHT_temp', 'PWHT_time',
    'Yield_strength', 'UTS', 'Elongation', 'Reduction_area',
    'Charpy_temp', 'Charpy_impact', 'Hardness', 'FATT_50',
    'Primary_ferrite', 'Ferrite_2nd_phase', 'Acicular_ferrite', 'Martensite', 'Ferrite_carbide',
    'Weld_ID'
]

# Raw string for the regex separator: '\s' in a plain string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
df = pd.read_csv(data_path, sep=r'\s+', names=column_names)
df.head(5)

  df = pd.read_csv(data_path, sep='\s+', names=column_names)


Unnamed: 0,C,Si,Mn,S,P,Ni,Cr,Mo,V,Cu,...,Charpy_temp,Charpy_impact,Hardness,FATT_50,Primary_ferrite,Ferrite_2nd_phase,Acicular_ferrite,Martensite,Ferrite_carbide,Weld_ID
0,0.037,0.3,0.65,0.008,0.012,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.3,0.65,0.008,0.012,0,N,N,N,N,...,-28,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.3,0.65,0.008,0.012,0,N,N,N,N,...,-38,100,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Aht
3,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,N,N,N,N,N,N,N,N,N,Evans-Ni/CMn-1990/1991-0Baw
4,0.037,0.31,1.03,0.007,0.014,0,N,N,N,N,...,-48,100,N,N,32,28,40,0,0,Evans-Ni/CMn-1990/1991-0Bawch


On remarque que toutes les valeurs vides sont notées "N" ; on les remplace donc par NaN.

In [3]:
# The raw file marks missing entries with the literal string 'N'; turn them into real NaN.
df = df.replace(to_replace='N', value=np.nan)
df.head(n=5)

Unnamed: 0,C,Si,Mn,S,P,Ni,Cr,Mo,V,Cu,...,Charpy_temp,Charpy_impact,Hardness,FATT_50,Primary_ferrite,Ferrite_2nd_phase,Acicular_ferrite,Martensite,Ferrite_carbide,Weld_ID
0,0.037,0.3,0.65,0.008,0.012,0,,,,,...,,,,,,,,,,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.3,0.65,0.008,0.012,0,,,,,...,-28.0,100.0,,,,,,,,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.3,0.65,0.008,0.012,0,,,,,...,-38.0,100.0,,,,,,,,Evans-Ni/CMn-1990/1991-0Aht
3,0.037,0.31,1.03,0.007,0.014,0,,,,,...,,,,,,,,,,Evans-Ni/CMn-1990/1991-0Baw
4,0.037,0.31,1.03,0.007,0.014,0,,,,,...,-48.0,100.0,,,32.0,28.0,40.0,0.0,0.0,Evans-Ni/CMn-1990/1991-0Bawch


# Data Cleaning


In [4]:
# Print each column's non-null count and dtype to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   C                   1652 non-null   float64
 1   Si                  1652 non-null   float64
 2   Mn                  1652 non-null   float64
 3   S                   1648 non-null   object 
 4   P                   1642 non-null   object 
 5   Ni                  697 non-null    object 
 6   Cr                  784 non-null    object 
 7   Mo                  793 non-null    object 
 8   V                   928 non-null    object 
 9   Cu                  578 non-null    object 
 10  Co                  129 non-null    object 
 11  W                   75 non-null     object 
 12  O                   1256 non-null   object 
 13  Ti                  935 non-null    object 
 14  N                   1242 non-null   object 
 15  Al                  905 non-null    object 
 16  B     

On remarque que beaucoup de colonnes ont une majorité de valeurs manquantes. Nous allons nous débarrasser de ces colonnes car elles sont trop peu exploitables, sauf pour les données chimiques : on peut imaginer que si elles sont à NaN, c'est que l'élément n'a pas été mesuré car il est normalement absent ; on remplacera donc ces NaN par 0. De plus, on remarque que les données contiennent parfois des valeurs de la forme "< seuil" ou "54totndres" : les premières seront remplacées par la moitié du seuil et les secondes par 0.

In [5]:
chemical_cols = ['C','Si','Mn','S','P','Ni','Cr','Mo','V','Cu','Co',
                 'W','O','Ti','N','Al','B','Nb','Sn','As','Sb']

# Largest share of missing values tolerated in a non-chemical column.
MAX_MISSING_RATIO = 0.6

# Share of NaN per column, computed once on the source frame.
missing_ratio = df.isna().mean()

# Non-chemical columns with more than 60 % missing values are dropped.
# Chemical columns are always kept, however sparse they are.
sparse_cols = [
    col for col in df.columns
    if col not in chemical_cols and missing_ratio[col] > MAX_MISSING_RATIO
]

# Single drop producing a new frame: `df` is left untouched and the cell
# gives the same result when re-run.
df_reduced = df.drop(columns=sparse_cols)

df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   C                   1652 non-null   float64
 1   Si                  1652 non-null   float64
 2   Mn                  1652 non-null   float64
 3   S                   1648 non-null   object 
 4   P                   1642 non-null   object 
 5   Ni                  697 non-null    object 
 6   Cr                  784 non-null    object 
 7   Mo                  793 non-null    object 
 8   V                   928 non-null    object 
 9   Cu                  578 non-null    object 
 10  Co                  129 non-null    object 
 11  W                   75 non-null     object 
 12  O                   1256 non-null   object 
 13  Ti                  935 non-null    object 
 14  N                   1242 non-null   object 
 15  Al                  905 non-null    object 
 16  B     

On remarque que la plupart des colonnes sont de type "object" ; on va donc les convertir en float afin de pouvoir faire nos calculs sur des valeurs numériques.

In [6]:
def clean_chemical_value(val):
    """Convert a raw chemical-composition cell to a finite float.

    Rules, applied in order:
      * missing value (NaN / None)                        -> 0.0
      * "<x" (below a threshold)                          -> x / 2, or 0.0 if x is not numeric
      * text containing "tot", "res", "trace", "none", "nd" -> 0.0
      * anything else                                     -> float(val), or 0.0 if not parseable

    Non-finite results (e.g. the strings "nan" or "inf") are mapped to 0.0 so
    the function never returns NaN or infinity.

    Args:
        val: Raw cell value (number, string, or missing).

    Returns:
        float: The cleaned numeric value.
    """
    if pd.isna(val):
        return 0.0

    text = str(val).strip()

    below_threshold = text.startswith("<")
    if below_threshold:
        # Keep only the threshold itself; it is halved after parsing.
        text = text[1:]
    elif any(marker in text.lower() for marker in ("tot", "res", "trace", "none", "nd")):
        return 0.0

    try:
        number = float(text)
    except ValueError:
        # Only parsing failures are expected here; anything else should surface.
        return 0.0

    if not np.isfinite(number):
        return 0.0

    return number / 2 if below_threshold else number

In [7]:
categorical_cols = ['AC_DC', 'Electrode_polarity', 'Weld_type', 'Weld_ID']

for col in df_reduced.columns:
    if col in categorical_cols:
        # Categorical columns are left as-is for now.
        continue
    if col in chemical_cols:
        # Chemical composition: a missing entry is treated as "element absent" -> 0.0,
        # and "<x" / textual entries are handled by clean_chemical_value.
        df_reduced[col] = df_reduced[col].apply(clean_chemical_value)
    else:
        # Process parameters and mechanical properties: convert to float but keep
        # missing or unparseable entries as NaN. Filling them with 0.0 would invent
        # measurements (e.g. a yield strength of 0) and bias any later analysis.
        df_reduced[col] = pd.to_numeric(df_reduced[col], errors='coerce')
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   C                   1652 non-null   float64
 1   Si                  1652 non-null   float64
 2   Mn                  1652 non-null   float64
 3   S                   1652 non-null   float64
 4   P                   1652 non-null   float64
 5   Ni                  1652 non-null   float64
 6   Cr                  1652 non-null   float64
 7   Mo                  1652 non-null   float64
 8   V                   1652 non-null   float64
 9   Cu                  1652 non-null   float64
 10  Co                  1652 non-null   float64
 11  W                   1652 non-null   float64
 12  O                   1652 non-null   float64
 13  Ti                  1652 non-null   float64
 14  N                   1652 non-null   float64
 15  Al                  1652 non-null   float64
 16  B     

On sauvegarde dans un csv

In [9]:
# Write the cleaned frame to disk as CSV, leaving the row index out of the file.
processed_data_path = "../data/weld_data_processed.csv"

df_reduced.to_csv(path_or_buf=processed_data_path, index=False)

PermissionError: [Errno 13] Permission denied: '../data/weld_data_processed.csv'

Pour les données catégorielles, on verra ensuite ce qu'on veut en faire (suppression, one-hot encoding, ...).