In [None]:
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## I.Import des données ##

In [None]:
# Load the weld database: a header-less text file whose fields are separated by
# runs of whitespace and whose missing values are written as "N".
# NOTE(review): this absolute path only exists on the original author's machine;
# point it at your local copy of welddb.data before running.
file_path = "c:\\Users\\DF6610\\Documents\\ProjectAppauto\\welddb\\welddb.data"

# sep=r"\s+" makes pandas treat any run of whitespace as one delimiter, so the
# file no longer has to be rewritten line by line in memory before parsing.
data = pd.read_csv(file_path, sep=r"\s+", na_values="N", header=None)

In [None]:
# Report the dataset dimensions (rows first, then columns)
n_rows, n_cols = data.shape
print(f"Le nombre de lignes est: {n_rows}")
print(f"Le nombre de colonnes est: {n_cols}")

In [None]:
# Column names of the weld database, in file order: 12 element concentrations
# in weight %, then 9 in parts per million by weight, then the 23 remaining
# columns, ending with the weld identifier.
_weight_pct_elements = (
    "Carbon",
    "Silicon",
    "Manganese",
    "Sulphur",
    "Phosphorus",
    "Nickel",
    "Chromium",
    "Molybdenum",
    "Vanadium",
    "Copper",
    "Cobalt",
    "Tungsten",
)
_ppm_elements = (
    "Oxygen",
    "Titanium",
    "Nitrogen",
    "Aluminium",
    "Boron",
    "Niobium",
    "Tin",
    "Arsenic",
    "Antimony",
)
columns = (
    [f"{element} concentration / (weight%)" for element in _weight_pct_elements]
    + [f"{element} concentration / parts per million by weight" for element in _ppm_elements]
    + [
        "Current / A",
        "Voltage / V",
        "AC or DC",
        "Electrode positive or negative",
        "Heat input / kJ/mm",
        "Interpass temperature / deg C",
        "Type of weld",
        "Post weld heat treatment temperature / deg C",
        "Post weld heat treatment time / hours",
        "Yield strength / MPa",
        "Ultimate tensile strength / MPa",
        "Elongation / %",
        "Reduction of Area / %",
        "Charpy temperature / deg C",
        "Charpy impact toughness / J",
        "Hardness / kg/mm2",
        "50 % FATT",
        "Primary ferrite in microstructure / %",
        "Ferrite with second phase / %",
        "Acicular ferrite / %",
        "Martensite / %",
        "Ferrite with carbide aggregate / %",
        "Weld ID",
    ]
)

In [None]:
# Attach the descriptive names to the header-less frame (pandas raises if the
# number of names differs from the number of columns), then preview the result.
data.columns = columns
data.head()

## II.Groupement ID ##

In [None]:
# Toy frame with a single identifier column, used to demonstrate prefix grouping
data_exple = dict(ID=["abcd123", "abc124", "abcd125", "def456", "def457", "ghi789", "ghi7abc"])

df_exple = pd.DataFrame(data=data_exple)


def find_longest_common_prefix(ids):
    """Return the longest string that every identifier in ``ids`` starts with.

    An empty ``ids`` yields ``""``, as does a collection whose members share no
    leading characters.
    """
    if not ids:
        return ""

    shared = []
    # zip(*ids) walks all identifiers in lock-step, one character position at a
    # time, and stops at the end of the shortest identifier.
    for chars_at_position in zip(*ids):
        if len(set(chars_at_position)) > 1:
            break
        shared.append(chars_at_position[0])
    return "".join(shared)


def get_common_prefix(current_id, ids, key_length=3):
    """Return the longest prefix shared by the identifiers that resemble ``current_id``.

    Identifiers belong to the same group when they start with the first
    ``key_length`` characters of ``current_id``; the longest common prefix of
    that group is returned.

    Args:
        current_id: Identifier whose group prefix is wanted.
        ids: All identifiers to search (strings).
        key_length: Number of leading characters that decide group membership.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The group's longest common prefix, or "" when no identifier matches.
    """
    # Computed once instead of once per element of `ids`.
    group_key = current_id[:key_length]
    # Loop variable is `other`, not `id`, so the built-in id() is not shadowed.
    return find_longest_common_prefix([other for other in ids if other.startswith(group_key)])


# Tag every example ID with the common prefix of its group. The ID list is built
# once here rather than inside the lambda, where it was rebuilt for every row.
example_ids = df_exple["ID"].tolist()
df_exple["Common_Prefix"] = df_exple["ID"].apply(
    lambda example_id: get_common_prefix(example_id, example_ids)
)

df_exple

In [None]:
# Derive a grouping key for each weld from its ID. The ID list is materialised
# once, instead of calling .tolist() again for every row inside the lambda.
weld_ids = data["Weld ID"].tolist()
data["Common_Prefix"] = data["Weld ID"].apply(
    lambda weld_id: get_common_prefix(weld_id, weld_ids)
)
data["Common_Prefix"].value_counts()

In [None]:
# df_pivot= pd.pivot_table(df,values=['colonne1',...],index=,aggfunc={'colonne1':"mean",...})

## Remarque sur les variables cibles : 
- Hardness / kg/mm² : Une mesure de la dureté du matériau soudé, qui est souvent liée à la qualité de la soudure.
- Yield strength / MPa ou Ultimate tensile strength / MPa : Ces deux mesures de résistance pourraient également servir de variable cible, car elles reflètent la capacité du matériau à résister à la déformation ou à la rupture.
- Elongation / % et/ou Reduction of Area / % : Ces mesures sont souvent utilisées pour évaluer la ductilité et la ténacité d'un matériau, ce qui peut être pertinent pour la qualité des soudures.

## Petit nettoyage ##

In [None]:
def clean_numeric_values(value):
    """Convert a raw cell to ``float`` when possible, otherwise return it untouched.

    Strings carrying a "<" marker (e.g. "<0.002") are converted after the marker
    is removed. Anything that still cannot be parsed as a number is returned
    unchanged. That includes malformed "<..." strings, which previously escaped
    the ``try`` block and raised ``ValueError``.

    Args:
        value: Any cell value (str, number, None, NaN, ...).

    Returns:
        ``float(value)`` when the value is numeric, else ``value`` itself.
    """
    candidate = value
    if isinstance(value, str) and "<" in value:
        # Drop the "<" marker and surrounding blanks before parsing.
        candidate = value.replace("<", "").strip()
    try:
        return float(candidate)
    except (ValueError, TypeError):
        # Not numeric (free text, None, ...): hand back the original value.
        return value


# Clean only the first 21 columns. The function is mapped column by column with
# Series.map because DataFrame.applymap is deprecated in recent pandas releases.
data.iloc[:, :21] = data.iloc[:, :21].apply(lambda column: column.map(clean_numeric_values))

# Preview the first rows of the cleaned DataFrame
data.head()

## III.Missing values ##

### III.Missing values : vision générale ###

In [None]:
data.isnull().sum()

In [None]:
# Work on an independent copy so that the imputation steps applied to `df` do
# not also modify `data` (a plain `df = data` only adds a second name for the
# same frame).
df = data.copy()

#### Remarque: Beaucoup de valeurs manquantes ####

### III.Missing values :  Sulphur concentration / (weight%) ###


In [None]:
data[data["Sulphur concentration / (weight%)"].isnull()]

In [None]:
# Histogram (with KDE overlay) of the sulphur concentration
fig, ax = plt.subplots(figsize=(30, 6))
sns.histplot(data["Sulphur concentration / (weight%)"], bins=10, kde=True, ax=ax)
ax.set_title("Distribution de la concentration de soufre")
ax.set_xlabel("Concentration de soufre / (weight%)")
ax.set_ylabel("Fréquence")
plt.show()

In [None]:
# Impute missing sulphur concentrations with the column mean. The result is
# assigned back to the column: fillna(..., inplace=True) called on df[...] acts
# on an intermediate Series, which pandas warns about and which no longer
# updates df once Copy-on-Write is enabled.
sulphur_col = "Sulphur concentration / (weight%)"
df[sulphur_col] = df[sulphur_col].fillna(df[sulphur_col].mean())

### III.Missing values :  Phosphorus concentration / (weight%) ###

In [None]:
df["Phosphorus concentration / (weight%)"].describe()

In [None]:
df[df["Phosphorus concentration / (weight%)"].isnull()]

In [None]:
# Mean phosphorus concentration over the "Gar&K-1975-" welds (mean() skips NaN)
m = df.loc[df["Common_Prefix"] == "Gar&K-1975-", "Phosphorus concentration / (weight%)"].mean()

In [None]:
# Within the "Gar&K-1975-" welds, replace missing phosphorus values by the group mean `m`
gark_rows = df["Common_Prefix"] == "Gar&K-1975-"
phosphorus_col = "Phosphorus concentration / (weight%)"
df.loc[gark_rows, phosphorus_col] = df.loc[gark_rows, phosphorus_col].fillna(m)

In [None]:
# Fill the phosphorus values that are still missing with the overall column mean.
# Assigned back explicitly: an inplace fillna on df[...] only touches an
# intermediate Series and stops updating df under pandas Copy-on-Write.
phosphorus_col = "Phosphorus concentration / (weight%)"
df[phosphorus_col] = df[phosphorus_col].fillna(df[phosphorus_col].mean())

### III.Missing values :  other concentrations / (weight%) ###

In [None]:
# Impute each of these composition columns with its own mean. One loop replaces
# nine copy-pasted statements, and the filled Series is assigned back to df
# because an inplace fillna on df[...] only acts on an intermediate Series
# (deprecated chained assignment, a no-op under pandas Copy-on-Write).
for concentration_col in [
    "Nickel concentration / (weight%)",
    "Chromium concentration / (weight%)",
    "Molybdenum concentration / (weight%)",
    "Vanadium concentration / (weight%)",
    "Copper concentration / (weight%)",
    "Cobalt concentration / (weight%)",
    "Tungsten concentration / (weight%)",
    "Oxygen concentration / parts per million by weight",
    "Titanium concentration / parts per million by weight",
]:
    df[concentration_col] = df[concentration_col].fillna(df[concentration_col].mean())

In [None]:
# Nitrogen is coerced to numeric before any mean is taken: the notebook converts
# this column with errors="coerce" in a later cell, so it presumably still holds
# non-numeric text at this point, in which case .mean() would fail here.
nitrogen_col = "Nitrogen concentration / parts per million by weight"
df[nitrogen_col] = pd.to_numeric(df[nitrogen_col], errors="coerce")

# Impute each ppm-level element column with its own mean. The filled Series is
# assigned back to df; an inplace fillna on df[...] only acts on an intermediate
# Series (deprecated chained assignment, a no-op under pandas Copy-on-Write).
for ppm_col in [
    nitrogen_col,
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
]:
    df[ppm_col] = df[ppm_col].fillna(df[ppm_col].mean())

In [None]:
# Coerce nitrogen to a numeric dtype; entries that cannot be parsed become NaN
nitrogen_col = "Nitrogen concentration / parts per million by weight"
df[nitrogen_col] = pd.to_numeric(df[nitrogen_col], errors="coerce")

# Series.mean() skips NaN by default, so only valid entries contribute
mean_nitrogen_concentration = df[nitrogen_col].mean()
print(mean_nitrogen_concentration)

In [None]:
# Impute each ppm-level element column with its own mean. The filled Series is
# assigned back to df, because an inplace fillna on df[...] only acts on an
# intermediate Series (deprecated chained assignment, a no-op under pandas
# Copy-on-Write). A single loop replaces seven copy-pasted statements.
for ppm_col in [
    "Nitrogen concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
]:
    df[ppm_col] = df[ppm_col].fillna(df[ppm_col].mean())

### III.Missing values :  current & Voltage ###

In [None]:
df[df["Current / A"].isnull()]["Common_Prefix"].value_counts()

In [None]:
# Mean welding current and voltage over the "Evans" welds (mean() skips NaN)
evans_rows = df["Common_Prefix"] == "Evans"
evans_c = df.loc[evans_rows, "Current / A"].mean()
evans_v = df.loc[evans_rows, "Voltage / V"].mean()

In [None]:
# Within the "Evans" welds, replace missing current / voltage by the group means
evans_rows = df["Common_Prefix"] == "Evans"
for electrical_col, evans_mean in (
    ("Current / A", evans_c),
    ("Voltage / V", evans_v),
):
    df.loc[evans_rows, electrical_col] = df.loc[evans_rows, electrical_col].fillna(evans_mean)

In [None]:
# Whatever is still missing gets the overall column mean
for electrical_col in ("Current / A", "Voltage / V"):
    df[electrical_col] = df[electrical_col].fillna(df[electrical_col].mean())

### III.Missing values :  AC or DC ###

In [None]:
df[df["AC or DC"].isnull()]["Common_Prefix"].value_counts()

In [None]:
df[df["Common_Prefix"] == "Evans"]["AC or DC"].value_counts()

In [None]:
# Missing "AC or DC" entries of the "Evans" welds are set to "DC"
evans_rows = df["Common_Prefix"] == "Evans"
df.loc[evans_rows, "AC or DC"] = df.loc[evans_rows, "AC or DC"].fillna("DC")

In [None]:
# Remaining gaps receive the placeholder category "o"; the dummy column it
# produces ("AC or DC_o") is dropped again after one-hot encoding.
df["AC or DC"] = df["AC or DC"].fillna("o")

### III.Missing values :  Electrode positive ou négative ###

In [None]:
df[df["Electrode positive or negative"].isnull()]["Common_Prefix"].value_counts()

In [None]:
df[df["Common_Prefix"] == "Evans"]["Electrode positive or negative"].value_counts()

In [None]:
# Treat every missing electrode polarity as "+".
# NOTE(review): presumably "+" is the dominant polarity in the counts inspected
# just before this cell; confirm.
df["Electrode positive or negative"] = df["Electrode positive or negative"].fillna("+")

### III.Missing values :  Post weld heat treatment temperature / deg C et Post weld heat treatment time / hours ###

In [None]:
df[df["Post weld heat treatment time / hours"].isnull()]["Common_Prefix"].value_counts()

In [None]:
df[df["Common_Prefix"] == "Gar&K-1975-"]["Post weld heat treatment temperature / deg C"].mean()

In [None]:
# Mean post-weld heat-treatment temperature and time over the "Gar&K-1975-" welds
gark_rows = df["Common_Prefix"] == "Gar&K-1975-"
gark_t = df.loc[gark_rows, "Post weld heat treatment temperature / deg C"].mean()
gark_time = df.loc[gark_rows, "Post weld heat treatment time / hours"].mean()

In [None]:
# Fill every missing post-weld heat-treatment value with the "Gar&K-1975-" means.
# NOTE(review): unlike the other imputations, this is not restricted to the
# "Gar&K-1975-" rows; confirm that filling all welds with this group's mean
# is intended.
for pwht_col, pwht_fill in (
    ("Post weld heat treatment temperature / deg C", gark_t),
    ("Post weld heat treatment time / hours", gark_time),
):
    df[pwht_col] = df[pwht_col].fillna(pwht_fill)

### III.Missing values : Charpy   ###

In [None]:
df[df["Charpy impact toughness / J"].isnull()]["Common_Prefix"].value_counts()

In [None]:
# Mean Charpy toughness and test temperature over the "Evans" welds
evans_rows = df["Common_Prefix"] == "Evans"
evans_charpyJ = df.loc[evans_rows, "Charpy impact toughness / J"].mean()
evanc_charpyC = df.loc[evans_rows, "Charpy temperature / deg C"].mean()

In [None]:
evans_charpyJ

In [None]:
# Impute missing Charpy values of the "Evans" welds with that group's own means
evans_rows = df["Common_Prefix"] == "Evans"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", evans_charpyJ),
    ("Charpy temperature / deg C", evanc_charpyC),
):
    df.loc[evans_rows, charpy_col] = df.loc[evans_rows, charpy_col].fillna(group_mean)

In [None]:
# Mean Charpy toughness and test temperature over the "PantK-1990-" welds
pant_rows = df["Common_Prefix"] == "PantK-1990-"
pant_charpyJ = df.loc[pant_rows, "Charpy impact toughness / J"].mean()
pant_charpyC = df.loc[pant_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "PantK-1990-" welds with that group's own means
pant_rows = df["Common_Prefix"] == "PantK-1990-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", pant_charpyJ),
    ("Charpy temperature / deg C", pant_charpyC),
):
    df.loc[pant_rows, charpy_col] = df.loc[pant_rows, charpy_col].fillna(group_mean)

In [None]:
# Mean Charpy toughness and test temperature over the "SvenGret-1990-" welds
sven_rows = df["Common_Prefix"] == "SvenGret-1990-"
sven_charpyJ = df.loc[sven_rows, "Charpy impact toughness / J"].mean()
sven_charpyC = df.loc[sven_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "SvenGret-1990-" welds with that group's own means
sven_rows = df["Common_Prefix"] == "SvenGret-1990-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", sven_charpyJ),
    ("Charpy temperature / deg C", sven_charpyC),
):
    df.loc[sven_rows, charpy_col] = df.loc[sven_rows, charpy_col].fillna(group_mean)

In [None]:
# Mean Charpy toughness and test temperature over the "Pat-1981-" welds
pat_rows = df["Common_Prefix"] == "Pat-1981-"
pat_charpyJ = df.loc[pat_rows, "Charpy impact toughness / J"].mean()
pat_charpyC = df.loc[pat_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Pat-1981-" welds with that group's own means
pat_rows = df["Common_Prefix"] == "Pat-1981-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", pat_charpyJ),
    ("Charpy temperature / deg C", pat_charpyC),
):
    df.loc[pat_rows, charpy_col] = df.loc[pat_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "EvHtIp1979-"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "EvHtIp1979-" welds
evht_rows = df["Common_Prefix"] == "EvHtIp1979-"
evht_charpyJ = df.loc[evht_rows, "Charpy impact toughness / J"].mean()
evht_charpyC = df.loc[evht_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "EvHtIp1979-" welds with that group's own means
evht_rows = df["Common_Prefix"] == "EvHtIp1979-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", evht_charpyJ),
    ("Charpy temperature / deg C", evht_charpyC),
):
    df.loc[evht_rows, charpy_col] = df.loc[evht_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "Icici&-1992-"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "Icici&-1992-" welds
ici_rows = df["Common_Prefix"] == "Icici&-1992-"
ici_charpyJ = df.loc[ici_rows, "Charpy impact toughness / J"].mean()
ici_charpyC = df.loc[ici_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Icici&-1992-" welds with that group's own means
ici_rows = df["Common_Prefix"] == "Icici&-1992-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", ici_charpyJ),
    ("Charpy temperature / deg C", ici_charpyC),
):
    df.loc[ici_rows, charpy_col] = df.loc[ici_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "Blond&-1984-"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "Blond&-1984-" welds
bld_rows = df["Common_Prefix"] == "Blond&-1984-"
bld_charpyJ = df.loc[bld_rows, "Charpy impact toughness / J"].mean()
bld_charpyC = df.loc[bld_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Blond&-1984-" welds with that group's own means
bld_rows = df["Common_Prefix"] == "Blond&-1984-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", bld_charpyJ),
    ("Charpy temperature / deg C", bld_charpyC),
):
    df.loc[bld_rows, charpy_col] = df.loc[bld_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "Mart-"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "Mart-" welds
mrt_rows = df["Common_Prefix"] == "Mart-"
mrt_charpyJ = df.loc[mrt_rows, "Charpy impact toughness / J"].mean()
mrt_charpyC = df.loc[mrt_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Mart-" welds with that group's own means
mrt_rows = df["Common_Prefix"] == "Mart-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", mrt_charpyJ),
    ("Charpy temperature / deg C", mrt_charpyC),
):
    df.loc[mrt_rows, charpy_col] = df.loc[mrt_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "Ditt-"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "Ditt-" welds
ditt_rows = df["Common_Prefix"] == "Ditt-"
ditt_charpyJ = df.loc[ditt_rows, "Charpy impact toughness / J"].mean()
ditt_charpyC = df.loc[ditt_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Ditt-" welds with that group's own means
ditt_rows = df["Common_Prefix"] == "Ditt-"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", ditt_charpyJ),
    ("Charpy temperature / deg C", ditt_charpyC),
):
    df.loc[ditt_rows, charpy_col] = df.loc[ditt_rows, charpy_col].fillna(group_mean)

In [None]:
df[df["Common_Prefix"] == "Chandel&-1985W"]["Charpy impact toughness / J"].mean()

In [None]:
# Mean Charpy toughness and test temperature over the "Chandel&-1985W" welds
chdl_rows = df["Common_Prefix"] == "Chandel&-1985W"
chdl_charpyJ = df.loc[chdl_rows, "Charpy impact toughness / J"].mean()
chdl_charpyC = df.loc[chdl_rows, "Charpy temperature / deg C"].mean()

In [None]:
# Impute missing Charpy values of the "Chandel&-1985W" welds with that group's own means
chdl_rows = df["Common_Prefix"] == "Chandel&-1985W"
for charpy_col, group_mean in (
    ("Charpy impact toughness / J", chdl_charpyJ),
    ("Charpy temperature / deg C", chdl_charpyC),
):
    df.loc[chdl_rows, charpy_col] = df.loc[chdl_rows, charpy_col].fillna(group_mean)

In [None]:
# Any Charpy value still missing after the per-group passes gets the overall column mean
for charpy_col in ("Charpy impact toughness / J", "Charpy temperature / deg C"):
    df[charpy_col] = df[charpy_col].fillna(df[charpy_col].mean())

### III. Missing Values: features to delete ###

In [None]:
# 50 % FATT                                                1621
# Primary ferrite in microstructure / %                    1554
# Ferrite with second phase / %                            1562
# Acicular ferrite / %                                     1562
# Martensite / %                                           1563
# Ferrite with carbide aggregate / %                       1563

In [None]:
# Remove the FATT and microstructure-fraction columns singled out for deletion
columns_to_delete = [
    "50 % FATT",
    "Primary ferrite in microstructure / %",
    "Ferrite with second phase / %",
    "Acicular ferrite / %",
    "Martensite / %",
    "Ferrite with carbide aggregate / %",
]
df = df.drop(columns=columns_to_delete)

In [None]:
df

### III.Missing values: Résumé ###

In [None]:
df.isnull().sum()

# Les variables cibles : 

- **Yield strength** / MPa
- **Ultimate tensile strength** / MPa
- **Elongation** / %
- **Reduction of Area** / %
- **Hardness** / kg/mm²


## IV. Variables Catégoriques ##

### IV. Variables Catégoriques : Introduction

In [None]:
df.columns

In [None]:
#  'AC or DC'
# 'Electrode positive or negative'
# 'Type of weld'
# Sont des variables catégoriques

### IV. Variables Catégoriques : AC or DC

In [None]:
df["AC or DC"].value_counts()

In [None]:
# One-hot encode "AC or DC": the column is replaced by one "AC or DC_<value>"
# indicator column per distinct value.
df_encoded_1 = pd.get_dummies(df, columns=["AC or DC"])

In [None]:
df_encoded_1.columns

In [None]:
# Discard the indicator of the "o" placeholder used earlier to fill missing
# "AC or DC" values. Mutates df_encoded_1 in place, so re-running this cell
# alone raises KeyError.
df_encoded_1.drop(columns=["AC or DC_o"], inplace=True)

### IV. Variables Catégoriques : 'Electrode positive or negative'

In [None]:
df["Electrode positive or negative"].value_counts()

In [None]:
# One-hot encode the electrode polarity on top of the previous encoding step
df_encoded_2 = pd.get_dummies(df_encoded_1, columns=["Electrode positive or negative"])

In [None]:
df_encoded_2.columns

In [None]:
# Drop the "-" polarity indicator (presumably kept out as the reference level;
# TODO confirm). Mutates df_encoded_2 in place, so re-running this cell alone
# raises KeyError.
df_encoded_2.drop(columns=["Electrode positive or negative_-"], inplace=True)

### IV. Variables Catégoriques : 'Type of Weld'

In [None]:
df["Type of weld"].value_counts()

In [None]:
# One-hot encode the weld type; df_encoded is the fully encoded frame used from here on
df_encoded = pd.get_dummies(df_encoded_2, columns=["Type of weld"])

In [None]:
df_encoded.columns

In [None]:
# Drop the "GMAA" weld-type indicator (presumably kept out as the reference
# level; TODO confirm). Mutates df_encoded in place, so re-running this cell
# alone raises KeyError.
df_encoded.drop(columns=["Type of weld_GMAA"], inplace=True)

## V. Analyse des composantes principales: ACP

### V. ACP : drop les variables cibles

In [None]:
df_encoded.columns

In [None]:
# Feature matrix: the 29 numeric inputs followed by the one-hot indicator
# columns. Targets and identifier columns are deliberately not selected.
feature_columns = [
    "Carbon concentration / (weight%)",
    "Silicon concentration / (weight%)",
    "Manganese concentration / (weight%)",
    "Sulphur concentration / (weight%)",
    "Phosphorus concentration / (weight%)",
    "Nickel concentration / (weight%)",
    "Chromium concentration / (weight%)",
    "Molybdenum concentration / (weight%)",
    "Vanadium concentration / (weight%)",
    "Copper concentration / (weight%)",
    "Cobalt concentration / (weight%)",
    "Tungsten concentration / (weight%)",
    "Oxygen concentration / parts per million by weight",
    "Titanium concentration / parts per million by weight",
    "Nitrogen concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
    "Current / A",
    "Voltage / V",
    "Heat input / kJ/mm",
    "Interpass temperature / deg C",
    "Post weld heat treatment temperature / deg C",
    "Post weld heat treatment time / hours",
    "Charpy temperature / deg C",
    "Charpy impact toughness / J",
    "AC or DC_AC",
    "AC or DC_DC",
    "Electrode positive or negative_+",
    "Electrode positive or negative_0",
    "Type of weld_FCA",
    "Type of weld_GTAA",
    "Type of weld_MMA",
    "Type of weld_NGGMA",
    "Type of weld_NGSAW",
    "Type of weld_SA",
    "Type of weld_SAA",
    "Type of weld_ShMA",
    "Type of weld_TSA",
]
X = df_encoded[feature_columns]

In [None]:
# Target matrix: the five mechanical properties listed as target variables
target_columns = [
    "Yield strength / MPa",
    "Ultimate tensile strength / MPa",
    "Elongation / %",
    "Reduction of Area / %",
    "Hardness / kg/mm2",
]
y = df_encoded[target_columns]

### V. ACP : standardisation

In [None]:
def replace_range_with_avg(value):
    """Turn a dash-separated numeric string into the mean of its parts.

    For example "150-200" becomes 175.0. Values that are not strings, strings
    without a "-", and strings whose dash-separated pieces are not all numeric
    (e.g. "-20", "abc-def") are returned unchanged.
    """
    # Guard clause: only strings containing a dash are candidates.
    if not (isinstance(value, str) and "-" in value):
        return value
    try:
        parts = [float(piece) for piece in value.split("-")]
    except ValueError:
        # Not in the expected "number-number" format: keep the original value.
        return value
    return sum(parts) / len(parts)


# Apply the conversion to every cell of X. Series.map is used column by column
# because DataFrame.applymap is deprecated in recent pandas releases.
X = X.apply(lambda column: column.map(replace_range_with_avg))

In [None]:
# Standardise the continuous features before PCA; the one-hot indicator columns
# are left out of the scaling.
# NOTE: despite its name, X_scaled_28 holds 29 columns.
scaler = StandardScaler()
continuous_features = [
    "Carbon concentration / (weight%)",
    "Silicon concentration / (weight%)",
    "Manganese concentration / (weight%)",
    "Sulphur concentration / (weight%)",
    "Phosphorus concentration / (weight%)",
    "Nickel concentration / (weight%)",
    "Chromium concentration / (weight%)",
    "Molybdenum concentration / (weight%)",
    "Vanadium concentration / (weight%)",
    "Copper concentration / (weight%)",
    "Cobalt concentration / (weight%)",
    "Tungsten concentration / (weight%)",
    "Oxygen concentration / parts per million by weight",
    "Titanium concentration / parts per million by weight",
    "Nitrogen concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
    "Current / A",
    "Voltage / V",
    "Heat input / kJ/mm",
    "Interpass temperature / deg C",
    "Post weld heat treatment temperature / deg C",
    "Post weld heat treatment time / hours",
    "Charpy temperature / deg C",
    "Charpy impact toughness / J",
]
X_scaled_28 = scaler.fit_transform(X[continuous_features])

In [None]:
# Write the standardised values back over the same 29 continuous columns of X
scaled_columns = [
    "Carbon concentration / (weight%)",
    "Silicon concentration / (weight%)",
    "Manganese concentration / (weight%)",
    "Sulphur concentration / (weight%)",
    "Phosphorus concentration / (weight%)",
    "Nickel concentration / (weight%)",
    "Chromium concentration / (weight%)",
    "Molybdenum concentration / (weight%)",
    "Vanadium concentration / (weight%)",
    "Copper concentration / (weight%)",
    "Cobalt concentration / (weight%)",
    "Tungsten concentration / (weight%)",
    "Oxygen concentration / parts per million by weight",
    "Titanium concentration / parts per million by weight",
    "Nitrogen concentration / parts per million by weight",
    "Aluminium concentration / parts per million by weight",
    "Boron concentration / parts per million by weight",
    "Niobium concentration / parts per million by weight",
    "Tin concentration / parts per million by weight",
    "Arsenic concentration / parts per million by weight",
    "Antimony concentration / parts per million by weight",
    "Current / A",
    "Voltage / V",
    "Heat input / kJ/mm",
    "Interpass temperature / deg C",
    "Post weld heat treatment temperature / deg C",
    "Post weld heat treatment time / hours",
    "Charpy temperature / deg C",
    "Charpy impact toughness / J",
]
X[scaled_columns] = X_scaled_28

### V. ACP : Choix du nombre de composants

In [None]:
# Fit a PCA with no limit on the number of components
pca = PCA()
pca.fit(X)

# Share of variance carried by each component, and its running total
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_cumulative = np.cumsum(explained_variance_ratio)

# Cumulative explained variance against the number of components kept
fig, ax = plt.subplots(figsize=(8, 5))
component_counts = range(1, len(explained_variance_cumulative) + 1)
ax.plot(component_counts, explained_variance_cumulative, marker="", linestyle="-")
ax.set_title("Variance expliquée cumulée en fonction du nombre de composantes principales")
ax.set_xlabel("Nombre de composantes principales")
ax.set_ylabel("Variance expliquée cumulée")
ax.grid(True)
plt.show()

In [None]:
# 25 composantes suffisent à expliquer plus de 90 % de la variance

### V. ACP

In [None]:
# Project the features onto 25 principal components. The previous comment
# claimed scikit-learn picked the number of components, which contradicted the
# explicit n_components=25.
pca = PCA(n_components=25)
pca_result = pca.fit_transform(X)

# Wrap the scores in a DataFrame. Column labels PC1..PCn are derived from the
# result's width so they cannot drift from n_components, and X's row index is
# carried over so the scores stay aligned with the rows of X.
pca_X = pd.DataFrame(
    pca_result,
    index=X.index,
    columns=[f"PC{component}" for component in range(1, pca_result.shape[1] + 1)],
)

# Display the result
pca_X

In [None]:
# Total share of variance retained by the fitted PCA (sum of the per-component
# ratios). The label used to say "par chaque composante" although a single
# total is printed.
print(
    f"Variance expliquée cumulée par les {pca.n_components_} composantes retenues :",
    pca.explained_variance_ratio_.sum(),
)