# This Jupyter Notebook cleans and processes the raw data from the journal article *A deep learning dataset for metal multiaxial fatigue life prediction* [2024].

## First, we combine all of the linear and torsional strain data

Three raw data files are uploaded to a google drive. They are the "Specific information of the materials" file, the "data_all_strain-controlled" file, and a zipped file version of the "All data_Strain" folder. These files/folders were uploaded by the authors of the paper above, and can be accessed from the paper.

In [41]:
#First, we have to mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


First, we download the files uploaded to the following google drive folder, keeping the name. https://drive.google.com/drive/folders/1snslfpCeNJcaQOKeNlvnKzsCnp2b1B66?usp=drive_link

Now, we load and unzip the strain data

In [38]:
!find /content/drive/ -name "All_data_Strain.zip"

/content/drive/MyDrive/MANE4962_FinalProject_RawData/All_data_Strain.zip
/content/drive/MyDrive/MLE_S25_Project/All_data_Strain.zip


In [42]:
import zipfile
import os

zip_path = "/content/drive/MyDrive/MANE4962_FinalProject_RawData/All_data_Strain.zip"

extract_path = "/content/All_data_Strain"


with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [47]:
import pandas as pd
import glob
import os

#Repeated name because we need to access the files in the unzipped folder
folder_path = "/content/All_data_Strain/All_data_Strain"

csv_files = glob.glob(folder_path + "/*.csv")

data_list = []

for file in csv_files:
    #Read CSV raw data
    df = pd.read_csv(file, header=None, usecols=[0, 1])
    #Reshape from array to vector
    df = df.to_numpy().flatten('F')
    #Turn back to DataFrame
    df = pd.DataFrame([df])
    #Keep original filename for tracking
    df.insert(0, "filename", os.path.basename(file))

    data_list.append(df)

Now, we combine our dataframes into df_strain, that contains all of the time series strain data

In [105]:
#Combining all strain data together
df_strain = pd.concat(data_list, ignore_index=True)

#Renaming columns manually
df_strain.columns = ["filename"] + [f"linear_strain_t{i}" for i in range(1, 242)] + [f"torsional_strain_t{i-241}" for i in range(242, 483)]

print("dataframe shape =",df_strain.shape)

df_strain.head()

dataframe shape = (914, 483)


Unnamed: 0,filename,linear_strain_t1,linear_strain_t2,linear_strain_t3,linear_strain_t4,linear_strain_t5,linear_strain_t6,linear_strain_t7,linear_strain_t8,linear_strain_t9,...,torsional_strain_t232,torsional_strain_t233,torsional_strain_t234,torsional_strain_t235,torsional_strain_t236,torsional_strain_t237,torsional_strain_t238,torsional_strain_t239,torsional_strain_t240,torsional_strain_t241
0,HRB335-0.002.csv,0.0,3.3e-05,6.7e-05,0.0001,0.000133,0.000167,0.0002,0.000233,0.000267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,S460N-b-0.00104-0.0018.csv,0.0,2.7e-05,5.4e-05,8.2e-05,0.000109,0.000136,0.000163,0.00019,0.000216,...,0.00175,0.001761,0.00177,0.001778,0.001785,0.00179,0.001794,0.001798,0.001799,0.0018
2,E235-0.0021-0.0037.csv,0.0,5.5e-05,0.00011,0.000165,0.00022,0.000274,0.000329,0.000383,0.000437,...,-0.000864,-0.000769,-0.000674,-0.000579,-0.000483,-0.000387,-0.00029,-0.000194,-9.7e-05,0.0
3,E355-2-8-0.0037-0.0063.csv,0.0,9.7e-05,0.000194,0.00029,0.000387,0.000483,0.000579,0.000674,0.000769,...,-0.005097,-0.004682,-0.004216,-0.003703,-0.00315,-0.002562,-0.001947,-0.00131,-0.000659,0.0
4,Haynes188-760-A-0.00416.csv,0.0,6.9e-05,0.000139,0.000208,0.000277,0.000347,0.000416,0.000485,0.000555,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Second, we add the material properties to our strain data frame

In [106]:
csv_path = "/content/drive/MyDrive/MANE4962_FinalProject_RawData/data_all_strain-controlled.csv"
df_material_prop = pd.read_csv(csv_path)

print("dataframe shape =",df_material_prop.shape)

df_material_prop.head()

dataframe shape = (914, 6)


Unnamed: 0,load,Elastic Modulus [GPA],Ultimate Tensile Strength [MPA],Yield Strength [MPA],Poissons Ratio,Nf(label)
0,1Cr18Ni9T-1-0.002.csv,193.0,605.0,310.0,0.3,5.30103
1,1Cr18Ni9T-1-0.003.csv,193.0,605.0,310.0,0.3,4.093772
2,1Cr18Ni9T-1-0.004.csv,193.0,605.0,310.0,0.3,3.740363
3,1Cr18Ni9T-1-0.005.csv,193.0,605.0,310.0,0.3,3.491362
4,1Cr18Ni9T-1-0.01.csv,193.0,605.0,310.0,0.3,2.977724


Now, we will rearrange and match the indicies of df_strain and df_material_prop. This will allow the data to appended correctly

In [107]:
df_strain = df_strain.set_index(df_strain.iloc[:,0])
df_strain = df_strain.drop(df_strain.columns[0], axis=1)

df_material_prop = df_material_prop.set_index(df_material_prop.iloc[:,0])
df_material_prop = df_material_prop.drop(df_material_prop.columns[0], axis=1)

df_material_prop_new_index = df_material_prop.reindex(df_strain.index)

df_strain_material_target = pd.concat([df_strain, df_material_prop_new_index], axis=1)

print("dataframe shape =",df_strain_material_target.shape)

df_strain_material_target.head()

dataframe shape = (914, 487)


Unnamed: 0_level_0,linear_strain_t1,linear_strain_t2,linear_strain_t3,linear_strain_t4,linear_strain_t5,linear_strain_t6,linear_strain_t7,linear_strain_t8,linear_strain_t9,linear_strain_t10,...,torsional_strain_t237,torsional_strain_t238,torsional_strain_t239,torsional_strain_t240,torsional_strain_t241,Elastic Modulus [GPA],Ultimate Tensile Strength [MPA],Yield Strength [MPA],Poissons Ratio,Nf(label)
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HRB335-0.002.csv,0.0,3.3e-05,6.7e-05,0.0001,0.000133,0.000167,0.0002,0.000233,0.000267,0.0003,...,0.0,0.0,0.0,0.0,0.0,210.0,520.0,355.0,0.3,4.482044
S460N-b-0.00104-0.0018.csv,0.0,2.7e-05,5.4e-05,8.2e-05,0.000109,0.000136,0.000163,0.00019,0.000216,0.000243,...,0.00179,0.001794,0.001798,0.001799,0.0018,208.5,643.0,500.0,0.3,5.759366
E235-0.0021-0.0037.csv,0.0,5.5e-05,0.00011,0.000165,0.00022,0.000274,0.000329,0.000383,0.000437,0.00049,...,-0.000387,-0.00029,-0.000194,-9.7e-05,0.0,196.4,375.4,247.8,0.3,4.2545
E355-2-8-0.0037-0.0063.csv,0.0,9.7e-05,0.000194,0.00029,0.000387,0.000483,0.000579,0.000674,0.000769,0.000864,...,-0.002562,-0.001947,-0.00131,-0.000659,0.0,208.6,473.0,318.4,0.29,2.985875
Haynes188-760-A-0.00416.csv,0.0,6.9e-05,0.000139,0.000208,0.000277,0.000347,0.000416,0.000485,0.000555,0.000624,...,0.0,0.0,0.0,0.0,0.0,146.93456,490.0,268.0,0.5,4.545381


## Finally, the material type is added to our dataframe

First, we import the raw data excel sheet for the material breakdowns

In [94]:
xlsx_path = "/content/drive/MyDrive/MANE4962_FinalProject_RawData/Specific information of the materials.xlsx"
df_materials = pd.read_excel(xlsx_path)
df_materials.head()

Unnamed: 0,Category,Material,Chemical Constituents in Weight %,Additional processing information of specimens,Specimen type,Test machine,Loading frequency,Loading paths,Loading method,Samples,Ref
0,Stainless Steel,1Cr–18Ni–9T,C (0.065); Mn (1.34); Si (0.95); S (0.03); P (...,The base material is solution-treated.,Tubular specimens,Instron 8800,Not specified,"U1, U2, S1",Strain controlled,14,"Chen, X., An, K., & Kim, K. S. (2004). Low‐cyc..."
1,,S347,C < 0.08; Mn < 2; Si < 1; P < 0.045; S < 0.03;...,The specimens were rolled after drilling but b...,Tubular specimens,A servohydraulic loading device,2 Hz,"U1, U2, P1, S1, S3",Strain controlled,19,"Hoffmeyer, J., Döring, R., Seeger, T., & Vormw..."
2,,X5CrNi18-10,Not specified,"Austenitic stainless steel, soft annealed.",Tubular specimens,Instron 8874,0.1 - 2 Hz,"U1,U2,P1,S1,SF12,SF13,SF15,SF16,SF19",Strain controlled,63,"Pejkowski, Ł., & Skibicki, D. (2019). Stress-s..."
3,,AISI 316L,Not specified,Not specified.,Tubular specimens,MTS Model 809,0.1 Hz,"U1,U2,P1,S1,S3",Strain controlled,25,"Feng, E. S., Wang, X. G., & Jiang, C. (2019). ..."
4,,304 stainless steel,Cr (19.2); Ni (10.8); Mn (1.6); Si (0.40); P<0...,Specimen surfaces were polished to a 0.3 micro...,Tubular specimens,A two post axial servo-hydraulic test frame wi...,Not specified,"U1,U2,P1,S1,B1,B2",Strain controlled,8,"Jones, D., & Kurath, P. (1988). Cyclic Fatigue..."


We need to change the NaN values to the first NaN value that comes before it

In [99]:
for i in range(len(df_materials)): #Iterates through 41 different allows
  if pd.isna(df_materials.loc[i, 'Category']): #Following line runs only if NaN value is detected
    df_materials.loc[i, 'Category'] = df_materials.loc[i-1, 'Category'] #Pulls value from previous row

df_materials.head()

Unnamed: 0,Category,Material,Chemical Constituents in Weight %,Additional processing information of specimens,Specimen type,Test machine,Loading frequency,Loading paths,Loading method,Samples,Ref
0,Stainless Steel,1Cr–18Ni–9T,C (0.065); Mn (1.34); Si (0.95); S (0.03); P (...,The base material is solution-treated.,Tubular specimens,Instron 8800,Not specified,"U1, U2, S1",Strain controlled,14,"Chen, X., An, K., & Kim, K. S. (2004). Low‐cyc..."
1,Stainless Steel,S347,C < 0.08; Mn < 2; Si < 1; P < 0.045; S < 0.03;...,The specimens were rolled after drilling but b...,Tubular specimens,A servohydraulic loading device,2 Hz,"U1, U2, P1, S1, S3",Strain controlled,19,"Hoffmeyer, J., Döring, R., Seeger, T., & Vormw..."
2,Stainless Steel,X5CrNi18-10,Not specified,"Austenitic stainless steel, soft annealed.",Tubular specimens,Instron 8874,0.1 - 2 Hz,"U1,U2,P1,S1,SF12,SF13,SF15,SF16,SF19",Strain controlled,63,"Pejkowski, Ł., & Skibicki, D. (2019). Stress-s..."
3,Stainless Steel,AISI 316L,Not specified,Not specified.,Tubular specimens,MTS Model 809,0.1 Hz,"U1,U2,P1,S1,S3",Strain controlled,25,"Feng, E. S., Wang, X. G., & Jiang, C. (2019). ..."
4,Stainless Steel,304 stainless steel,Cr (19.2); Ni (10.8); Mn (1.6); Si (0.40); P<0...,Specimen surfaces were polished to a 0.3 micro...,Tubular specimens,A two post axial servo-hydraulic test frame wi...,Not specified,"U1,U2,P1,S1,B1,B2",Strain controlled,8,"Jones, D., & Kurath, P. (1988). Cyclic Fatigue..."


Now, we rearrange and match the indicies like before, to add this material data onto our strain_material_target dataframe

In [108]:
#First, we make a dictionary from our material dataframe
material_category_map = df_materials.set_index('Material')['Category'].to_dict()

def append_material(index):
    #Make sure index is a string
    index_str = str(index)

    for material in material_category_map:
        #Make sure variable we loop through is also a string
        #Check if material name is present in index
        if str(material) in index_str:
            return material_category_map[material]
    return None

#Use the function on our dataframe to add the new material column
df_strain_material_target['Material'] = df_strain_material_target.index.map(append_material)

df_strain_material_target.head()

Unnamed: 0_level_0,linear_strain_t1,linear_strain_t2,linear_strain_t3,linear_strain_t4,linear_strain_t5,linear_strain_t6,linear_strain_t7,linear_strain_t8,linear_strain_t9,linear_strain_t10,...,torsional_strain_t238,torsional_strain_t239,torsional_strain_t240,torsional_strain_t241,Elastic Modulus [GPA],Ultimate Tensile Strength [MPA],Yield Strength [MPA],Poissons Ratio,Nf(label),Material
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HRB335-0.002.csv,0.0,3.3e-05,6.7e-05,0.0001,0.000133,0.000167,0.0002,0.000233,0.000267,0.0003,...,0.0,0.0,0.0,0.0,210.0,520.0,355.0,0.3,4.482044,Carbon and Alloy Steel
S460N-b-0.00104-0.0018.csv,0.0,2.7e-05,5.4e-05,8.2e-05,0.000109,0.000136,0.000163,0.00019,0.000216,0.000243,...,0.001794,0.001798,0.001799,0.0018,208.5,643.0,500.0,0.3,5.759366,Carbon and Alloy Steel
E235-0.0021-0.0037.csv,0.0,5.5e-05,0.00011,0.000165,0.00022,0.000274,0.000329,0.000383,0.000437,0.00049,...,-0.00029,-0.000194,-9.7e-05,0.0,196.4,375.4,247.8,0.3,4.2545,Carbon and Alloy Steel
E355-2-8-0.0037-0.0063.csv,0.0,9.7e-05,0.000194,0.00029,0.000387,0.000483,0.000579,0.000674,0.000769,0.000864,...,-0.001947,-0.00131,-0.000659,0.0,208.6,473.0,318.4,0.29,2.985875,Carbon and Alloy Steel
Haynes188-760-A-0.00416.csv,0.0,6.9e-05,0.000139,0.000208,0.000277,0.000347,0.000416,0.000485,0.000555,0.000624,...,0.0,0.0,0.0,0.0,146.93456,490.0,268.0,0.5,4.545381,


We now need to swap the last two columns, so our target data [Nf], is the last column in our dataframe

In [109]:
column_names = list(df_strain_material_target.columns)
last_two_columns = column_names[-2:]
other_columns = column_names[:-2]
swapped_columns = other_columns + [last_two_columns[1], last_two_columns[0]]

df_final = df_strain_material_target[swapped_columns]

df_final.head()

Unnamed: 0_level_0,linear_strain_t1,linear_strain_t2,linear_strain_t3,linear_strain_t4,linear_strain_t5,linear_strain_t6,linear_strain_t7,linear_strain_t8,linear_strain_t9,linear_strain_t10,...,torsional_strain_t238,torsional_strain_t239,torsional_strain_t240,torsional_strain_t241,Elastic Modulus [GPA],Ultimate Tensile Strength [MPA],Yield Strength [MPA],Poissons Ratio,Material,Nf(label)
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HRB335-0.002.csv,0.0,3.3e-05,6.7e-05,0.0001,0.000133,0.000167,0.0002,0.000233,0.000267,0.0003,...,0.0,0.0,0.0,0.0,210.0,520.0,355.0,0.3,Carbon and Alloy Steel,4.482044
S460N-b-0.00104-0.0018.csv,0.0,2.7e-05,5.4e-05,8.2e-05,0.000109,0.000136,0.000163,0.00019,0.000216,0.000243,...,0.001794,0.001798,0.001799,0.0018,208.5,643.0,500.0,0.3,Carbon and Alloy Steel,5.759366
E235-0.0021-0.0037.csv,0.0,5.5e-05,0.00011,0.000165,0.00022,0.000274,0.000329,0.000383,0.000437,0.00049,...,-0.00029,-0.000194,-9.7e-05,0.0,196.4,375.4,247.8,0.3,Carbon and Alloy Steel,4.2545
E355-2-8-0.0037-0.0063.csv,0.0,9.7e-05,0.000194,0.00029,0.000387,0.000483,0.000579,0.000674,0.000769,0.000864,...,-0.001947,-0.00131,-0.000659,0.0,208.6,473.0,318.4,0.29,Carbon and Alloy Steel,2.985875
Haynes188-760-A-0.00416.csv,0.0,6.9e-05,0.000139,0.000208,0.000277,0.000347,0.000416,0.000485,0.000555,0.000624,...,0.0,0.0,0.0,0.0,146.93456,490.0,268.0,0.5,,4.545381


Now, we upload this data frame into the same google drive folder for use in training future ML models

In [111]:
df_final.to_csv('/content/drive/MyDrive/MANE4962_FinalProject_RawData/final_data.csv', index=True)