In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path of the HDF5 file that the data-preparation cells load with pd.read_hdf (key 'df')
filename='data/data.h5'

# Molar Ratio

In [3]:
# Read the HDF5 file into a DataFrame
readDF = pd.read_hdf(filename, 'df')

# Mapping from molar ratio values to discrete class labels
class_mapping = {
    0.9: 0,
    0.95: 1,
    0.975: 2,
    1.0: 3,
    1.025: 4,
    1.05: 5,
    1.1: 6,
}
# Create the 'class' column from the molar ratio.
# Series.map is used rather than Series.replace: replace leaves a ratio that is
# missing from class_mapping untouched, so the raw ratio would silently pass for
# a class label. With map such rows become NaN and are rejected further down.
# The cast keeps the labels as floats (0.0, 1.0, ...), as they were before.
readDF['class'] = readDF['ratio'].map(class_mapping).astype(float)

# Define samples that will be used as test data
test_sub_list = ['DDB', 'DEA', 'DEB', 'DDD', 'DEC', 'DED', 'DDF', 'DEE', 'DEF', 'DBR', 'DCA',
                 'DCI', 'DDH', 'DEG', 'DEH', 'DDJ', 'DEI', 'DEJ', 'DDL', 'DEK', 'DEL']

# Define sub-lists for different training folds during 5-fold cross-validation
train_fold0_sub_list = ['DAB', 'DAG', 'DCK', 'DCM', 'DBI', 'DBN', 'DAE', 'DAJ', 'DAA', 'DAF', 'DAM', 'DBC']
train_fold1_sub_list = ['DAL', 'DBB', 'DCO', 'DCC', 'DCE', 'DBS', 'DAO', 'DBE', 'DAK', 'DAP', 'DBH', 'DBM']
train_fold2_sub_list = ['DBG', 'DBL', 'DDE', 'DCG', 'DCJ', 'DDI', 'DBJ', 'DBO', 'DBA', 'DBF', 'DDM', 'DDN']
train_fold3_sub_list = ['DBQ', 'DCB', 'DCD', 'DCL', 'DCN', 'DAD', 'DAI', 'DBT', 'DBK', 'DBP', 'DDO', 'DDP']
train_fold4_sub_list = ['DDC', 'DCF', 'DCH', 'DCP', 'DDG', 'DAN', 'DBD', 'DDK', 'DDA', 'DAC', 'DAH', 'DDQ']

# Lookup expID -> fold number. The folds are visited in ascending order, so an
# expID listed in more than one fold ends up in the highest-numbered one, which
# is what assigning the folds one after another produced as well.
train_fold_sub_lists = [train_fold0_sub_list, train_fold1_sub_list, train_fold2_sub_list,
                        train_fold3_sub_list, train_fold4_sub_list]
fold_by_expID = {exp_id: fold_number
                 for fold_number, sub_list in enumerate(train_fold_sub_lists)
                 for exp_id in sub_list}

# Keep only the rows whose sample belongs to a training fold and reset the index.
# Rows are selected with a boolean mask instead of DataFrame.drop(labels): drop
# works on index labels and would also discard rows to keep if readDF's index
# contained duplicates.
is_train_row = readDF['expID'].isin(list(fold_by_expID))
indexDropTrain = readDF.index[~is_train_row]  # labels of the discarded rows; not used below
trainDF = readDF[is_train_row].reset_index(drop=True)

# Assign the fold number to every training row in one vectorized lookup
# (float dtype, matching the column produced by the earlier row-wise assignment)
trainDF['fold'] = trainDF['expID'].map(fold_by_expID).astype(float)

# Keep only the rows whose sample is in the test list and reset the index
is_test_row = readDF['expID'].isin(test_sub_list)
indexDropTest = readDF.index[~is_test_row]  # labels of the discarded rows; not used below
testDF = readDF[is_test_row].reset_index(drop=True)

# Fail loudly if a selected sample has a ratio that class_mapping does not cover
for split_name, split_df in (('training', trainDF), ('test', testDF)):
    unlabeled_ids = split_df.loc[split_df['class'].isna(), 'expID'].unique()
    if len(unlabeled_ids):
        raise ValueError(
            f"{split_name} samples with a 'ratio' value missing from class_mapping: "
            f"{sorted(unlabeled_ids)}"
        )

# Construct feature matrices for training and testing: each signal column holds
# one array per row; the arrays are stacked into a 2-D block per column and the
# blocks are placed side by side in the order listed here
signal_columns = ["ND", "LP725", "LP780", "SP775"]
X_train = pd.DataFrame(data=np.hstack([np.vstack(trainDF[name].to_numpy()) for name in signal_columns]))
X_test = pd.DataFrame(data=np.hstack([np.vstack(testDF[name].to_numpy()) for name in signal_columns]))

# Create target arrays for training and testing datasets
y_train = np.array(trainDF['class'])
y_test = np.array(testDF['class'])

In [8]:
# Preview the first rows of the training split, including the 'class' and 'fold' columns
trainDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY,class,fold
0,20230725,DAA,11,"[18478.851940457203, 18388.056884635833, 18310...","[154.9223817118554, 154.91414141414143, 154.83...","[113.53322700691122, 113.65948963317385, 113.6...","[131.4250398724083, 131.66214779372675, 131.58...",0.9,0.67,720.0,...,,,,,,,,,0.0,0.0
1,20230725,DAA,12,"[18423.745614035088, 18338.976076555024, 18282...","[155.85778841042, 155.63902179691652, 155.6794...","[113.70547581073897, 113.71185539606593, 113.7...","[131.64699627857522, 131.52897395002657, 131.5...",0.9,0.67,720.0,...,,,,,,,,,0.0,0.0
2,20230725,DAA,13,"[17444.824295587452, 17362.170653907495, 17277...","[154.5956937799043, 154.5388091440723, 154.516...","[113.46251993620415, 113.7333864965444, 113.65...","[130.8654970760234, 131.00026581605528, 130.96...",0.9,0.67,720.0,...,,,,,,,,,0.0,0.0
3,20230725,DAA,14,"[18320.8793195109, 18179.110579479002, 18144.1...","[155.35194045720363, 155.33227006911218, 155.1...","[113.60446570972887, 113.58612440191388, 113.6...","[130.961456671983, 131.10287081339712, 131.140...",0.9,0.67,720.0,...,,,,,,,,,0.0,0.0
4,20230725,DAA,21,"[20531.059542796385, 20465.95427963849, 20388....","[158.68819776714514, 158.5882509303562, 158.46...","[113.90271132376395, 113.7796384901648, 113.79...","[131.8421052631579, 131.64832535885168, 131.81...",0.9,0.67,720.0,...,,,,,,,,,0.0,0.0


In [5]:
# Preview the first rows of the test split (it has a 'class' column but no 'fold' column)
testDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,FF_backward,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY,class
0,20230801,DBR,11,"[19123.92955874535, 19022.538809144073, 18941....","[150.6345029239766, 150.4585326953748, 150.085...","[114.14035087719299, 114.04359383306752, 114.0...","[133.0358851674641, 132.94364699627857, 132.71...",1.0,0.67,720.0,...,,,,,,,,,,3.0
1,20230801,DBR,12,"[17605.041895784238, 17518.820109976434, 17413...","[151.07200837915684, 150.98140874574497, 150.6...","[114.01702016234617, 114.16653574234093, 114.0...","[132.7148468185389, 132.74417386750457, 132.54...",1.0,0.67,720.0,...,,,,,,,,,,3.0
2,20230801,DBR,13,"[18126.012227538544, 18043.00345560872, 17947....","[149.52711323763955, 149.5980861244019, 149.14...","[113.96810207336523, 114.11802232854865, 114.0...","[132.60552897395002, 132.7538543328017, 132.55...",1.0,0.67,720.0,...,,,,,,,,,,3.0
3,20230801,DBR,14,"[17089.280963603036, 17050.76433621367, 16999....","[149.5386226760932, 149.5833987954962, 149.560...","[114.00523697302958, 114.019114951558, 113.893...","[131.87666928515318, 131.90756742602775, 131.8...",1.0,0.67,720.0,...,,,,,,,,,,3.0
4,20230801,DBR,21,"[19371.1154752553, 19283.620843152657, 19188.8...","[154.36763550667715, 154.68447237496727, 154.5...","[114.67426027755957, 114.6305315527625, 114.62...","[136.4278606965174, 136.40560356114165, 136.16...",1.0,0.67,720.0,...,,,,,,,,,,3.0


# Molarity

In [9]:
# Read the HDF5 file into a DataFrame
readDF = pd.read_hdf(filename, 'df')

# Mapping from molarity values to discrete class labels
class_mapping = {
    0.56: 0,
    0.61: 1,
    0.67: 2,
    0.75: 3,
    0.84: 4,
}
# Create the 'class' column from the molarity.
# Series.map is used rather than Series.replace: replace leaves a molarity that
# is missing from class_mapping untouched, so a raw value such as 1.0 would be
# indistinguishable from class label 1. With map such rows become NaN and are
# rejected further down.
# The cast keeps the labels as floats (0.0, 1.0, ...), as they were before.
readDF['class'] = readDF['molarity'].map(class_mapping).astype(float)

# Define samples that will be used as test data
test_sub_list = ['DFN', 'DGO', 'DHT', 'DIA', 'DIB', 'DIC', 'DID', 'DIE',
                 'DIF', 'DIG', 'DIH', 'DII', 'DIJ', 'DIK', 'DIL']

# Define sub-lists for different training folds during 5-fold cross-validation
train_fold0_sub_list = ['DDM', 'DFA', 'DFB', 'DFD', 'DFE', 'DFG', 'DFH', 'DFJ', 'DFK']
train_fold1_sub_list = ['DEM', 'DEN', 'DFC', 'DFF', 'DGD', 'DFI', 'DGG', 'DFL', 'DGJ']
train_fold2_sub_list = ['DEO', 'DEP', 'DGB', 'DGC', 'DGE', 'DGH', 'DGI', 'DGK', 'DGL']
train_fold3_sub_list = ['DEQ', 'DER', 'DHA', 'DHB', 'DHD', 'DHE', 'DHG', 'DHH', 'DHJ']
train_fold4_sub_list = ['DDN', 'DGA', 'DFM', 'DHC', 'DHF', 'DHI', 'DHL', 'DGF', 'DHK']

# Lookup expID -> fold number. The folds are visited in ascending order, so an
# expID listed in more than one fold ends up in the highest-numbered one, which
# is what assigning the folds one after another produced as well.
train_fold_sub_lists = [train_fold0_sub_list, train_fold1_sub_list, train_fold2_sub_list,
                        train_fold3_sub_list, train_fold4_sub_list]
fold_by_expID = {exp_id: fold_number
                 for fold_number, sub_list in enumerate(train_fold_sub_lists)
                 for exp_id in sub_list}

# Keep only the rows whose sample belongs to a training fold and reset the index.
# Rows are selected with a boolean mask instead of DataFrame.drop(labels): drop
# works on index labels and would also discard rows to keep if readDF's index
# contained duplicates.
is_train_row = readDF['expID'].isin(list(fold_by_expID))
indexDropTrain = readDF.index[~is_train_row]  # labels of the discarded rows; not used below
trainDF = readDF[is_train_row].reset_index(drop=True)

# Assign the fold number to every training row in one vectorized lookup
# (float dtype, matching the column produced by the earlier row-wise assignment)
trainDF['fold'] = trainDF['expID'].map(fold_by_expID).astype(float)

# Keep only the rows whose sample is in the test list and reset the index
is_test_row = readDF['expID'].isin(test_sub_list)
indexDropTest = readDF.index[~is_test_row]  # labels of the discarded rows; not used below
testDF = readDF[is_test_row].reset_index(drop=True)

# Fail loudly if a selected sample has a molarity that class_mapping does not cover
for split_name, split_df in (('training', trainDF), ('test', testDF)):
    unlabeled_ids = split_df.loc[split_df['class'].isna(), 'expID'].unique()
    if len(unlabeled_ids):
        raise ValueError(
            f"{split_name} samples with a 'molarity' value missing from class_mapping: "
            f"{sorted(unlabeled_ids)}"
        )

# Construct feature matrices for training and testing: each signal column holds
# one array per row; the arrays are stacked into a 2-D block per column and the
# blocks are placed side by side in the order listed here
signal_columns = ["ND", "LP725", "LP780", "SP775"]
X_train = pd.DataFrame(data=np.hstack([np.vstack(trainDF[name].to_numpy()) for name in signal_columns]))
X_test = pd.DataFrame(data=np.hstack([np.vstack(testDF[name].to_numpy()) for name in signal_columns]))

# Create target arrays for training and testing datasets
y_train = np.array(trainDF['class'])
y_test = np.array(testDF['class'])

In [11]:
# Preview the first rows of the training split, including the 'class' and 'fold' columns
trainDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY,class,fold
0,20230808,DDM,11,"[20447.484316852737, 20372.55874534822, 20283....","[151.83997873471557, 151.7854864433812, 151.69...","[114.10393407761829, 114.15948963317385, 114.1...","[132.71105794790006, 132.63795853269536, 132.6...",1.0,0.67,720.0,...,2615.0,6315.0,92.0,77.0,0.317,1.14,21.822,1.582,2.0,0.0
1,20230808,DDM,12,"[18999.957318669807, 18871.351924587587, 18821...","[152.45037968054464, 152.07069913589945, 152.5...","[113.58051846032993, 113.68211573710396, 113.8...","[132.09269442262374, 132.11966483372612, 132.3...",1.0,0.67,720.0,...,4748.0,7910.0,71.0,62.0,0.249,1.134,21.822,1.583,2.0,0.0
2,20230808,DDM,13,"[19407.616959064326, 19321.50212652844, 19251....","[151.17198298777245, 150.85300372142478, 151.0...","[113.92769803296119, 113.65975544922914, 113.8...","[131.914673046252, 132.0515683147262, 131.8716...",1.0,0.67,720.0,...,2265.0,3911.0,52.0,81.0,0.302,1.138,21.822,1.583,2.0,0.0
3,20230808,DDM,14,"[18705.356376014664, 18647.27258444619, 18581....","[154.15213406650955, 153.9795758051846, 153.81...","[113.85362660382299, 113.92223095051061, 113.9...","[140.42183817753337, 140.06284367635507, 140.0...",1.0,0.67,720.0,...,4876.0,7427.0,72.0,62.0,0.186,1.126,21.822,1.583,2.0,0.0
4,20230808,DDM,21,"[20418.130662477088, 20331.922492799164, 20142...","[155.6394344069128, 155.58104215763288, 155.76...","[114.10395391463734, 114.14977742864625, 114.2...","[133.0738413197172, 133.00523697302958, 132.97...",1.0,0.67,720.0,...,6585.0,9521.0,71.0,63.0,0.325,1.139,21.899,1.581,2.0,0.0


In [12]:
# Preview the first rows of the test split (it has a 'class' column but no 'fold' column)
testDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,FF_backward,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY,class
0,20230822,DFN,11,"[19718.26812770563, 19626.41314935065, 19536.2...","[155.1461038961039, 155.21753246753246, 154.93...","[114.60984848484848, 114.55465367965368, 114.5...","[133.75703463203465, 133.6977813852814, 133.97...",1.0,0.67,720.0,...,72.55,6322.0,25091.0,67.0,54.0,0.077,1.106,21.668,1.587,2.0
1,20230822,DFN,12,"[18604.333333333332, 18508.16693722944, 18431....","[157.00676406926408, 156.54707792207793, 156.8...","[114.1853354978355, 114.33441558441558, 114.30...","[132.98863636363637, 133.05384199134198, 133.0...",1.0,0.67,720.0,...,71.1,5864.0,27093.0,76.0,57.0,0.051,1.099,21.515,1.593,2.0
2,20230822,DFN,13,"[18881.385281385283, 18792.37635281385, 18736....","[155.30790043290042, 155.19426406926408, 154.8...","[114.42045454545455, 114.4323593073593, 114.49...","[133.1025432900433, 132.97673160173161, 133.01...",1.0,0.67,720.0,...,71.0,6489.0,25172.0,67.0,55.0,0.091,1.106,21.822,1.583,2.0
3,20230822,DFN,14,"[18134.16693722944, 18112.218073593074, 18055....","[157.36525974025975, 157.56222943722943, 156.9...","[114.4512987012987, 114.34469696969697, 114.43...","[133.05194805194805, 132.90719696969697, 132.8...",1.0,0.67,720.0,...,72.86,7691.0,26516.0,64.0,53.0,0.073,1.104,21.668,1.588,2.0
4,20230822,DFN,21,"[21166.849296536795, 21117.14258658009, 21007....","[158.97808441558442, 158.8647186147186, 159.39...","[114.8465909090909, 114.89799783549783, 114.63...","[135.37310606060606, 135.41991341991343, 135.4...",1.0,0.67,720.0,...,71.46,5573.0,35199.0,72.0,58.0,0.062,1.101,21.668,1.587,2.0
