In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the HDF5 data file, relative to the notebook's working directory
filename = "data/data.h5"

In [3]:
# Read the HDF5 file (key 'df') into a DataFrame
readDF = pd.read_hdf(filename, 'df')

# Keep only runs dated strictly after 20230724 that use the standard ratio (1)
# and the standard molarity (0.67).
# Before 20230724 there were problems with other processes in the fabrication.
# Fluctuations in processes other than the monitored perovskite formation lead
# to a lot of variation in the solar cell performance. Since this variation is
# not caused by the perovskite layer, ML models trained on data acquired during
# the perovskite step cannot possibly know about it. Because of this, a
# threshold was set at this date.
# The explicit .copy() makes readDF an independent frame, so the column
# assignments below never write into a filtered slice of the raw data.
readDF = readDF[
    (readDF["date"] > 20230724)
    & (readDF["ratio"] == 1)
    & (readDF["molarity"] == 0.67)
].copy()

# List of signal types to process
signals = ["ND", "LP725", "LP780", "SP775"]

# Common length that every signal is zero-padded to
SIGNAL_LENGTH = 1979


def _pad_signal(values, target_length=SIGNAL_LENGTH, fill_value=0):
    """Return `values` as an array right-padded with `fill_value`.

    Parameters
    ----------
    values : sequence or numpy.ndarray
        One recorded signal trace.
    target_length : int
        Length the trace is padded to.
    fill_value : scalar
        Constant appended at the end of the trace.

    Returns
    -------
    numpy.ndarray
        The padded trace. A trace that already has `target_length` or more
        points is returned unpadded; it is NOT truncated.
    """
    n_missing = max(target_length - len(values), 0)
    return np.concatenate((values, [fill_value] * n_missing))


# Pad the signals in the DataFrame so that they all have the same length
for signal in signals:
    readDF[signal] = readDF[signal].apply(_pad_signal)
        
# Define samples that will be used as training data.
# Every sample listed here is excluded from the test set.
train_samples = [
    'DHM', 'DHN', 'DHO', 'DHP', 'DHQ', 'DHR',
    'DGS', 'DGT', 'DGU', 'DGV', 'DGW', 'DGX', 'DHS',
    'DFS', 'DFT', 'DFU', 'DFV', 'DFW', 'DFX', 'DGR',
    'DEV', 'DEW', 'DEX', 'DFQ', 'DFR', 'DGP', 'DGQ',
    'DAC', 'DAH', 'DAM', 'DBC', 'DBH', 'DBM', 'DBR', 'DCA', 'DCI', 'DDM', 'DDN', 'DDO',
    'DDP', 'DDQ', 'DDR', 'DDS', 'DDT', 'DDU', 'DDV', 'DDW', 'DDX', 'DEM', 'DEN',
    'DES', 'DET', 'DEU', 'DFO', 'DFP', 'DGM', 'DHV', 'DHW', 'DHX', 'DIU', 'DIV',
]

# Define sub-lists for the different training folds of the 5-fold cross-validation
train_fold0_sub_list = ['DHM', 'DHR', 'DGW', 'DFU', 'DEV', 'DGP', 'DFO', 'DHX', 'DDO', 'DDT', 'DEM']
train_fold1_sub_list = ['DHN', 'DGS', 'DGX', 'DFV', 'DEW', 'DGQ', 'DFP', 'DIU', 'DDP', 'DDU', 'DEN']
train_fold2_sub_list = ['DHO', 'DGT', 'DHS', 'DFW', 'DEX', 'DES', 'DGM', 'DIV', 'DDQ', 'DDV']
train_fold3_sub_list = ['DHP', 'DGU', 'DFS', 'DFX', 'DFQ', 'DET', 'DHV', 'DDM', 'DDR', 'DDW']
train_fold4_sub_list = ['DHQ', 'DGV', 'DFT', 'DGR', 'DFR', 'DEU', 'DHW', 'DDN', 'DDS', 'DDX']

# Map every cross-validation sample to its fold number
# (the position of a sub-list in this list is its fold id).
fold_sub_lists = [
    train_fold0_sub_list,
    train_fold1_sub_list,
    train_fold2_sub_list,
    train_fold3_sub_list,
    train_fold4_sub_list,
]
fold_of_sample = {
    sample: fold
    for fold, sub_list in enumerate(fold_sub_lists)
    for sample in sub_list
}

# Sanity checks on the hand-written lists above.
assert len(fold_of_sample) == sum(len(sub_list) for sub_list in fold_sub_lists), \
    "a sample is listed more than once across the fold lists"
assert set(fold_of_sample) <= set(train_samples), \
    "a fold sample is missing from train_samples and would leak into the test set"

# Drop samples not in any fold from the training DataFrame and reset its index.
# Samples that are in `train_samples` but in no fold end up in neither
# trainDF nor testDF.
indexDropTrain = readDF[~readDF['expID'].isin(list(fold_of_sample))].index
trainDF = readDF.drop(indexDropTrain).reset_index(drop=True)

# Assign the fold number to each sample in trainDF. The column is stored as
# float (0.0 ... 4.0), the dtype the previous row-by-row assignment produced.
trainDF["fold"] = trainDF["expID"].map(fold_of_sample).astype(float)

# Drop the train samples from the DataFrame to create a test set and reset its index
indexDropTest = readDF[readDF['expID'].isin(train_samples)].index
testDF = readDF.drop(indexDropTest).reset_index(drop=True)

def filter(df, thres_PCE=1):
    """Keep only the rows of `df` that have a usable 'PCE_backward' value.

    NOTE: this function shadows Python's built-in `filter` for the rest of the
    notebook; the name is kept because existing cells call it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'PCE_backward' column.
    thres_PCE : float, default 1
        Rows are kept only when 'PCE_backward' is strictly greater than this
        value. When the threshold is not positive, no threshold is applied and
        only rows with a missing 'PCE_backward' are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `df` itself is not modified.
    """
    # Rows where 'PCE_backward' is available
    keep = df['PCE_backward'].notna()

    # Optionally require 'PCE_backward' to exceed the threshold
    if thres_PCE > 0:
        keep = keep & (df['PCE_backward'] > thres_PCE)

    return df[keep].reset_index(drop=True)

# Run both splits through `filter`, so that each one keeps only the entries
# whose 'PCE_backward' value is present and above the threshold.
trainDF = filter(trainDF)
testDF = filter(testDF)

In [6]:
# Preview the first rows of the filtered training set; the trailing 'fold' column holds the cross-validation fold.
trainDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,FF_backward,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY,fold
0,20230808,DDM,11,"[20447.484316852737, 20372.55874534822, 20283....","[151.83997873471557, 151.7854864433812, 151.69...","[114.10393407761829, 114.15948963317385, 114.1...","[132.71105794790006, 132.63795853269536, 132.6...",1.0,0.67,720.0,...,61.57,2615.0,6315.0,92.0,77.0,0.317,1.14,21.822,1.582,3.0
1,20230808,DDM,12,"[18999.957318669807, 18871.351924587587, 18821...","[152.45037968054464, 152.07069913589945, 152.5...","[113.58051846032993, 113.68211573710396, 113.8...","[132.09269442262374, 132.11966483372612, 132.3...",1.0,0.67,720.0,...,65.2,4748.0,7910.0,71.0,62.0,0.249,1.134,21.822,1.583,3.0
2,20230808,DDM,13,"[19407.616959064326, 19321.50212652844, 19251....","[151.17198298777245, 150.85300372142478, 151.0...","[113.92769803296119, 113.65975544922914, 113.8...","[131.914673046252, 132.0515683147262, 131.8716...",1.0,0.67,720.0,...,55.0,2265.0,3911.0,52.0,81.0,0.302,1.138,21.822,1.583,3.0
3,20230808,DDM,14,"[18705.356376014664, 18647.27258444619, 18581....","[154.15213406650955, 153.9795758051846, 153.81...","[113.85362660382299, 113.92223095051061, 113.9...","[140.42183817753337, 140.06284367635507, 140.0...",1.0,0.67,720.0,...,65.66,4876.0,7427.0,72.0,62.0,0.186,1.126,21.822,1.583,3.0
4,20230808,DDM,21,"[20418.130662477088, 20331.922492799164, 20142...","[155.6394344069128, 155.58104215763288, 155.76...","[114.10395391463734, 114.14977742864625, 114.2...","[133.0738413197172, 133.00523697302958, 132.97...",1.0,0.67,720.0,...,65.18,6585.0,9521.0,71.0,63.0,0.325,1.139,21.899,1.581,3.0


In [7]:
# Preview the first rows of the filtered test set (it carries no 'fold' column).
testDF.head()

Unnamed: 0,date,expID,patchID,ND,LP725,LP780,SP775,ratio,molarity,evac_duration,...,FF_forward,FF_backward,RSHUNT_forward,RSHUNT_backward,RS_forward,RS_backward,PLQY,iVOC,jscPLQY,egPLQY
0,20230816,DEO,11,"[20062.38490164806, 19955.747208931418, 19823....","[156.744019138756, 156.47315257841575, 156.626...","[114.41414141414141, 114.349548112706, 114.306...","[131.85300372142478, 131.85885167464116, 131.9...",1.0,0.67,720.0,...,61.13,67.58,10746.0,44120.0,102.0,77.0,0.107,1.103,22.054,1.576
1,20230816,DEO,12,"[18850.19750132908, 18760.504253056886, 18551....","[153.81977671451355, 154.26847421584264, 154.4...","[114.26129718234981, 114.25784157363104, 114.1...","[131.14566719829878, 131.10845295055822, 130.8...",1.0,0.67,720.0,...,55.88,65.57,7747.0,311854.0,101.0,68.0,0.075,1.1,21.822,1.583
2,20230816,DEO,13,"[18961.158958001062, 18843.721424774056, 18745...","[156.1711855396066, 156.03987240829346, 156.06...","[114.54518872939926, 114.3705475810739, 114.48...","[131.06299840510366, 131.15071770334927, 130.8...",1.0,0.67,720.0,...,56.16,65.78,6847.0,31268.0,91.0,68.0,0.182,1.119,21.899,1.58
3,20230816,DEO,14,"[18534.28734715577, 18422.086124401914, 18322....","[153.06246677299308, 153.3370547581074, 153.07...","[114.42583732057416, 114.31472620946305, 114.3...","[130.62227538543328, 130.8910154173312, 130.72...",1.0,0.67,720.0,...,61.82,68.89,11783.0,42474.0,79.0,62.0,0.12,1.104,22.054,1.576
4,20230816,DEO,31,"[20208.205741626793, 20070.29691653376, 19943....","[159.99096225412015, 159.58373205741626, 160.0...","[114.42530568846358, 114.40111642743221, 114.3...","[132.69803296119085, 132.34051036682615, 132.5...",1.0,0.67,720.0,...,61.08,67.42,10293.0,40024.0,91.0,73.0,0.115,1.11,21.822,1.582
