In [6]:
import pandas as pd
import numpy as np
import pickle
import os

def load_parameters(param_path):
    """Read the per-compound ``a`` / ``b`` coefficients from an Excel sheet.

    Args:
        param_path: Path of the Excel file. It must contain the columns
            ``CAS Number``, ``a`` and ``b``.

    Returns:
        dict mapping the stripped string form of each ``CAS Number`` to
        ``{'a': float, 'b': float}``.

    Raises:
        RuntimeError: For any failure (unreadable file, missing columns,
            non-numeric coefficient); the message carries the original error
            text.
    """
    try:
        table = pd.read_excel(param_path)

        # Column check: report every required column that is absent.
        needed = ['CAS Number', 'a', 'b']
        absent = set(needed).difference(table.columns)
        if absent:
            raise ValueError(f"The parameter file is missing necessary columns: {absent}")

        # iterrows() is kept on purpose: it yields each row as one Series,
        # so the string form of the CAS value is the same as before.
        return {
            str(record['CAS Number']).strip(): {
                'a': float(record['a']),
                'b': float(record['b']),
            }
            for _, record in table.iterrows()
        }

    except Exception as err:
        raise RuntimeError(f"Parameter loading failed: {str(err)}")

def calculate_logKOA(input_path, param_path, max_errors_shown=20):
    """Add a ``log KOA`` column to the input sheet, row by row.

    Each row is evaluated as ``log KOA = a + b / (273.15 + Temperature)``,
    where ``a`` and ``b`` are looked up by the row's ``CAS Number`` in the
    table returned by ``load_parameters(param_path)``.
    NOTE(review): the 273.15 offset suggests ``Temperature`` is in degrees
    Celsius - confirm against the input data.

    Rows that cannot be evaluated (unknown compound, empty or non-numeric
    temperature, division by zero) keep ``NaN`` and are listed in the printed
    report.

    Args:
        input_path: Path of the Excel file with at least the columns
            ``CAS Number`` and ``Temperature``.
        param_path: Path of the Excel parameter file.
        max_errors_shown: Maximum number of per-row error lines printed
            after the summary; ``None`` prints all of them.

    Returns:
        The input table as a DataFrame with the extra ``log KOA`` column, or
        ``None`` when processing failed as a whole (the reason is printed).
    """
    try:
        param_dict = load_parameters(param_path)
        df_input = pd.read_excel(input_path)

        required_input_columns = ['CAS Number', 'Temperature']
        missing = set(required_input_columns) - set(df_input.columns)
        if missing:
            raise ValueError(f"The input file is missing necessary columns: {missing}")

        df_input['log KOA'] = np.nan
        error_log = []

        for idx, row in df_input.iterrows():
            compound = str(row['CAS Number']).strip()
            T_value = row['Temperature']

            # idx + 2 turns the 0-based default index into the spreadsheet
            # row number (one header row, 1-based counting).
            if compound not in param_dict:
                error_log.append(f"Row {idx+2}: Undefined compound '{compound}'")
                continue

            try:
                T = float(T_value)
                if np.isnan(T):
                    raise ValueError("Temperature value is empty")
            except Exception as e:
                error_log.append(f"Row {idx+2}: Invalid Temperature value '{T_value}' - {str(e)}")
                continue

            a = param_dict[compound]['a']
            b = param_dict[compound]['b']

            try:
                logKOA = a + b / (273.15 + T)
                df_input.at[idx, 'log KOA'] = round(logKOA, 13)
            except ZeroDivisionError:
                error_log.append(f"Row {idx+2}: Zero Division Error (Temperature={T})")
            except Exception as e:
                error_log.append(f"Row {idx+2}: calculation error - {str(e)}")

        # Previously only the three header lines were printed, so the
        # collected row errors were never visible. Show them, capped so a
        # fully broken sheet does not flood the output.
        if max_errors_shown is None:
            shown_errors = error_log
        else:
            shown_errors = error_log[:max(0, max_errors_shown)]

        report = [
            f"Number of successfully processed rows in the process of calculating logKOA: {len(df_input) - len(error_log)}/{len(df_input)}",
            f"Total number of errors: {len(error_log)}",
            "Details of errors:"
        ] + shown_errors

        hidden_count = len(error_log) - len(shown_errors)
        if hidden_count > 0:
            report.append(f"... and {hidden_count} more error(s) not shown")

        print("\n".join(report))

        return df_input

    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Processing failed for calculating logKOA: {str(e)}")
        return None


if __name__ == "__main__":

    # Ask for both workbook paths; surrounding double quotes are stripped.
    input_excel = input("Please enter the path of the input data file (For example：Input.xlsx): ").strip('"')
    param_excel = input("Please enter the path of the Parameters file (For example：Parameters.xlsx): ").strip('"')

    try:
        logKPi_prediction_input = calculate_logKOA(
            input_path=input_excel,
            param_path=param_excel,
        )
    except FileNotFoundError:
        print("Error：The input file does not exist, please check the path!")
        # Stop here: the statements that follow need the result table.
        raise SystemExit(1)
    except Exception as e:
        print(f"Run Time Error: {str(e)}")
        raise SystemExit(1)

    # calculate_logKOA prints its own failure message and returns None; going
    # on would only fail later with an unrelated AttributeError.
    if logKPi_prediction_input is None:
        raise SystemExit("Stopping: the log KOA calculation did not produce a result.")
    


# Independent copies: one table that keeps every column for the later CPi
# step, and one holding only the three model features.
CPi_calculation_input = logKPi_prediction_input.copy()
input_data = logKPi_prediction_input[['Temperature', 'log GM (DPi)', 'log KOA']].copy()

# Row labels of the input table, used to align the predictions with it.
Original_input_indices = logKPi_prediction_input['Temperature'].index

# Per-feature minima / maxima used by preprocess(). The files are opened with
# context managers so the handles are closed right after loading.
# NOTE(security): unpickling can execute arbitrary code - only load .pkl
# files that come from a trusted source.
with open('min_3_features.pkl', 'rb') as min_file:
    min_val = pickle.load(min_file)
with open('max_3_features.pkl', 'rb') as max_file:
    max_val = pickle.load(max_file)
def preprocess(input_data, min_values=None, max_values=None):
    """Min-max scale the three model features of *input_data*.

    Each of ``Temperature``, ``log GM (DPi)`` and ``log KOA`` is replaced by
    ``(x - min) / (max - min)``. Other columns are left as they are and the
    caller's DataFrame is not modified.

    Args:
        input_data: DataFrame containing the three feature columns.
        min_values: Per-feature minima, indexable by column name. Defaults
            to the module-level ``min_val``.
        max_values: Per-feature maxima, indexable by column name. Defaults
            to the module-level ``max_val``.

    Returns:
        A scaled copy of *input_data*.
    """
    # Fall back to the module-level bounds only when none are passed in, so
    # existing single-argument calls behave exactly as before.
    lower = min_val if min_values is None else min_values
    upper = max_val if max_values is None else max_values

    preprocessed_data = input_data.copy()
    for col in ('Temperature', 'log GM (DPi)', 'log KOA'):
        x = preprocessed_data[col]
        preprocessed_data[col] = (x - lower[col]) / (upper[col] - lower[col])
    return preprocessed_data

  
# Scale the three feature columns with preprocess() (uses the module-level
# min_val / max_val).
preprocessed_data = preprocess(input_data)


# Load the pickled prediction model.
# NOTE(review): presumably an XGBoost model, judging by the file name - confirm.
# NOTE(security): unpickling can execute arbitrary code - only load trusted files.
with open('XGB_3_input_features.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
    


# Predict from the scaled features.
y_pred = loaded_model.predict(preprocessed_data)

# Store the predictions as a new column, aligned on the input table's row labels.
CPi_calculation_input["Predicted log KPi"] = pd.Series(y_pred, index=Original_input_indices)



def calculating_CPi(input_path, max_errors_shown=20):
    """Add a ``CPi_pred`` column computed as ``10 ** log KPi * CG * CPMi``.

    Rows with an empty or non-numeric ``Predicted log KPi``, ``CG`` or
    ``CPMi`` keep ``NaN`` and are listed in the printed report.

    Args:
        input_path: Despite its name, a pandas DataFrame (not a file path)
            with the columns ``Predicted log KPi``, ``CG`` and ``CPMi``. It
            is modified in place: the ``CPi_pred`` column is added to it.
        max_errors_shown: Maximum number of per-row error lines printed
            after the summary; ``None`` prints all of them.

    Returns:
        The same DataFrame object that was passed in, or ``None`` when
        processing failed as a whole (the reason is printed).
    """
    try:
        df_input = input_path

        required_input_columns = ['Predicted log KPi', 'CG', 'CPMi']
        missing = set(required_input_columns) - set(df_input.columns)
        if missing:
            raise ValueError(f"In the process of calculating CPi, the input file is missing necessary columns: {missing}")

        df_input['CPi_pred'] = np.nan
        error_log = []

        # (label used in messages, column name), validated in this order;
        # only the first invalid field of a row is reported.
        fields = (
            ('logKPi_pred', 'Predicted log KPi'),
            ('CG', 'CG'),
            ('CPMi', 'CPMi'),
        )

        for idx, row in df_input.iterrows():
            values = {}
            for label, column in fields:
                raw_value = row[column]
                try:
                    number = float(raw_value)
                    if np.isnan(number):
                        raise ValueError(f"{label} value is empty")
                except Exception as e:
                    # idx + 2 is the spreadsheet row number (one header row,
                    # 1-based counting) for a default 0-based index.
                    error_log.append(f"Row {idx+2}: Invalid {label} value '{raw_value}' - {str(e)}")
                    break
                values[label] = number

            if len(values) != len(fields):
                continue

            try:
                CPi_pred = (10 ** values['logKPi_pred']) * values['CG'] * values['CPMi']
                df_input.at[idx, 'CPi_pred'] = round(CPi_pred, 13)
            except Exception as e:
                error_log.append(f"Row {idx+2}: calculation error - {str(e)}")

        # Previously only the three header lines were printed, so the
        # collected row errors were never visible. Show them, capped so a
        # fully broken table does not flood the output.
        if max_errors_shown is None:
            shown_errors = error_log
        else:
            shown_errors = error_log[:max(0, max_errors_shown)]

        report = [
            f"Number of successfully processed rows in the process of calculating CPi: {len(df_input) - len(error_log)}/{len(df_input)}",
            f"Total number of errors: {len(error_log)}",
            "Details of errors:"
        ] + shown_errors

        hidden_count = len(error_log) - len(shown_errors)
        if hidden_count > 0:
            report.append(f"... and {hidden_count} more error(s) not shown")

        print("\n".join(report))

        return df_input

    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Processing failed for calculating CPi: {str(e)}")
        return None


if __name__ == "__main__":
    
    # calculating_CPi receives the DataFrame itself (its parameter is merely
    # named input_path) and adds the 'CPi_pred' column to it.
    File_with_CPi_pred = calculating_CPi(
        input_path=CPi_calculation_input
    )
    


def load_parameters(param_path):
    """Read the per-compound ``TEF`` values from an Excel sheet.

    This definition replaces the earlier ``load_parameters`` in this file
    (the one returning ``a`` / ``b``) from this point of execution onwards.

    Args:
        param_path: Path of the Excel file. It must contain the columns
            ``CAS Number`` and ``TEF``.

    Returns:
        dict mapping the stripped string form of each ``CAS Number`` to
        ``{'TEF': float}``.

    Raises:
        RuntimeError: For any failure (unreadable file, missing columns,
            non-numeric TEF); the message carries the original error text.
    """
    try:
        table = pd.read_excel(param_path)

        # Column check: report every required column that is absent.
        needed = ['CAS Number', 'TEF']
        absent = set(needed).difference(table.columns)
        if absent:
            raise ValueError(f"The parameter file is missing necessary columns: {absent}")

        # iterrows() is kept on purpose: it yields each row as one Series,
        # so the string form of the CAS value is the same as before.
        return {
            str(record['CAS Number']).strip(): {'TEF': float(record['TEF'])}
            for _, record in table.iterrows()
        }

    except Exception as err:
        raise RuntimeError(f"Parameter loading failed: {str(err)}")

def calculating_BaPeqij(input_path, param_path, max_errors_shown=20):
    """Add a ``BaPeqij`` column computed as ``CPi_pred * TEF`` per row.

    ``TEF`` is looked up by the row's ``CAS Number`` in the table returned by
    ``load_parameters(param_path)``. Rows with an unknown compound or an
    empty / non-numeric ``CPi_pred`` keep ``NaN`` and are listed in the
    printed report.

    Args:
        input_path: Despite its name, a pandas DataFrame (not a file path)
            with the columns ``CAS Number`` and ``CPi_pred``. It is modified
            in place: the ``BaPeqij`` column is added to it.
        param_path: Path of the Excel parameter file.
        max_errors_shown: Maximum number of per-row error lines printed
            after the summary; ``None`` prints all of them.

    Returns:
        The same DataFrame object that was passed in, or ``None`` when
        processing failed as a whole (the reason is printed).
    """
    try:
        param_dict = load_parameters(param_path)
        df_input = input_path

        required_input_columns = ['CAS Number', 'CPi_pred']
        missing = set(required_input_columns) - set(df_input.columns)
        if missing:
            raise ValueError(f"In the process of calculating BaPeqi of each PAH compound, the input file is missing necessary columns: {missing}")

        df_input['BaPeqij'] = np.nan
        error_log = []

        for idx, row in df_input.iterrows():
            compound = str(row['CAS Number']).strip()
            CPi_pred_value = row['CPi_pred']

            # idx + 2 is the spreadsheet row number (one header row, 1-based
            # counting) for a default 0-based index.
            if compound not in param_dict:
                error_log.append(f"Row {idx+2}: Undefined compound '{compound}'")
                continue

            try:
                CPi_pred = float(CPi_pred_value)
                if np.isnan(CPi_pred):
                    raise ValueError("CPi_pred value is empty")
            except Exception as e:
                error_log.append(f"Row {idx+2}: Invalid CPi_pred value '{CPi_pred_value}' - {str(e)}")
                continue

            TEF = param_dict[compound]['TEF']

            try:
                BaPeqij = CPi_pred * TEF
                df_input.at[idx, 'BaPeqij'] = round(BaPeqij, 13)
            except Exception as e:
                error_log.append(f"Row {idx+2}: Calculation error - {str(e)}")

        # Previously only the three header lines were printed, so the
        # collected row errors were never visible. Show them, capped so a
        # fully broken table does not flood the output.
        if max_errors_shown is None:
            shown_errors = error_log
        else:
            shown_errors = error_log[:max(0, max_errors_shown)]

        report = [
            f"Number of successfully processed rows in the process of calculating BaPeqi of each PAH compound: {len(df_input) - len(error_log)}/{len(df_input)}",
            f"Total number of errors: {len(error_log)}",
            "Details of errors:"
        ] + shown_errors

        hidden_count = len(error_log) - len(shown_errors)
        if hidden_count > 0:
            report.append(f"... and {hidden_count} more error(s) not shown")

        print("\n".join(report))

        return df_input

    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Processing failed for calculating BaPeqi of each PAH compound: {str(e)}")
        return None


if __name__ == "__main__":
    
    # The DataFrame produced by the CPi step is passed directly (the
    # parameter is merely named input_path); param_excel is the parameter
    # workbook path entered earlier.
    File_with_BaPeqij = calculating_BaPeqij(
        input_path=File_with_CPi_pred,
        param_path=param_excel
    )


def _add_regional_deposition(size_table):
    """Return a copy of *size_table* with the deposition columns appended.

    Expects the columns ``log GM (DPi)`` and ``BaPeqi``. Adds, in this order:
    ``GM (DPi)``, ``ln GM (DPi)``, ``IFi``, ``DFHAi``, ``DFTRi``, ``DFARi``,
    ``BaPeqHAi``, ``BaPeqTRi`` and ``BaPeqARi``.
    NOTE(review): the HA / TR / AR suffixes presumably denote respiratory
    regions (the export message speaks of "respiratory regions") - confirm.
    """
    regions = size_table.copy()

    # Diameter and its natural logarithm, derived from the log10 column.
    regions['GM (DPi)'] = 10 ** regions['log GM (DPi)']
    regions['ln GM (DPi)'] = np.log(regions['GM (DPi)'])
    diameter = regions['GM (DPi)']
    ln_diameter = regions['ln GM (DPi)']

    regions['IFi'] = 1 - 0.5 * (1 - 1 / (1 + 0.00076 * (diameter ** 2.8)))

    ha_terms = (
        1 / (1 + np.exp(6.84 + 1.183 * ln_diameter))
        + 1 / (1 + np.exp(0.924 - 1.885 * ln_diameter))
    )
    regions['DFHAi'] = regions['IFi'] * ha_terms

    tr_terms = (
        np.exp(-0.234 * (ln_diameter + 3.40) ** 2)
        + 63.9 * np.exp(-0.819 * (ln_diameter - 1.61) ** 2)
    )
    regions['DFTRi'] = (0.00352 / diameter) * tr_terms

    ar_terms = (
        np.exp(-0.416 * (ln_diameter + 2.84) ** 2)
        + 19.11 * np.exp(-0.482 * (ln_diameter - 1.362) ** 2)
    )
    regions['DFARi'] = (0.0155 / diameter) * ar_terms

    # Weight the summed BaPeq of each row by each deposition fraction.
    for suffix in ('HA', 'TR', 'AR'):
        regions[f'BaPeq{suffix}i'] = regions[f'DF{suffix}i'] * regions['BaPeqi']

    return regions


# Sum BaPeqij over all compounds sharing a (Temperature, log GM (DPi)) pair.
BaPeqi_pred = (
    File_with_BaPeqij
    .groupby(['Temperature', 'log GM (DPi)'], as_index=False)
    .agg(BaPeqi=('BaPeqij', 'sum'))
)

Calculating_BaPeq_regions = _add_regional_deposition(BaPeqi_pred)

# Collapse the rows of each temperature into one total per region.
BaPeq_particulate_regions = (
    Calculating_BaPeq_regions
    .groupby('Temperature')
    .agg(
        BaPeq_HA=('BaPeqHAi', 'sum'),
        BaPeq_TR=('BaPeqTRi', 'sum'),
        BaPeq_AR=('BaPeqARi', 'sum'),
    )
    .reset_index()
)

BaPeq_particulate_regions['BaPeq_of_particulate_PAHs'] = (
    BaPeq_particulate_regions['BaPeq_HA']
    + BaPeq_particulate_regions['BaPeq_TR']
    + BaPeq_particulate_regions['BaPeq_AR']
)

# Ask where to save, create the target folder if one is given, then export.
output_path = input("Please input the saving path of the output result (For example：Output.xlsx): ").strip('"')

output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

BaPeq_particulate_regions.to_excel(output_path, index=False)
print(f"The ouput result has been saved to: {output_path}")

Please enter the path of the input data file (For example：Input.xlsx):  Input data for integrated model.xlsx
Please enter the path of the Parameters file (For example：Parameters.xlsx):  Parameter file for integrated model.xlsx


Number of successfully processed rows in the process of calculating logKOA: 6020/6020
Total number of errors: 0
Detials of errors:
Number of successfully processed rows in the process of calculating CPi: 6020/6020
Total number of errors: 0
Detials of errors:
Number of successfully processed rows in the process of calculating BaPeqi of each PAH compound: 6020/6020
Total number of errors: 0
Details of errors:


Please input the saving path of the output result (For example：Output.xlsx):  BaPeq of particulate PAHs in different respiratory regions directly predicted by the integrated model.xlsx


The ouput result has been saved to: BaPeq of particulate PAHs in different respiratory regions directly predicted by the integrated model.xlsx
