In [1]:
import pandas as pd

# create mock input data
# Specify the path to your Excel file
file_path = 'Etapa3_DATE MODELATE_RF 11.2024.xlsx'

# Read all sheets into a dictionary of DataFrames
all_sheets = pd.read_excel(file_path, sheet_name=None)


# Iterate through the dictionary of DataFrames
for sheet_name, df in all_sheets.items():
    # Add a new column with the sheet name
    df['SheetName'] = sheet_name
    
    # Assign the DataFrame to a variable named after the sheet name
    globals()[sheet_name] = df
    globals()[sheet_name].set_index('Primary ID', inplace=True)
    globals()[sheet_name].drop(columns=['Sec ID', 'SheetName'], inplace=True)

In [2]:
X1 = globals()['Input1_Etapa3']
X2 = globals()['Input2_Etapa3']
X = pd.merge(X1, X2, left_index=True, right_index=True)
X.dropna(inplace=True)

# Check the number of columns
num_columns = X.shape[1]
print(f"Number of columns: {num_columns}")

# Create a header list
header = ["Primary ID"] + list(X.columns)

X.to_csv("X_mock.csv", index=True)


Number of columns: 1509


In [3]:
X

Unnamed: 0_level_0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,Bioavailability Score,PAINS #alerts,Brenk #alerts,Leadlikeness #violations,Synthetic Accessibility,API %,Plast %,ST-Diam (mm),3PBT-Diam (mm),3PBT-Radius (mm)
Primary ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BCS1_S1,0,0.7160,0.512656,112.2327,60.25982,6,6,53,28,25,...,0.55,0,1,2,4.39,5,0.0,1.99,1.88,0.940
BCS1_S2,0,0.7160,0.512656,112.2327,60.25982,6,6,53,28,25,...,0.55,0,1,2,4.39,5,0.0,1.91,1.77,0.885
BCS1_S3,0,0.7160,0.512656,112.2327,60.25982,6,6,53,28,25,...,0.55,0,1,2,4.39,5,0.0,1.91,1.82,0.910
BCS1_S4,0,0.7160,0.512656,112.2327,60.25982,6,6,53,28,25,...,0.55,0,1,2,4.39,5,0.0,1.91,1.91,0.955
BCS1_S5,0,0.7160,0.512656,112.2327,60.25982,6,6,53,28,25,...,0.55,0,1,2,4.39,5,0.0,1.90,1.83,0.915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PS-S16,1,1.8296,3.347436,83.8478,39.03872,12,12,30,19,11,...,0.85,0,0,1,2.23,25,15.0,1.78,1.86,0.930
PS-S17,1,1.8296,3.347436,83.8478,39.03872,12,12,30,19,11,...,0.85,0,0,1,2.23,25,15.0,1.86,1.86,0.930
PS-S18,1,1.8296,3.347436,83.8478,39.03872,12,12,30,19,11,...,0.85,0,0,1,2.23,25,15.0,1.92,1.83,0.915
PS-S19,1,1.8296,3.347436,83.8478,39.03872,12,12,30,19,11,...,0.85,0,0,1,2.23,25,15.0,1.84,1.89,0.945


In [4]:
from sklearn.decomposition import PCA
import joblib

def infer(X):
    # Drop the columns only if they exist in the DataFrame
    columns_to_drop = ['Sec ID']
    X = X.drop(columns=[col for col in columns_to_drop if col in X.columns], errors='ignore')

    X.set_index('Primary ID', inplace=True)

    # Load the column names from the Excel files (assuming they are in the first row)
    padel_cols = pd.read_excel('Padel_cols.xlsx', header=None).iloc[0].dropna().tolist()
    spartan_cols = pd.read_excel('Spartan_cols.xlsx', header=None).iloc[0].dropna().tolist()
    swissadme_cols = pd.read_excel('Swissadme_cols.xlsx', header=None).iloc[0].dropna().tolist()

    # Load the preprocessing parameters
    zero_variance_columns_padel = joblib.load('padel_cols_to_drop.pkl')
    zero_variance_columns_swissadme = joblib.load('swissadme_cols_to_drop.pkl')
    zero_variance_columns_spartan = joblib.load('spartan_cols_to_drop.pkl')

    # Drop zero variance columns
    X_padel = X[padel_cols].drop(columns=zero_variance_columns_padel)
    X_swissadme = X[swissadme_cols].drop(columns=zero_variance_columns_swissadme)
    X_spartan = X[spartan_cols].drop(columns=zero_variance_columns_spartan)

    # load the values for standardization
    mean_padel = joblib.load('padel_mean.pkl')
    std_padel = joblib.load('padel_std.pkl')
    mean_swissadme = joblib.load('swissadme_mean.pkl')
    std_swissadme = joblib.load('swissadme_std.pkl')
    mean_spartan = joblib.load('spartan_mean.pkl')
    std_spartan = joblib.load('spartan_std.pkl')

    # standardize the molecular descriptor columns
    X_padel_standardized = (X_padel - mean_padel) / std_padel
    X_swissadme_standardized = (X_swissadme - mean_swissadme) / std_swissadme
    X_spartan_standardized = (X_spartan - mean_spartan) / std_spartan

    # load PCA axes
    pca_padel = joblib.load('padel_pca_fit.pkl')
    pca_swissadme = joblib.load('swissadme_pca_fit.pkl')
    pca_spartan = joblib.load('spartan_pca_fit.pkl')

    X_pca_padel = pca_padel.fit_transform(X_padel_standardized)
    X_pca_swissadme = pca_swissadme.fit_transform(X_swissadme_standardized)
    X_pca_spartan = pca_spartan.fit_transform(X_spartan_standardized)

    # Convert PCA results to DataFrames with 'Primary ID' as the index
    X_padel_pca_df = pd.DataFrame(X_pca_padel, index=X.index)
    X_spartan_pca_df = pd.DataFrame(X_pca_spartan, index=X.index)
    X_swissadme_pca_df = pd.DataFrame(X_pca_swissadme, index=X.index)

    other_features_cols = ["API %", "Plast %", "ST-Diam (mm)", "3PBT-Diam (mm)", "3PBT-Radius (mm)"]
    X_other_features = X[other_features_cols].set_index(X.index)

    # Normalize the rest of the columns
    non_pca_features_mean = joblib.load('non_pca_features_mean.pkl')
    non_pca_features_std = joblib.load('non_pca_features_std.pkl')
    X_standardized = (X_other_features - non_pca_features_mean) / non_pca_features_std
    
    # Merge the PCA-transformed data back together with the rest of the features
    X_final = pd.concat([X_padel_pca_df, X_spartan_pca_df, X_swissadme_pca_df, X_standardized], axis=1)
    print(X_final.columns)

    # Make sure all the column names are strings
    X_final.columns = X_final.columns.astype(str)

    # do inference
    RF_multiregressor = joblib.load('multi_output_model.pkl')
    y_pred = RF_multiregressor.predict(X_final.values)

    return y_pred

    

In [5]:
import pandas as pd

X = pd.read_csv('X_mock.csv')
y_hat = infer(X)

Index([                 0,                  1,                  2,
                        3,                  4,                  5,
                        6,                  7,                  8,
                        0,                  1,                  2,
                        3,                  4,                  5,
                        6,                  0,                  1,
                        2,                  3,                  4,
                        5,                  6,            'API %',
                'Plast %',     'ST-Diam (mm)',   '3PBT-Diam (mm)',
       '3PBT-Radius (mm)'],
      dtype='object')


In [9]:
y_hat

array([[8.62945000e+03, 2.10775000e+03, 6.77260000e+03, ...,
        7.54904736e+00, 3.46230000e+00, 5.79964312e+00],
       [8.40015000e+03, 1.79475000e+03, 6.38630000e+03, ...,
        8.33097504e+00, 2.72560000e+00, 4.66601192e+00],
       [7.88515000e+03, 1.67585000e+03, 5.82120000e+03, ...,
        8.34000298e+00, 2.66580000e+00, 5.50026754e+00],
       ...,
       [2.35075000e+03, 5.92450000e+02, 1.82780000e+03, ...,
        6.67291200e+01, 1.87080000e+00, 2.53568998e+00],
       [2.33250000e+03, 5.39100000e+02, 1.90640000e+03, ...,
        6.61814400e+01, 1.82490000e+00, 2.47759233e+00],
       [2.23730000e+03, 4.51300000e+02, 1.73375000e+03, ...,
        6.71510400e+01, 2.29090000e+00, 2.28866632e+00]])

In [6]:
print(y_hat.shape)

(598, 13)


In [7]:
y_hat_one = infer(X.iloc[range(0,9), :])


Index([                 0,                  1,                  2,
                        3,                  4,                  5,
                        6,                  7,                  8,
                        0,                  1,                  2,
                        3,                  4,                  5,
                        6,                  0,                  1,
                        2,                  3,                  4,
                        5,                  6,            'API %',
                'Plast %',     'ST-Diam (mm)',   '3PBT-Diam (mm)',
       '3PBT-Radius (mm)'],
      dtype='object')


In [8]:
print(X.head())

  Primary ID  nAcid  ALogP    ALogp2       AMR      apol  naAromAtom  \
0    BCS1_S1      0  0.716  0.512656  112.2327  60.25982           6   
1    BCS1_S2      0  0.716  0.512656  112.2327  60.25982           6   
2    BCS1_S3      0  0.716  0.512656  112.2327  60.25982           6   
3    BCS1_S4      0  0.716  0.512656  112.2327  60.25982           6   
4    BCS1_S5      0  0.716  0.512656  112.2327  60.25982           6   

   nAromBond  nAtom  nHeavyAtom  ...  Bioavailability Score  PAINS #alerts  \
0          6     53          28  ...                   0.55              0   
1          6     53          28  ...                   0.55              0   
2          6     53          28  ...                   0.55              0   
3          6     53          28  ...                   0.55              0   
4          6     53          28  ...                   0.55              0   

   Brenk #alerts  Leadlikeness #violations  Synthetic Accessibility  API %  \
0              1    