In [4]:
import pandas as pd

# create mock input data
# Specify the path to your Excel file
file_path = 'Etapa4 11.2024.xlsx'

# Read all sheets into a dictionary of DataFrames
all_sheets = pd.read_excel(file_path, sheet_name=None)


# Iterate through the dictionary of DataFrames
for sheet_name, df in all_sheets.items():
    # Add a new column with the sheet name
    df['SheetName'] = sheet_name
    
    # Assign the DataFrame to a variable named after the sheet name
    globals()[sheet_name] = df
    globals()[sheet_name].set_index('Primary ID', inplace=True)
    globals()[sheet_name].drop(columns=['Sec ID', 'SheetName'], inplace=True, errors='ignore')

In [5]:
X = globals()['Input1_Etapa4']
X.dropna(inplace=True)

# Check the number of columns
num_columns = X.shape[1]
print(f"Number of columns: {num_columns}")

# Create a header list
header = ["Primary ID"] + list(X.columns)

X.to_csv("X_mock.csv", index=True)


Number of columns: 1506


In [6]:
X

Unnamed: 0_level_0,API %,Plast %,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,Lipinski #violations,Ghose #violations,Veber #violations,Egan #violations,Muegge #violations,Bioavailability Score,PAINS #alerts,Brenk #alerts,Leadlikeness #violations,Synthetic Accessibility
Primary ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amlodipine_N1_1,5,0.0,0,0.7160,0.512656,112.2327,60.25982,6,6,53,...,0,0,0,0,0,0.55,0,1,2,4.39
Amlodipine_N2_1,15,0.0,0,0.7160,0.512656,112.2327,60.25982,6,6,53,...,0,0,0,0,0,0.55,0,1,2,4.39
Amlodipine_N3_1,25,0.0,0,0.7160,0.512656,112.2327,60.25982,6,6,53,...,0,0,0,0,0,0.55,0,1,2,4.39
Amlodipine_N4_1,5,7.5,0,0.7160,0.512656,112.2327,60.25982,6,6,53,...,0,0,0,0,0,0.55,0,1,2,4.39
Amlodipine_N5_1,15,7.5,0,0.7160,0.512656,112.2327,60.25982,6,6,53,...,0,0,0,0,0,0.55,0,1,2,4.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Spironolactona_N10_1,15,7.5,0,1.4012,1.963361,112.2170,69.68538,0,0,61,...,0,0,0,0,0,0.55,0,1,1,5.94
Spironolactona_N11_1,15,7.5,0,1.4012,1.963361,112.2170,69.68538,0,0,61,...,0,0,0,0,0,0.55,0,1,1,5.94
Cafeina_N2_1,15,0.0,0,-0.9595,0.920640,49.7123,26.75193,5,5,24,...,0,1,0,0,1,0.55,0,0,1,2.03
Cafeina_N5_1,15,7.5,0,-0.9595,0.920640,49.7123,26.75193,5,5,24,...,0,1,0,0,1,0.55,0,0,1,2.03


In [9]:
from sklearn.decomposition import PCA
import joblib

def infer(X):
    # Drop the columns only if they exist in the DataFrame
    columns_to_drop = ['Sec ID']
    X = X.drop(columns=[col for col in columns_to_drop if col in X.columns], errors='ignore')

    X.set_index('Primary ID', inplace=True)

    # Load the column names from the Excel files (assuming they are in the first row)
    padel_cols = pd.read_excel('Padel_cols.xlsx', header=None).iloc[0].dropna().tolist()
    spartan_cols = pd.read_excel('Spartan_cols.xlsx', header=None).iloc[0].dropna().tolist()
    swissadme_cols = pd.read_excel('Swissadme_cols.xlsx', header=None).iloc[0].dropna().tolist()

    # Load the preprocessing parameters
    zero_variance_columns_padel = joblib.load('padel_cols_to_drop.pkl')
    zero_variance_columns_swissadme = joblib.load('swissadme_cols_to_drop.pkl')
    zero_variance_columns_spartan = joblib.load('spartan_cols_to_drop.pkl')

    # Drop zero variance columns
    X_padel = X[padel_cols].drop(columns=zero_variance_columns_padel)
    X_swissadme = X[swissadme_cols].drop(columns=zero_variance_columns_swissadme)
    X_spartan = X[spartan_cols].drop(columns=zero_variance_columns_spartan)

    # load the values for standardization
    mean_padel = joblib.load('padel_mean.pkl')
    std_padel = joblib.load('padel_std.pkl')
    mean_swissadme = joblib.load('swissadme_mean.pkl')
    std_swissadme = joblib.load('swissadme_std.pkl')
    mean_spartan = joblib.load('spartan_mean.pkl')
    std_spartan = joblib.load('spartan_std.pkl')

    # standardize the molecular descriptor columns
    X_padel_standardized = (X_padel - mean_padel) / std_padel
    X_swissadme_standardized = (X_swissadme - mean_swissadme) / std_swissadme
    X_spartan_standardized = (X_spartan - mean_spartan) / std_spartan

    # load PCA axes
    pca_padel = joblib.load('padel_pca_fit.pkl')
    pca_swissadme = joblib.load('swissadme_pca_fit.pkl')
    pca_spartan = joblib.load('spartan_pca_fit.pkl')

    X_pca_padel = pca_padel.fit_transform(X_padel_standardized)
    X_pca_swissadme = pca_swissadme.fit_transform(X_swissadme_standardized)
    X_pca_spartan = pca_spartan.fit_transform(X_spartan_standardized)

    # Convert PCA results to DataFrames with 'Primary ID' as the index
    X_padel_pca_df = pd.DataFrame(X_pca_padel, index=X.index)
    X_spartan_pca_df = pd.DataFrame(X_pca_spartan, index=X.index)
    X_swissadme_pca_df = pd.DataFrame(X_pca_swissadme, index=X.index)

    other_features_cols = ["API %", "Plast %"]
    X_other_features = X[other_features_cols].set_index(X.index)

    # Normalize the rest of the columns
    non_pca_features_mean = joblib.load('non_pca_features_mean.pkl')
    non_pca_features_std = joblib.load('non_pca_features_std.pkl')
    X_standardized = (X_other_features - non_pca_features_mean) / non_pca_features_std
    
    # Merge the PCA-transformed data back together with the rest of the features
    X_final = pd.concat([X_padel_pca_df, X_spartan_pca_df, X_swissadme_pca_df, X_standardized], axis=1)
    print(X_final.columns)

    # Make sure all the column names are strings
    X_final.columns = X_final.columns.astype(str)

    # do inference
    RF_multiregressor = joblib.load('single_output_model.pkl')
    y_pred = RF_multiregressor.predict(X_final.values)

    return y_pred

    

In [10]:
import pandas as pd

X = pd.read_csv('X_mock.csv')
y_hat = infer(X)

Index([        0,         1,         2,         3,         4,         5,
               6,         7,         8,         0,         1,         2,
               3,         4,         5,         0,         1,         2,
               3,         4,         5,         6,   'API %', 'Plast %'],
      dtype='object')


In [11]:
y_hat

array([3.3       , 2.23666667, 2.18      , 4.68333333, 3.42735714,
       2.75      , 4.09333333, 3.08166667, 2.85      , 3.42735714,
       3.42735714, 4.87      , 4.81      , 4.77      , 4.96      ,
       4.875     , 4.63      , 4.56      , 4.25      , 3.66      ,
       4.875     , 4.875     , 3.42      , 2.70833333, 2.78      ,
       4.52      , 3.4695    , 3.2905    , 4.15      , 3.404     ,
       3.34      , 3.4695    , 3.4695    , 3.12      , 1.92033333,
       2.11333333, 4.70833333, 3.34552381, 3.02783333, 4.42333333,
       3.69533333, 3.62      , 3.34552381, 3.34552381, 3.31      ,
       2.54      , 2.65      , 4.51      , 3.07916667, 2.9875    ,
       4.1       , 3.17      , 3.17      , 3.07916667, 3.07916667,
       3.83      , 1.72      , 1.4       , 4.72      , 3.11833333,
       1.94666667, 4.9       , 4.3125    , 4.07      , 3.11833333,
       3.11833333, 3.58      , 2.70583333, 2.73      , 4.58      ,
       3.479     , 3.10916667, 4.05      , 3.0815    , 3.     

In [12]:
print(y_hat.shape)

(124,)


In [13]:
y_hat_one = infer(X.iloc[range(0,9), :])


Index([        0,         1,         2,         3,         4,         5,
               6,         7,         8,         0,         1,         2,
               3,         4,         5,         0,         1,         2,
               3,         4,         5,         6,   'API %', 'Plast %'],
      dtype='object')


In [14]:
print(X.head())

        Primary ID  API %  Plast %  nAcid  ALogP    ALogp2       AMR  \
0  Amlodipine_N1_1      5      0.0      0  0.716  0.512656  112.2327   
1  Amlodipine_N2_1     15      0.0      0  0.716  0.512656  112.2327   
2  Amlodipine_N3_1     25      0.0      0  0.716  0.512656  112.2327   
3  Amlodipine_N4_1      5      7.5      0  0.716  0.512656  112.2327   
4  Amlodipine_N5_1     15      7.5      0  0.716  0.512656  112.2327   

       apol  naAromAtom  nAromBond  ...  Lipinski #violations  \
0  60.25982           6          6  ...                     0   
1  60.25982           6          6  ...                     0   
2  60.25982           6          6  ...                     0   
3  60.25982           6          6  ...                     0   
4  60.25982           6          6  ...                     0   

   Ghose #violations  Veber #violations  Egan #violations  Muegge #violations  \
0                  0                  0                 0                   0   
1             