In [22]:
import pandas as pd

# Load the main dataset
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

X.set_index('Primary ID', inplace=True)
y.set_index('Primary ID', inplace=True)

# Load the column names from the Excel files (assuming they are in the first row)
padel_cols = pd.read_excel('Padel_cols.xlsx', header=None).iloc[0].dropna().tolist()
spartan_cols = pd.read_excel('Spartan_cols.xlsx', header=None).iloc[0].dropna().tolist()
swissadme_cols = pd.read_excel('Swissadme_cols.xlsx', header=None).iloc[0].dropna().tolist()


Check non-numeric data

In [23]:
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
non_numeric_columns

Index([], dtype='object')

Check rows with nans

In [24]:
def check_nans(data):
    rows_with_nans = data.isnull().any(axis=1)
    num_rows_with_nans = rows_with_nans.sum()
    total_rows = len(data)
    fraction_rows_with_nans = num_rows_with_nans / total_rows

    print(f"Number of rows with NaNs: {num_rows_with_nans}")
    print(f"Fraction of rows with NaNs: {fraction_rows_with_nans:.2f}")

    # Identify columns with NaNs
    columns_with_nans = data.columns[data.isnull().any()].tolist()
    num_columns_with_nans = len(columns_with_nans)

    print(f"Columns with NaNs: {columns_with_nans}")

    # Print detailed information about NaNs in each column
    nan_info = data.isnull().sum()
    print("\nDetailed NaN information:")
    print(nan_info[nan_info > 0])
    print("Number of columns with nan values:")
    print(f"{num_columns_with_nans} columns out of the total {len(data.columns)} columns")

In [25]:
from sklearn.decomposition import PCA
import numpy as np
import joblib

def perform_pca(X_subset, subset_name, explained_variance_threshold=0.95):
    # Print data nans before standardization
    print("NANs before standardization")
    check_nans(X_subset)

    # Drop zero variance columns
    zero_variance_columns = X_subset.loc[:, X_subset.std() == 0].columns
    X_subset = X_subset.drop(columns=zero_variance_columns)
    # save the columns to drop
    joblib.dump(zero_variance_columns, subset_name + '_cols_to_drop.pkl')
    
    # Standardize the data if necessary
    mean = X_subset.mean()
    std = X_subset.std()
    X_standardized = (X_subset - mean) / std
    # save the values for standardization
    joblib.dump(mean, subset_name + '_mean.pkl')
    joblib.dump(std, subset_name + '_std.pkl')

    # Print data nans before standardization
    print("NANs after standardization")
    check_nans(X_standardized)

    # Initialize PCA
    pca = PCA()

    # Fit PCA
    pca.fit(X_standardized)

    # Calculate cumulative explained variance
    cum_var_explained = np.cumsum(pca.explained_variance_ratio_)
    
    # Determine the number of components needed to reach the explained variance threshold
    num_components = np.argmax(cum_var_explained >= explained_variance_threshold) + 1
    
    # Apply PCA with the selected number of components
    pca = PCA(n_components=num_components)
    X_pca = pca.fit_transform(X_standardized)

    # Save the scaler
    joblib.dump(pca, subset_name + '_pca_fit.pkl')
    
    return X_pca, num_components, pca.explained_variance_ratio_

# Perform PCA on each set of columns
X_padel_pca, padel_n_components, padel_variance_ratio = perform_pca(X[padel_cols], subset_name='padel')
X_spartan_pca, spartan_n_components, spartan_variance_ratio = perform_pca(X[spartan_cols], subset_name='spartan')
X_swissadme_pca, swissadme_n_components, swissadme_variance_ratio = perform_pca(X[swissadme_cols], subset_name='swissadme')

# Display the number of components retained
print(f"Padel: {padel_n_components} components retained")
print(f"Spartan: {spartan_n_components} components retained")
print(f"SwissADME: {swissadme_n_components} components retained")


NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1444 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1083 columns
NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs before standardization
Number of rows with NaNs: 

Remove NaN rows

After performing PCA, merge the datasets back together

In [26]:
# Convert PCA results to DataFrames with 'Primary ID' as the index
X_padel_pca_df = pd.DataFrame(X_padel_pca, index=X.index)
X_spartan_pca_df = pd.DataFrame(X_spartan_pca, index=X.index)
X_swissadme_pca_df = pd.DataFrame(X_swissadme_pca, index=X.index)

# Ensure that the other features DataFrame is also indexed by 'Primary ID'
other_features_cols = ["API dose", "API %", "Plast %", "%5min", "%10min", "%15min", "%30min", "%60min", "%120min", "%180min", "%240min"]
X_other_features = X[other_features_cols].set_index(X.index)
print(f"Other features: {X_other_features}")

# Normalize the rest of the columns
mean = X_other_features.mean()
std = X_other_features.std()
X_standardized = (X_other_features - mean) / std
# save the values for standardization
joblib.dump(mean, 'non_pca_features_mean.pkl')
joblib.dump(std, 'non_pca_features_std.pkl')

# Merge the PCA-transformed data back together with the rest of the features
X_final = pd.concat([X_padel_pca_df, X_spartan_pca_df, X_swissadme_pca_df, X_standardized], axis=1)

# Make sure all the column names are strings
X_final.columns = X_final.columns.astype(str)

# Verify the final DataFrame shape and columns
print(f"Final DataFrame shape: {X_final.shape}")
print(X_final)

Other features:                  API dose  API %  Plast %      %5min     %10min     %15min  \
Primary ID                                                                   
Amlodipine_N1_1      7.90      5      0.0   6.824995  10.599703  14.395172   
Amlodipine_N1_2      9.30      5      0.0  13.240678  16.853359  20.485702   
Amlodipine_N2_1     23.40     15      0.0   3.087206   5.356809   7.638926   
Amlodipine_N2_2     22.05     15      0.0  13.442931  17.108792  20.794604   
Amlodipine_N4_1      8.25      5      7.5   4.643496   9.536276  14.456096   
...                   ...    ...      ...        ...        ...        ...   
Diclofenac_N2_2     25.95     15      0.0   5.927844   9.993104  14.071073   
Diclofenac_N4_1      7.55      5      7.5   7.781062  10.656150  15.479299   
Diclofenac_N4_2      8.10      5      7.5   8.525924  16.088311  20.369692   
Diclofenac_N5_1     24.30     15      7.5   7.800232  11.797752  16.138128   
Diclofenac_N5_2     23.40     15      7.5   9.08

Check for duplicates in the column names between the 3 data sources (Padel, Spartan, SwissADME)

In [27]:
X_final.index

Index(['Amlodipine_N1_1', 'Amlodipine_N1_2', 'Amlodipine_N2_1',
       'Amlodipine_N2_2', 'Amlodipine_N4_1', 'Amlodipine_N4_2',
       'Amlodipine_N5_1', 'Amlodipine_N5_2', 'Amlodipine_N10_1',
       'Amlodipine_N10_2',
       ...
       'Atenolol_N10_1', 'Atenolol_N10_2', 'Atenolol_N11_1', 'Atenolol_N11_2',
       'Diclofenac_N2_1', 'Diclofenac_N2_2', 'Diclofenac_N4_1',
       'Diclofenac_N4_2', 'Diclofenac_N5_1', 'Diclofenac_N5_2'],
      dtype='object', name='Primary ID', length=222)

In [28]:
X_final.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '0', '1', '2', '3', '4',
       '5', '0', '1', '2', '3', '4', '5', '6', 'API dose', 'API %', 'Plast %',
       '%5min', '%10min', '%15min', '%30min', '%60min', '%120min', '%180min',
       '%240min'],
      dtype='object')

In [29]:
y.index

Index(['Amlodipine_N1_1', 'Amlodipine_N1_2', 'Amlodipine_N2_1',
       'Amlodipine_N2_2', 'Amlodipine_N4_1', 'Amlodipine_N4_2',
       'Amlodipine_N5_1', 'Amlodipine_N5_2', 'Amlodipine_N10_1',
       'Amlodipine_N10_2',
       ...
       'Atenolol_N10_1', 'Atenolol_N10_2', 'Atenolol_N11_1', 'Atenolol_N11_2',
       'Diclofenac_N2_1', 'Diclofenac_N2_2', 'Diclofenac_N4_1',
       'Diclofenac_N4_2', 'Diclofenac_N5_1', 'Diclofenac_N5_2'],
      dtype='object', name='Primary ID', length=222)

In [30]:
y.columns

Index(['Temp Z1', 'Temp Z2', 'Temp Z3', 'Temp Z4', 'Temp Z5', 'Screw speed'], dtype='object')

In [31]:
X_final.to_csv("X_PCA.csv")
y.to_csv("y_PCA.csv")

In [32]:
X_final

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,0,...,API %,Plast %,%5min,%10min,%15min,%30min,%60min,%120min,%180min,%240min
Primary ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amlodipine_N1_1,21.038401,5.392050,-6.014286,0.904203,-2.571425,10.451831,2.280783,7.892633,1.552963,3.661798,...,-1.332252,-1.332461,-0.062991,-0.265398,-0.361737,-0.943323,-1.157887,-1.662517,-2.266260,-2.632925
Amlodipine_N1_2,21.038401,5.392050,-6.014286,0.904203,-2.571425,10.451831,2.280783,7.892633,1.552963,3.661798,...,-1.332252,-1.332461,1.681166,0.995101,0.593867,-0.848942,-1.000840,-0.904451,-1.579811,-2.073213
Amlodipine_N2_1,21.038401,5.392050,-6.014286,0.904203,-2.571425,10.451831,2.280783,7.892633,1.552963,3.661798,...,0.062842,-1.332461,-1.079141,-1.322166,-1.421792,-1.540447,-1.496968,-1.931493,-2.425500,-2.638783
Amlodipine_N2_2,21.038401,5.392050,-6.014286,0.904203,-2.571425,10.451831,2.280783,7.892633,1.552963,3.661798,...,0.062842,-1.332461,1.736150,1.046586,0.642334,-1.370477,-1.455579,-1.883627,-2.717351,-2.900448
Amlodipine_N4_1,21.038401,5.392050,-6.014286,0.904203,-2.571425,10.451831,2.280783,7.892633,1.552963,3.661798,...,-1.332252,0.037013,-0.656051,-0.479745,-0.352178,-1.194768,-1.259775,-1.903822,-2.311200,-2.414435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Diclofenac_N2_2,-0.440269,1.172098,-1.724177,8.433101,-11.788265,-3.305513,-1.183039,-1.464674,9.176554,-0.052923,...,0.062842,-1.332461,-0.306889,-0.387665,-0.412588,-0.288815,-0.232772,-0.009871,-0.540551,-0.596400
Diclofenac_N4_1,-0.440269,1.172098,-1.724177,8.433101,-11.788265,-3.305513,-1.183039,-1.464674,9.176554,-0.052923,...,-1.332252,0.037013,0.196923,-0.254021,-0.191638,-0.000825,0.004343,0.647778,0.629468,0.443946
Diclofenac_N4_2,-0.440269,1.172098,-1.724177,8.433101,-11.788265,-3.305513,-1.183039,-1.464674,9.176554,-0.052923,...,-1.332252,0.037013,0.399421,0.840896,0.575665,0.189010,0.385057,0.661094,0.365010,0.142634
Diclofenac_N5_1,-0.440269,1.172098,-1.724177,8.433101,-11.788265,-3.305513,-1.183039,-1.464674,9.176554,-0.052923,...,0.062842,0.037013,0.202135,-0.023917,-0.088267,-0.284606,-0.014914,0.156705,-0.257140,-0.556943


# Train Random Forest Multiregressor

In [33]:
# test-train split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the base model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a multi-output regressor
multi_output_model = MultiOutputRegressor(base_model)

# Train the model
multi_output_model.fit(X_train, y_train)

Evaluate the model

In [35]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='variance_weighted')

# Print MSE and target variable names
for target_name, mse_value in zip(y_test.columns, mse):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}")

print(f"\nOverall R^2 Score: {r2}")


Target: Temp Z1, Mean Squared Error: 16.129571111111105
Target: Temp Z2, Mean Squared Error: 23.72357125555557
Target: Temp Z3, Mean Squared Error: 18.91315580246913
Target: Temp Z4, Mean Squared Error: 14.762472283333315
Target: Temp Z5, Mean Squared Error: 15.177010833333341
Target: Screw speed, Mean Squared Error: 391.6386748567901

Overall R^2 Score: 0.7867220666614736


In [36]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')  # MSE for each target
r2_per_target = r2_score(y_test, y_pred, multioutput='raw_values')  # R² for each target
r2_overall = r2_score(y_test, y_pred, multioutput='variance_weighted')  # Overall weighted R²

# Calculate variance or range of each target for relative performance
variance_targets = np.var(y_test, axis=0)  # Variance of each target in test set
range_targets = np.ptp(y_test, axis=0)  # Range (max-min) of each target in test set

# Print MSE, R² score, variance, and range for each target
for target_name, mse_value, r2_value, variance_value, range_value in zip(y_test.columns, mse, r2_per_target, variance_targets, range_targets):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}, R² Score: {r2_value}")
    print(f"Target: {target_name}, Variance: {variance_value}, Range: {range_value}")
    print(f"Relative MSE (MSE/Variance): {mse_value/variance_value if variance_value != 0 else 'Undefined'}")
    print(f"Relative MSE (MSE/Range): {mse_value/range_value if range_value != 0 else 'Undefined'}\n")

# Print overall R² score
print(f"\nOverall R^2 Score: {r2_overall}")


Target: Temp Z1, Mean Squared Error: 16.129571111111105, R² Score: 0.9286358300232037
Target: Temp Z1, Variance: 226.01777777777772, Range: 55
Relative MSE (MSE/Variance): 0.0713641699767963
Relative MSE (MSE/Range): 0.2932649292929292

Target: Temp Z2, Mean Squared Error: 23.72357125555557, R² Score: 0.6450508940736196
Target: Temp Z2, Variance: 66.83654320987654, Range: 45
Relative MSE (MSE/Variance): 0.3549491059263804
Relative MSE (MSE/Range): 0.5271904723456793

Target: Temp Z3, Mean Squared Error: 18.91315580246913, R² Score: 0.6844275032134737
Target: Temp Z3, Variance: 59.93283950617285, Range: 39
Relative MSE (MSE/Variance): 0.3155724967865264
Relative MSE (MSE/Range): 0.48495271288382386

Target: Temp Z4, Mean Squared Error: 14.762472283333315, R² Score: 0.7965480666574791
Target: Temp Z4, Variance: 72.56, Range: 38
Relative MSE (MSE/Variance): 0.20345193334252087
Relative MSE (MSE/Range): 0.3884861127192978

Target: Temp Z5, Mean Squared Error: 15.177010833333341, R² Score: 

In [37]:
import joblib

# Save the model
joblib.dump(multi_output_model, 'multi_output_model.pkl')

['multi_output_model.pkl']