In [1]:
import pandas as pd

# Load the main dataset
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

# Load the column names from the Excel files (assuming they are in the first row)
padel_cols = pd.read_excel('Padel_cols.xlsx', header=None).iloc[0].dropna().tolist()
spartan_cols = pd.read_excel('Spartan_cols.xlsx', header=None).iloc[0].dropna().tolist()
swissadme_cols = pd.read_excel('Swissadme_cols.xlsx', header=None).iloc[0].dropna().tolist()


TypeError: Cannot convert numpy.ndarray to numpy.ndarray

Check non-numeric data

In [None]:
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
non_numeric_columns

Index(['Primary ID'], dtype='object')

Check rows with nans

In [None]:
def check_nans(data):
    rows_with_nans = data.isnull().any(axis=1)
    num_rows_with_nans = rows_with_nans.sum()
    total_rows = len(data)
    fraction_rows_with_nans = num_rows_with_nans / total_rows

    print(f"Number of rows with NaNs: {num_rows_with_nans}")
    print(f"Fraction of rows with NaNs: {fraction_rows_with_nans:.2f}")

    # Identify columns with NaNs
    columns_with_nans = data.columns[data.isnull().any()].tolist()
    num_columns_with_nans = len(columns_with_nans)

    print(f"Columns with NaNs: {columns_with_nans}")

    # Print detailed information about NaNs in each column
    nan_info = data.isnull().sum()
    print("\nDetailed NaN information:")
    print(nan_info[nan_info > 0])
    print("Number of columns with nan values:")
    print(f"{num_columns_with_nans} columns out of the total {len(data.columns)} columns")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

def perform_pca(X_subset, explained_variance_threshold=0.95):
    # Print data nans before standardization
    print("NANs before standardization")
    check_nans(X_subset)

    # Drop zero variance columns
    zero_variance_columns = X_subset.loc[:, X_subset.std() == 0].columns
    X_subset = X_subset.drop(columns=zero_variance_columns)
    
    # Standardize the data if necessary
    X_standardized = (X_subset - X_subset.mean()) / X_subset.std()

    # Print data nans before standardization
    print("NANs after standardization")
    check_nans(X_standardized)

    # Initialize PCA
    pca = PCA()

    # Fit PCA
    pca.fit(X_standardized)

    # Calculate cumulative explained variance
    cum_var_explained = np.cumsum(pca.explained_variance_ratio_)
    
    # Determine the number of components needed to reach the explained variance threshold
    num_components = np.argmax(cum_var_explained >= explained_variance_threshold) + 1
    
    # Apply PCA with the selected number of components
    pca = PCA(n_components=num_components)
    X_pca = pca.fit_transform(X_standardized)
    
    return X_pca, num_components, pca.explained_variance_ratio_

# Perform PCA on each set of columns
X_padel_pca, padel_n_components, padel_variance_ratio = perform_pca(X[padel_cols])
X_spartan_pca, spartan_n_components, spartan_variance_ratio = perform_pca(X[spartan_cols])
X_swissadme_pca, swissadme_n_components, swissadme_variance_ratio = perform_pca(X[swissadme_cols])

# Display the number of components retained
print(f"Padel: {padel_n_components} components retained")
print(f"Spartan: {spartan_n_components} components retained")
print(f"SwissADME: {swissadme_n_components} components retained")


NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1444 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1079 columns
NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs before standardization
Number of rows with NaNs: 

After performing PCA, merge the datasets back together

In [None]:
# Convert PCA results to DataFrames with 'Primary ID' as the index
X_padel_pca_df = pd.DataFrame(X_padel_pca, index=X['Primary ID'])
X_spartan_pca_df = pd.DataFrame(X_spartan_pca, index=X['Primary ID'])
X_swissadme_pca_df = pd.DataFrame(X_swissadme_pca, index=X['Primary ID'])

# Ensure that the other features DataFrame is also indexed by 'Primary ID'
other_features_cols = X.columns.difference(padel_cols + spartan_cols + swissadme_cols)
X_other_features = X[other_features_cols].set_index('Primary ID')

# Ensure that the target DataFrame is also indexed by 'Primary ID'
y.set_index('Primary ID', inplace=True)

# Merge the PCA-transformed data back together with the rest of the features
X_final = pd.concat([X_padel_pca_df, X_spartan_pca_df, X_swissadme_pca_df, X_other_features], axis=1)

# Make sure all the column names are strings
X_final.columns = X_final.columns.astype(str)

# Verify the final DataFrame shape and columns
print(f"Final DataFrame shape: {X_final.shape}")
print(X_final)

Final DataFrame shape: (1607, 28)
                    0          1         2          3         4          5  \
Primary ID                                                                   
BCS1_S1     15.040758  11.898574 -0.559357  -4.244448 -6.108275   5.193660   
BCS1_S1     15.040758  11.898574 -0.559357  -4.244448 -6.108275   5.193660   
BCS1_S2     15.040758  11.898574 -0.559357  -4.244448 -6.108275   5.193660   
BCS1_S2     15.040758  11.898574 -0.559357  -4.244448 -6.108275   5.193660   
BCS1_S3     15.040758  11.898574 -0.559357  -4.244448 -6.108275   5.193660   
...               ...        ...       ...        ...       ...        ...   
PS-S18      -6.372475   5.878233 -1.844172  11.395460  3.400629 -14.099519   
PS-S19      -6.372475   5.878233 -1.844172  11.395460  3.400629 -14.099519   
PS-S19      -6.372475   5.878233 -1.844172  11.395460  3.400629 -14.099519   
PS-S20      -6.372475   5.878233 -1.844172  11.395460  3.400629 -14.099519   
PS-S20      -6.372475   5.8782

Check for duplicates in the column names between the 3 data sources (Padel, Spartan, SwissADME)

In [None]:
X_final.index

Index(['BCS1_S1', 'BCS1_S1', 'BCS1_S2', 'BCS1_S2', 'BCS1_S3', 'BCS1_S3',
       'BCS1_S4', 'BCS1_S4', 'BCS1_S6', 'BCS1_S6',
       ...
       'PS-S16', 'PS-S16', 'PS-S17', 'PS-S17', 'PS-S18', 'PS-S18', 'PS-S19',
       'PS-S19', 'PS-S20', 'PS-S20'],
      dtype='object', name='Primary ID', length=1607)

In [None]:
X_final.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '0', '1', '2', '3', '4',
       '5', '6', '0', '1', '2', '3', '4', '5', '6', '3PBT-Diam (mm)',
       '3PBT-Radius (mm)', 'API %', 'Plast %', 'ST-Diam (mm)'],
      dtype='object')

In [None]:
y.index

Index(['BCS1_S1', 'BCS1_S1', 'BCS1_S2', 'BCS1_S2', 'BCS1_S3', 'BCS1_S3',
       'BCS1_S4', 'BCS1_S4', 'BCS1_S6', 'BCS1_S6',
       ...
       'PS-S16', 'PS-S16', 'PS-S17', 'PS-S17', 'PS-S18', 'PS-S18', 'PS-S19',
       'PS-S19', 'PS-S20', 'PS-S20'],
      dtype='object', name='Primary ID', length=1607)

In [None]:
X_final.to_csv("X_PCA.csv")
y.to_csv("y_PCA.csv")

# Train Random Forest Multiregressor

In [9]:
# test-train split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the base model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a multi-output regressor
multi_output_model = MultiOutputRegressor(base_model)

# Train the model
multi_output_model.fit(X_train, y_train)

Evaluate the model

In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='variance_weighted')

# Print MSE and target variable names
for target_name, mse_value in zip(y_test.columns, mse):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}")

print(f"\nOverall R^2 Score: {r2}")


Target: ST-Hardness (g), Mean Squared Error: 414001.5087016641
Target: ST-Rigidity at 2% deformation (g), Mean Squared Error: 50935.79464859308
Target: ST-Rigidity at 4% deformation (g), Mean Squared Error: 347443.38856135693
Target: ST-Peak stress (N/mp), Mean Squared Error: 34733449227465.918
Target: 3PBT-Hardness (g), Mean Squared Error: 6292.1189766186435
Target: 3PBT-Deformation at hardness (mm), Mean Squared Error: 0.04723693067147367
Target: 3PBT-Total work (mJ), Mean Squared Error: 16.91536016265038
Target: 3PBT-Maximum force (N), Mean Squared Error: 0.5948792704883535
Target: 3PBT-Peak stress (N/mp), Mean Squared Error: 69440451975.155
Target: 3PBT-Flexural stress (g/mmp) (Samaro 2021 Prasad 2019), Mean Squared Error: 35.114754986343314
Target: 3PBT-Flexural strain (%), Mean Squared Error: 6.018572234882204
Target: 3PBT-Breaking distance (mm), Mean Squared Error: 0.04723693067147367
Target: 3PBT-Stiffness (N/mm) (Hu 2022), Mean Squared Error: 0.20529226655031205

Overall R^2 S

In [11]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')  # MSE for each target
r2_per_target = r2_score(y_test, y_pred, multioutput='raw_values')  # R² for each target
r2_overall = r2_score(y_test, y_pred, multioutput='variance_weighted')  # Overall weighted R²

# Calculate variance or range of each target for relative performance
variance_targets = np.var(y_test, axis=0)  # Variance of each target in test set
range_targets = np.ptp(y_test, axis=0)  # Range (max-min) of each target in test set

# Print MSE, R² score, variance, and range for each target
for target_name, mse_value, r2_value, variance_value, range_value in zip(y_test.columns, mse, r2_per_target, variance_targets, range_targets):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}, R² Score: {r2_value}")
    print(f"Target: {target_name}, Variance: {variance_value}, Range: {range_value}")
    print(f"Relative MSE (MSE/Variance): {mse_value/variance_value if variance_value != 0 else 'Undefined'}")
    print(f"Relative MSE (MSE/Range): {mse_value/range_value if range_value != 0 else 'Undefined'}\n")

# Print overall R² score
print(f"\nOverall R^2 Score: {r2_overall}")


Target: ST-Hardness (g), Mean Squared Error: 414001.5087016641, R² Score: 0.9507216119811631
Target: ST-Hardness (g), Variance: 8401279.452229852, Range: 12780.0
Relative MSE (MSE/Variance): 0.04927838801883689
Relative MSE (MSE/Range): 32.3944842489565

Target: ST-Rigidity at 2% deformation (g), Mean Squared Error: 50935.79464859308, R² Score: 0.9344010867435183
Target: ST-Rigidity at 2% deformation (g), Variance: 776473.1474962385, Range: 3905.0
Relative MSE (MSE/Variance): 0.06559891325648172
Relative MSE (MSE/Range): 13.043737426016154

Target: ST-Rigidity at 4% deformation (g), Mean Squared Error: 347443.38856135693, R² Score: 0.9310336161619226
Target: ST-Rigidity at 4% deformation (g), Variance: 5037865.8300702125, Range: 9430.0
Relative MSE (MSE/Variance): 0.06896638383807745
Relative MSE (MSE/Range): 36.84447386652778

Target: ST-Peak stress (N/mp), Mean Squared Error: 34733449227465.918, R² Score: 0.9963792485250529
Target: ST-Peak stress (N/mp), Variance: 9592884092651836.0,