In [1]:
import pandas as pd

# Load the main dataset
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')

X.set_index('Primary ID', inplace=True)
y.set_index('Primary ID', inplace=True)

# Load the column names from the Excel files (assuming they are in the first row)
padel_cols = pd.read_excel('Padel_cols.xlsx', header=None).iloc[0].dropna().tolist()
spartan_cols = pd.read_excel('Spartan_cols.xlsx', header=None).iloc[0].dropna().tolist()
swissadme_cols = pd.read_excel('Swissadme_cols.xlsx', header=None).iloc[0].dropna().tolist()


Check non-numeric data

In [2]:
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
non_numeric_columns

Index(['Unnamed: 2'], dtype='object')

Check rows with nans

In [3]:
def check_nans(data):
    rows_with_nans = data.isnull().any(axis=1)
    num_rows_with_nans = rows_with_nans.sum()
    total_rows = len(data)
    fraction_rows_with_nans = num_rows_with_nans / total_rows

    print(f"Number of rows with NaNs: {num_rows_with_nans}")
    print(f"Fraction of rows with NaNs: {fraction_rows_with_nans:.2f}")

    # Identify columns with NaNs
    columns_with_nans = data.columns[data.isnull().any()].tolist()
    num_columns_with_nans = len(columns_with_nans)

    print(f"Columns with NaNs: {columns_with_nans}")

    # Print detailed information about NaNs in each column
    nan_info = data.isnull().sum()
    print("\nDetailed NaN information:")
    print(nan_info[nan_info > 0])
    print("Number of columns with nan values:")
    print(f"{num_columns_with_nans} columns out of the total {len(data.columns)} columns")

In [5]:
from sklearn.decomposition import PCA
import numpy as np
import joblib

def perform_pca(X_subset, subset_name, explained_variance_threshold=0.95):
    # Print data nans before standardization
    print("NANs before standardization")
    check_nans(X_subset)

    # Drop zero variance columns
    zero_variance_columns = X_subset.loc[:, X_subset.std() == 0].columns
    X_subset = X_subset.drop(columns=zero_variance_columns)
    # save the columns to drop
    joblib.dump(zero_variance_columns, subset_name + '_cols_to_drop.pkl')
    
    # Standardize the data if necessary
    mean = X_subset.mean()
    std = X_subset.std()
    X_standardized = (X_subset - mean) / std
    # save the values for standardization
    joblib.dump(mean, subset_name + '_mean.pkl')
    joblib.dump(std, subset_name + '_std.pkl')

    # Print data nans before standardization
    print("NANs after standardization")
    check_nans(X_standardized)

    # Initialize PCA
    pca = PCA()

    # Fit PCA
    pca.fit(X_standardized)

    # Calculate cumulative explained variance
    cum_var_explained = np.cumsum(pca.explained_variance_ratio_)
    
    # Determine the number of components needed to reach the explained variance threshold
    num_components = np.argmax(cum_var_explained >= explained_variance_threshold) + 1
    
    # Apply PCA with the selected number of components
    pca = PCA(n_components=num_components)
    X_pca = pca.fit_transform(X_standardized)

    # Save the scaler
    joblib.dump(pca, subset_name + '_pca_fit.pkl')
    
    return X_pca, num_components, pca.explained_variance_ratio_

# Perform PCA on each set of columns
X_padel_pca, padel_n_components, padel_variance_ratio = perform_pca(X[padel_cols], subset_name='padel')
X_spartan_pca, spartan_n_components, spartan_variance_ratio = perform_pca(X[spartan_cols], subset_name='spartan')
X_swissadme_pca, swissadme_n_components, swissadme_variance_ratio = perform_pca(X[swissadme_cols], subset_name='swissadme')

# Display the number of components retained
print(f"Padel: {padel_n_components} components retained")
print(f"Spartan: {spartan_n_components} components retained")
print(f"SwissADME: {swissadme_n_components} components retained")


NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1444 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 1073 columns
NANs before standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs after standardization
Number of rows with NaNs: 0
Fraction of rows with NaNs: 0.00
Columns with NaNs: []

Detailed NaN information:
Series([], dtype: int64)
Number of columns with nan values:
0 columns out of the total 23 columns
NANs before standardization
Number of rows with NaNs: 

Remove NaN rows

After performing PCA, merge the datasets back together

In [6]:
# Convert PCA results to DataFrames with 'Primary ID' as the index
X_padel_pca_df = pd.DataFrame(X_padel_pca, index=X.index)
X_spartan_pca_df = pd.DataFrame(X_spartan_pca, index=X.index)
X_swissadme_pca_df = pd.DataFrame(X_swissadme_pca, index=X.index)

# Ensure that the other features DataFrame is also indexed by 'Primary ID'
other_features_cols = ["API dose", "API %", "Plast %", "%5min", "%10min", "%15min", "%30min", "%60min", "%120min", "%180min", "%240min", "Scor imprimabilitate", 'ST-Hardness (g)',	'ST-Rigidity at 2% deformation (g)', 'ST-Rigidity at 4% deformation (g)', 'ST-Peak stress (N/mp)', '3PBT-Hardness (g)',	
             '3PBT-Deformation at hardness (mm)', '3PBT-Total work (mJ)', '3PBT-Maximum force (N)', '3PBT-Peak stress (N/mp)', 
             '3PBT-Flexural stress (g/mmp) (Samaro 2021 Prasad 2019)',	'3PBT-Flexural strain (%)', '3PBT-Breaking distance (mm)', '3PBT-Stiffness (N/mm) (Hu 2022)']
X_other_features = X[other_features_cols].set_index(X.index)
print(f"Other features: {X_other_features}")

# Normalize the rest of the columns
mean = X_other_features.mean()
std = X_other_features.std()
X_standardized = (X_other_features - mean) / std
# save the values for standardization
joblib.dump(mean, 'non_pca_features_mean.pkl')
joblib.dump(std, 'non_pca_features_std.pkl')

# Merge the PCA-transformed data back together with the rest of the features
X_final = pd.concat([X_padel_pca_df, X_spartan_pca_df, X_swissadme_pca_df, X_standardized], axis=1)

# Make sure all the column names are strings
X_final.columns = X_final.columns.astype(str)

# Verify the final DataFrame shape and columns
print(f"Final DataFrame shape: {X_final.shape}")
print(X_final)

Other features:             API dose  API %  Plast %      %5min     %10min     %15min  \
Primary ID                                                              
BCS1_S1        8.600      5      0.0  10.032837  13.726531  17.440437   
BCS1_S2        8.600      5      0.0  10.032837  13.726531  17.440437   
BCS1_S3        8.600      5      0.0  10.032837  13.726531  17.440437   
BCS1_S4        8.600      5      0.0  10.032837  13.726531  17.440437   
BCS1_S6       22.725     15      0.0   8.265069  11.232801  14.216765   
...              ...    ...      ...        ...        ...        ...   
BCS4_S51      23.475     15      7.5   9.034772  13.207822  17.418949   
BCS4_S52      23.475     15      7.5   9.034772  13.207822  17.418949   
BCS4_S53      23.475     15      7.5   9.034772  13.207822  17.418949   
BCS4_S54      23.475     15      7.5   9.034772  13.207822  17.418949   
BCS4_S55      23.475     15      7.5   9.034772  13.207822  17.418949   

               %30min     %60min  

Check for duplicates in the column names between the 3 data sources (Padel, Spartan, SwissADME)

In [7]:
X_final.index

Index(['BCS1_S1', 'BCS1_S2', 'BCS1_S3', 'BCS1_S4', 'BCS1_S6', 'BCS1_S7',
       'BCS1_S8', 'BCS1_S9', 'BCS1_S10', 'BCS1_S16',
       ...
       'BCS4_S46', 'BCS4_S47', 'BCS4_S48', 'BCS4_S49', 'BCS4_S50', 'BCS4_S51',
       'BCS4_S52', 'BCS4_S53', 'BCS4_S54', 'BCS4_S55'],
      dtype='object', name='Primary ID', length=459)

In [8]:
X_final.columns

Index(['0', '1', '2', '3', '4', '5', '6', '0', '1', '2', '3', '4', '5', '0',
       '1', '2', '3', '4', '5', 'API dose', 'API %', 'Plast %', '%5min',
       '%10min', '%15min', '%30min', '%60min', '%120min', '%180min', '%240min',
       'Scor imprimabilitate', 'ST-Hardness (g)',
       'ST-Rigidity at 2% deformation (g)',
       'ST-Rigidity at 4% deformation (g)', 'ST-Peak stress (N/mp)',
       '3PBT-Hardness (g)', '3PBT-Deformation at hardness (mm)',
       '3PBT-Total work (mJ)', '3PBT-Maximum force (N)',
       '3PBT-Peak stress (N/mp)',
       '3PBT-Flexural stress (g/mmp) (Samaro 2021 Prasad 2019)',
       '3PBT-Flexural strain (%)', '3PBT-Breaking distance (mm)',
       '3PBT-Stiffness (N/mm) (Hu 2022)'],
      dtype='object')

In [9]:
y.index

Index(['BCS1_S1', 'BCS1_S2', 'BCS1_S3', 'BCS1_S4', 'BCS1_S6', 'BCS1_S7',
       'BCS1_S8', 'BCS1_S9', 'BCS1_S10', 'BCS1_S16',
       ...
       'BCS4_S46', 'BCS4_S47', 'BCS4_S48', 'BCS4_S49', 'BCS4_S50', 'BCS4_S51',
       'BCS4_S52', 'BCS4_S53', 'BCS4_S54', 'BCS4_S55'],
      dtype='object', name='Primary ID', length=459)

In [10]:
y.columns

Index(['Temperatura  duză de imprimare (grade)',
       'Temperatura pat imprimare (grade)', 'Viteză de imprimare (x = ori)'],
      dtype='object')

In [11]:
X_final.to_csv("X_PCA.csv")
y.to_csv("y_PCA.csv")

In [12]:
X_final

Unnamed: 0_level_0,0,1,2,3,4,5,6,0,1,2,...,ST-Peak stress (N/mp),3PBT-Hardness (g),3PBT-Deformation at hardness (mm),3PBT-Total work (mJ),3PBT-Maximum force (N),3PBT-Peak stress (N/mp),3PBT-Flexural stress (g/mmp) (Samaro 2021 Prasad 2019),3PBT-Flexural strain (%),3PBT-Breaking distance (mm),3PBT-Stiffness (N/mm) (Hu 2022)
Primary ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BCS1_S1,21.571509,6.170082,-4.661010,-0.469868,5.097721,-10.968115,1.279308,3.536737,0.569903,0.494731,...,-0.047765,1.189939,2.545668,-0.075992,1.189939,1.243201,1.115514,0.294845,2.545668,-1.115197
BCS1_S2,21.571509,6.170082,-4.661010,-0.469868,5.097721,-10.968115,1.279308,3.536737,0.569903,0.494731,...,0.088265,0.105614,2.611650,-0.315973,0.105614,0.411376,0.631490,0.257331,2.611650,-1.450849
BCS1_S3,21.571509,6.170082,-4.661010,-0.469868,5.097721,-10.968115,1.279308,3.536737,0.569903,0.494731,...,-0.331958,0.174441,3.051533,-0.221201,0.174441,0.315732,0.481431,0.366198,3.051533,-1.503677
BCS1_S4,21.571509,6.170082,-4.661010,-0.469868,5.097721,-10.968115,1.279308,3.536737,0.569903,0.494731,...,-0.013517,1.397627,2.560331,0.005358,1.397627,1.338394,1.138138,0.311490,2.560331,-1.057212
BCS1_S6,21.571509,6.170082,-4.661010,-0.469868,5.097721,-10.968115,1.279308,3.536737,0.569903,0.494731,...,-0.218973,-0.697365,2.736284,-0.428642,-0.697365,-0.131628,0.419138,0.204932,2.736284,-1.701862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BCS4_S51,-24.497635,11.864313,10.260432,-10.789995,-8.537452,-2.392841,12.572023,-3.120655,1.981968,1.489739,...,0.030103,-0.284404,-0.738793,-0.451420,-0.284404,-0.482955,-0.234112,-0.363625,-0.738793,0.716628
BCS4_S52,-24.497635,11.864313,10.260432,-10.789995,-8.537452,-2.392841,12.572023,-3.120655,1.981968,1.489739,...,0.067337,0.047655,-0.738793,-0.403424,0.047655,-0.152040,0.004360,-0.362596,-0.738793,1.154763
BCS4_S53,-24.497635,11.864313,10.260432,-10.789995,-8.537452,-2.392841,12.572023,-3.120655,1.981968,1.489739,...,-0.208909,-1.192436,-0.570171,-0.529516,-1.192436,-0.782366,-0.114538,-0.370246,-0.570171,-0.741074
BCS4_S54,-24.497635,11.864313,10.260432,-10.789995,-8.537452,-2.392841,12.572023,-3.120655,1.981968,1.489739,...,-0.035717,-1.063235,-0.577502,-0.505111,-1.063235,-0.688989,-0.079227,-0.366673,-0.577502,-0.585448


# Train Random Forest Multiregressor

In [13]:
# test-train split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize the base model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a multi-output regressor
multi_output_model = MultiOutputRegressor(base_model)

# Train the model
multi_output_model.fit(X_train, y_train)

Evaluate the model

In [35]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='variance_weighted')

# Print MSE and target variable names
for target_name, mse_value in zip(y_test.columns, mse):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}")

print(f"\nOverall R^2 Score: {r2}")


Target: Temp Z1, Mean Squared Error: 16.129571111111105
Target: Temp Z2, Mean Squared Error: 23.72357125555557
Target: Temp Z3, Mean Squared Error: 18.91315580246913
Target: Temp Z4, Mean Squared Error: 14.762472283333315
Target: Temp Z5, Mean Squared Error: 15.177010833333341
Target: Screw speed, Mean Squared Error: 391.6386748567901

Overall R^2 Score: 0.7867220666614736


In [15]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = multi_output_model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')  # MSE for each target
r2_per_target = r2_score(y_test, y_pred, multioutput='raw_values')  # R² for each target
r2_overall = r2_score(y_test, y_pred, multioutput='variance_weighted')  # Overall weighted R²

# Calculate variance or range of each target for relative performance
variance_targets = np.var(y_test, axis=0)  # Variance of each target in test set
range_targets = np.ptp(y_test, axis=0)  # Range (max-min) of each target in test set

# Print MSE, R² score, variance, and range for each target
for target_name, mse_value, r2_value, variance_value, range_value in zip(y_test.columns, mse, r2_per_target, variance_targets, range_targets):
    print(f"Target: {target_name}, Mean Squared Error: {mse_value}, R² Score: {r2_value}")
    print(f"Target: {target_name}, Variance: {variance_value}, Range: {range_value}")
    print(f"Relative MSE (MSE/Variance): {mse_value/variance_value if variance_value != 0 else 'Undefined'}")
    print(f"Relative MSE (MSE/Range): {mse_value/range_value if range_value != 0 else 'Undefined'}\n")

# Print overall R² score
print(f"\nOverall R^2 Score: {r2_overall}")


Target: Temperatura  duză de imprimare (grade), Mean Squared Error: 7.741304347826087, R² Score: 0.9488455938323412
Target: Temperatura  duză de imprimare (grade), Variance: 151.33211247637053, Range: 45.0
Relative MSE (MSE/Variance): 0.05115440616765882
Relative MSE (MSE/Range): 0.17202898550724638

Target: Temperatura pat imprimare (grade), Mean Squared Error: 0.0, R² Score: 1.0
Target: Temperatura pat imprimare (grade), Variance: 194.17237712665408, Range: 40.0
Relative MSE (MSE/Variance): 0.0
Relative MSE (MSE/Range): 0.0

Target: Viteză de imprimare (x = ori), Mean Squared Error: 0.0027205108695652273, R² Score: 0.6574980812137425
Target: Viteză de imprimare (x = ori), Variance: 0.00794305293005671, Range: 0.5
Relative MSE (MSE/Variance): 0.34250191878625746
Relative MSE (MSE/Range): 0.005441021739130455


Overall R^2 Score: 0.97758684166828


In [16]:
import joblib

# Save the model
joblib.dump(multi_output_model, 'multi_output_model.pkl')

['multi_output_model.pkl']