#  Data preprocessing

The following code snippet demonstrates how to preprocess the data.

The data is first shuffled and split into a training set and a test set.
Subsequently the features are preprocessed: numerical features are scaled with `MinMaxScaler`, and categorical features are encoded with `OneHotEncoder`. This puts the data into a format that is well suited to deep learning models.


## Import modules


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import h5py
import pickle
import numpy as np

### Load raw data

In [None]:
# Relative path to the raw core-shell particle dataset (HDF5)
hdf5_file_name = "datasets/core_shell_particles_raw_122500.h5"

# Load the raw dataframe. Nothing is shuffled here; this cell only reads the file.
df = pd.read_hdf(hdf5_file_name)

### logarithmic scaling of scattering

In [4]:
# Add log1p-transformed versions of the forward and backward scattering targets.
# np.log1p is mapped over the column entries one row at a time.
for log_column, raw_column in (('log_Q_fwd', 'Q_fwd'), ('log_Q_back', 'Q_back')):
    df[log_column] = df[raw_column].map(np.log1p)

df


Unnamed: 0,mat_core,mat_shell,r_core,r_shell,wavelength,Q_sca,Q_back,Q_fwd,log_Q_fwd,log_Q_back
0,Si,Si,1,2,"[400.0, 406.3492063492063, 412.6984126984127, ...","[2.160839086497138e-06, 2.0034967444389096e-06...","[3.2335562550935324e-06, 2.998696933738638e-06...","[3.2489628896450263e-06, 3.0117948478578205e-0...","[3.2489576117765287e-06, 3.011790312412824e-06...","[3.233551027161775e-06, 2.998692437655976e-06,..."
1,Si,SiO2,1,2,"[400.0, 406.3492063492063, 412.6984126984127, ...","[3.9143528694988674e-07, 3.6438451909391744e-0...","[5.868605004574553e-07, 5.463176494864905e-07,...","[5.874454060091694e-07, 5.468359469672794e-07,...","[5.874452334631845e-07, 5.468357974525575e-07,...","[5.868603282548991e-07, 5.463175002550578e-07,..."
2,Si,Au,1,2,"[400.0, 406.3492063492063, 412.6984126984127, ...","[2.6613881597578463e-06, 2.482639350607367e-06...","[3.991253362741106e-06, 3.7232254787413343e-06...","[3.992911163065486e-06, 3.724692610081798e-06,...","[3.992903191416928e-06, 3.724685673431503e-06,...","[3.9912453977105965e-06, 3.7232185475545557e-0..."
3,Si,Ag,1,2,"[400.0, 406.3492063492063, 412.6984126984127, ...","[6.066105540702947e-06, 5.130173539793614e-06,...","[9.09914972315353e-06, 7.695458555236092e-06, ...","[9.099166460968062e-06, 7.69506165808914e-06, ...","[9.09912506380404e-06, 7.695032051254063e-06, ...","[9.099108326141806e-06, 7.695428945346812e-06,..."
4,Si,Si3N4,1,2,"[400.0, 406.3492063492063, 412.6984126984127, ...","[9.57380852593803e-07, 8.912867496023801e-07, ...","[1.4351768921144704e-06, 1.3361307534275844e-0...","[1.4369658329245613e-06, 1.3377296403994528e-0...","[1.4369648004901479e-06, 1.3377287456399553e-0...","[1.4351758622491e-06, 1.3361298608056843e-06, ..."
...,...,...,...,...,...,...,...,...,...,...
122495,TiO2,Au,100,200,"[400.0, 406.3492063492063, 412.6984126984127, ...","[1.8061374652375413, 1.8107466674172565, 1.815...","[0.6247404502107098, 0.560330332065236, 0.4889...","[22.695964171074394, 22.10867965958843, 21.559...","[3.165304745500093, 3.140208289793411, 3.11613...","[0.48534808007737695, 0.44489755016916765, 0.3..."
122496,TiO2,Ag,100,200,"[400.0, 406.3492063492063, 412.6984126984127, ...","[4.156439654844897, 3.944409659769229, 3.80591...","[6.217249605487175, 5.158603360429961, 3.86682...","[47.24786679498787, 40.85434768441037, 36.4560...","[3.8763516153047957, 3.734195678861204, 3.6231...","[1.9764739392599957, 1.817850024636002, 1.5824..."
122497,TiO2,Si3N4,100,200,"[400.0, 406.3492063492063, 412.6984126984127, ...","[4.433211483300108, 3.368066819129745, 3.09282...","[12.202617783801292, 3.2882408604145907, 2.030...","[49.61738186401228, 29.427003220882753, 23.637...","[3.9242950324046735, 3.415330477984136, 3.2042...","[2.580415126884761, 1.4558765929966218, 1.1088..."
122498,TiO2,ZrO2,100,200,"[400.0, 406.3492063492063, 412.6984126984127, ...","[2.9450800250969262, 3.1586795991931713, 3.928...","[7.752459282587585, 8.713744968363265, 13.2640...","[21.418641658323462, 24.253300370411857, 36.79...","[3.109892829649966, 3.2289568549189402, 3.6322...","[2.1693347217464694, 2.27354188955001, 2.65773..."


## Shuffle and split

In [5]:
# Split data into training (120,000 samples) and test (2,500 samples) sets.
# The fixed random_state keeps the split reproducible across runs.
df_train, df_test = train_test_split(df, test_size=2500, random_state=42)
# Display the resulting sizes as (test, train)
len(df_test), len(df_train)

(2500, 120000)

### X-data: Feature Preprocessing

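Create the preprocessors for the features, a one-hot encoder for the categorical features (materials) and a min-max scaler for the numerical features (radii), fit them on the training data, and apply them to both the training and the test data.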

In [6]:
# Column groups: materials are categorical, radii are numerical
categorical_features = ['mat_core', 'mat_shell']
numerical_features = ['r_core', 'r_shell']
feature_columns = categorical_features + numerical_features

# Select the feature columns from the train and test dataframes
X_train_raw = df_train[feature_columns]
X_test_raw = df_test[feature_columns]

# Fixed material vocabulary, used for both core and shell, so the one-hot
# column layout is set explicitly rather than inferred from the data
materials = ['Si', 'SiO2', 'Au', 'Ag', 'Si3N4', 'ZrO2', 'TiO2']

# Radii are mapped onto [-1, 1]; materials become dense one-hot vectors
radius_scaler = MinMaxScaler(feature_range=(-1, 1))
material_encoder = OneHotEncoder(
    categories=[materials, materials],
    sparse_output=False,
    handle_unknown='ignore',
)

# Combine both transformers into a single feature preprocessor
preprocessor_feat = ColumnTransformer(
    transformers=[
        ('num', radius_scaler, numerical_features),
        ('cat', material_encoder, categorical_features),
    ]
)

# Fit on the training data only, then transform both splits with the fitted preprocessor
X_train = preprocessor_feat.fit(X_train_raw).transform(X_train_raw)
X_test = preprocessor_feat.transform(X_test_raw)


### Y-data: Targets Preprocessing

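Fit one min-max scaler per target (forward and backward scattering) on the training data, scale both the training and the test targets with it, and stack the two scaled targets along a new last axis.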

In [7]:
target_variables = ["log_Q_fwd", "log_Q_back"]

# One scaler per target, each configured to map onto the range [-1, 1]
scaler_Qfwd = MinMaxScaler(feature_range=(-1, 1))
scaler_Qback = MinMaxScaler(feature_range=(-1, 1))


def _stack_spectra(frame, column):
    """Stack the per-row arrays stored in `column` into one array, one row per sample."""
    return np.stack(frame[column].values, axis=0)


def _scale_and_pair(fwd_raw, back_raw):
    """Scale both targets with their own scaler and stack them along a new last axis."""
    scaled_fwd = scaler_Qfwd.transform(fwd_raw)
    scaled_back = scaler_Qback.transform(back_raw)
    return np.stack([scaled_fwd, scaled_back], axis=-1)


# Unscaled forward / backward targets for the train and test splits
y_Qfwd_train_raw = _stack_spectra(df_train, "log_Q_fwd")
y_Qback_train_raw = _stack_spectra(df_train, "log_Q_back")

y_Qfwd_test_raw = _stack_spectra(df_test, "log_Q_fwd")
y_Qback_test_raw = _stack_spectra(df_test, "log_Q_back")

# Fit the scalers on the training data only
scaler_Qfwd.fit(y_Qfwd_train_raw)
scaler_Qback.fit(y_Qback_train_raw)

# Scaled targets; the last axis holds (forward, backward)
y_train = _scale_and_pair(y_Qfwd_train_raw, y_Qback_train_raw)
y_test = _scale_and_pair(y_Qfwd_test_raw, y_Qback_test_raw)


In [8]:
# Sanity check: overall value range of the scaled targets (both channels together).
# The scalers were fit on the training data only, so the test range may differ slightly.
min_Q, max_Q = y_train.min(), y_train.max()
print(f"Min/Max train: {min_Q}, {max_Q}")

min_Q, max_Q = y_test.min(), y_test.max()
print(f"Min/Max test: {min_Q}, {max_Q}")


Min/Max train: -1.0, 1.0000000000000004
Min/Max test: -0.9999992347306157, 1.0016937744973184


### Save Preprocessed Training and Testing Data

In [None]:

# Save the preprocessed feature and target arrays to an HDF5 file.
# All datasets are stored as 32-bit floats ('f4').
dataset_preprocessed_file = 'datasets/core_shell_particles_preprocessed_122500.h5'

# The context manager closes (and thereby flushes) the file even if one of
# the create_dataset calls raises, so no handle is leaked.
with h5py.File(dataset_preprocessed_file, "w") as f:
    f.create_dataset("X_train", data=X_train, dtype='f4')
    f.create_dataset("y_train", data=y_train, dtype='f4')
    f.create_dataset("X_test", data=X_test, dtype='f4')
    f.create_dataset("y_test", data=y_test, dtype='f4')


### Saving the Preprocessors for later reuse 

In [10]:
# Output locations for the fitted preprocessing objects
preprocessor_path = 'datasets/scaler_particle_geometries.pkl'
scaler_Qfwd_path = 'datasets/scaler_Qfwd.pkl'
scaler_Qback_path = 'datasets/scaler_Qback.pkl'

# Pair every fitted object with its destination file and pickle them in turn
pickle_jobs = (
    (preprocessor_path, preprocessor_feat),
    (scaler_Qfwd_path, scaler_Qfwd),
    (scaler_Qback_path, scaler_Qback),
)

for pickle_path, fitted_object in pickle_jobs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)