In [7]:
import pandas as pd
from library.functions_to_abstract_data import extract_qm9_data
from torch_geometric.datasets import QM9
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

Load the DataFrame that contains only the molecules with valid SMILES strings:

In [None]:
# Load the pickled DataFrame (per its file name: QM9 rows with valid SMILES only).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it was produced by a trusted pipeline.
df_valid_smiles = pd.read_pickle('../data/RDKit/rdkit_only_valid_smiles_qm9.pkl')

# Keep all columns except the last two (smiles and gap).
df_valid = df_valid_smiles.drop(columns=df_valid_smiles.columns[-2:])

Remove redundant features (absolute pairwise correlation > 0.90):

In [9]:
import numpy as np

# Absolute pairwise correlation between all feature columns
corr_matrix = df_valid.corr().abs()

# Keep only the strict upper triangle (k=1 leaves out the diagonal), so every
# pair of columns is inspected once and no column is compared with itself
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Flag each column whose correlation with any column before it exceeds 0.90;
# the masked-out (NaN) entries compare as False
is_redundant = (upper > 0.90).any(axis=0)
to_drop = upper.columns[is_redundant.to_numpy()].tolist()

# Remove the flagged columns from the feature table
df_reduced = df_valid.drop(columns=to_drop)

print(f"Dropped {len(to_drop)} features; remaining: {df_reduced.shape[1]}")


Dropped 40 features; remaining: 177


Apply PCA to the standardized features (PCA builds linear combinations of the features that are mutually orthogonal, i.e. linearly uncorrelated):

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Bring every feature to zero mean and unit variance so that no feature
# dominates the principal components purely because of its scale
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_reduced)

# A float n_components makes PCA keep the smallest number of components whose
# cumulative explained variance reaches that fraction (95% here)
pca = PCA(n_components=0.95)
pca_data = pca.fit_transform(scaled_data)

# Report the dimensionality before and after the projection
print(
    f"Original features: {df_reduced.shape[1]}",
    f"Reduced to {pca_data.shape[1]} principal components",
    sep="\n",
)


Original features: 177
Reduced to 78 principal components
