In [2]:
!pip install chembl_webresource_client
!pip install rdkit

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-26.1.0-py3-none-any.whl.metadata (8.5 kB)
Collecting url-normalize>=2.0 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.3.0-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-26.1.0-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem

df = pd.read_csv("drd2_cleaned_dataset.csv")

df.shape

(890, 5)

In [5]:
from rdkit import Chem

df["mol"] = df["canonical_smiles"].apply(Chem.MolFromSmiles)

df["mol"].isnull().sum()

np.int64(0)

In [6]:
from rdkit.Chem import Descriptors

descriptor_names = [desc[0] for desc in Descriptors.descList]
len(descriptor_names)

217

In [7]:
from rdkit.Chem import Descriptors

# Create dictionary of descriptor functions
desc_funcs = {name: func for name, func in Descriptors.descList}

# Compute descriptors
descriptor_data = []

for mol in df["mol"]:
    values = {}
    for name, func in desc_funcs.items():
        try:
            values[name] = func(mol)
        except:
            values[name] = np.nan
    descriptor_data.append(values)

desc_df = pd.DataFrame(descriptor_data)

desc_df.shape

(890, 217)

In [8]:
desc_df.isnull().sum().sort_values(ascending=False).head(10)

Unnamed: 0,0
MaxAbsEStateIndex,0
MaxEStateIndex,0
MinAbsEStateIndex,0
MinEStateIndex,0
qed,0
SPS,0
MolWt,0
HeavyAtomMolWt,0
ExactMolWt,0
NumValenceElectrons,0


In [9]:
# Variance of each descriptor
variances = desc_df.var()

# Count zero-variance features
zero_var = (variances == 0).sum()

zero_var

np.int64(24)

In [10]:
variances.sort_values().head(10)

Unnamed: 0,0
NumRadicalElectrons,0.0
EState_VSA11,0.0
SlogP_VSA9,0.0
SMR_VSA8,0.0
fr_guanido,0.0
fr_benzodiazepine,0.0
fr_barbitur,0.0
fr_diazo,0.0
fr_lactam,0.0
fr_isothiocyan,0.0


In [11]:
non_zero_desc = desc_df.loc[:, variances > 0]

non_zero_desc.shape

(890, 193)

In [12]:
corr_matrix = non_zero_desc.corr().abs()

# Count highly correlated feature pairs (> 0.9)
high_corr = (corr_matrix > 0.9).sum().sum() - len(corr_matrix)

high_corr

np.int64(392)

In [13]:
# Get upper triangle only
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Get top 10 correlated feature pairs
top_corr_pairs = (
    upper.stack()
    .sort_values(ascending=False)
    .head(10)
)

top_corr_pairs

Unnamed: 0,Unnamed: 1,0
fr_Ar_NH,fr_Nhpyrrole,1.0
fr_COO,fr_COO2,1.0
fr_Imine,fr_amidine,1.0
NumAromaticCarbocycles,fr_benzene,1.0
fr_nitro,fr_nitro_arom,1.0
MaxAbsEStateIndex,MaxEStateIndex,1.0
fr_phenol,fr_phenol_noOrthoHbond,1.0
fr_Al_COO,fr_COO2,1.0
NumAmideBonds,fr_amide,1.0
fr_Al_COO,fr_COO,1.0


In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
desc_scaled = scaler.fit_transform(non_zero_desc)

In [15]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(desc_scaled)

explained_variance = pca.explained_variance_ratio_

explained_variance[:10]

array([0.22324999, 0.07561253, 0.06616656, 0.05476751, 0.04578946,
       0.03414597, 0.029244  , 0.02473935, 0.02331923, 0.02233607])

In [16]:
import numpy as np

cumulative_variance = np.cumsum(explained_variance)

cumulative_variance[:20]

array([0.22324999, 0.29886252, 0.36502908, 0.41979659, 0.46558605,
       0.49973202, 0.52897602, 0.55371537, 0.5770346 , 0.59937067,
       0.61852929, 0.63686569, 0.65364567, 0.66986135, 0.68468518,
       0.69874871, 0.71166061, 0.72344821, 0.73497917, 0.74590471])

In [17]:
# Find components needed for 80% and 90%
components_80 = np.argmax(cumulative_variance >= 0.80) + 1
components_90 = np.argmax(cumulative_variance >= 0.90) + 1

components_80, components_90

(np.int64(27), np.int64(44))

In [20]:
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit import DataStructs
import numpy as np

generator = GetMorganGenerator(radius=2, fpSize=1024)

def morgan_fp_new(mol):
    fp = generator.GetFingerprint(mol)
    arr = np.zeros((1024,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

morgan_features = np.array([morgan_fp_new(mol) for mol in df["mol"]])

morgan_features.shape

(890, 1024)

In [21]:
morgan_features.mean()

np.float64(0.04996159585674157)

In [22]:
# Variance across columns
fp_variance = morgan_features.var(axis=0)

zero_var_fp = (fp_variance == 0).sum()

zero_var_fp

np.int64(34)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Remove zero variance bits first
morgan_nonzero = morgan_features[:, fp_variance > 0]

scaler_fp = StandardScaler()
fp_scaled = scaler_fp.fit_transform(morgan_nonzero)

pca_fp = PCA()
pca_fp.fit(fp_scaled)

fp_cumulative = np.cumsum(pca_fp.explained_variance_ratio_)

components_80_fp = np.argmax(fp_cumulative >= 0.80) + 1
components_90_fp = np.argmax(fp_cumulative >= 0.90) + 1

components_80_fp, components_90_fp

(np.int64(187), np.int64(272))

In [24]:
non_zero_desc.to_csv("rdkit_descriptors.csv", index=False)

pd.DataFrame(morgan_nonzero).to_csv("morgan_fingerprints.csv", index=False)

combined_features = np.hstack([non_zero_desc.values, morgan_nonzero])
pd.DataFrame(combined_features).to_csv("combined_features.csv", index=False)

In [25]:
print("Descriptors shape:", non_zero_desc.shape)
print("Morgan shape:", morgan_nonzero.shape)
print("Combined shape:", combined_features.shape)

Descriptors shape: (890, 193)
Morgan shape: (890, 990)
Combined shape: (890, 1183)
