In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Read the raw ESOL table and set the SMILES column aside first: the
# descriptor subset selected below no longer contains it.
df_esol = pd.read_csv('../data/raw/esol.csv')
df_smiles = df_esol["smiles"]

# Descriptor columns to keep, followed by the measured target column.
descriptor_columns = [
    'Minimum Degree',
    'Molecular Weight',
    'Number of H-Bond Donors',
    'Number of Rings',
    'Number of Rotatable Bonds',
    'Polar Surface Area',
    'measured log solubility in mols per litre',
]

# Restrict to those columns and give the target the short name "y".
df_esol = df_esol.loc[:, descriptor_columns].rename(
    columns={'measured log solubility in mols per litre': "y"}
)

print(df_esol.head(5))


In [None]:
def generate_features(smiles, radius=2, n_bits=2048):
    """Compute Morgan fingerprint bit vectors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Fingerprint length, passed as ``nBits``.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, and one column per
        fingerprint bit. The index is a fresh ``0..n-1`` range (rows are no
        longer all labelled ``0``). An empty input yields an empty DataFrame.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings.
    """
    smiles_list = list(smiles)
    if not smiles_list:
        return pd.DataFrame()

    mols = [Chem.MolFromSmiles(s) for s in smiles_list]

    # MolFromSmiles signals a parse failure by returning None; fail early with
    # the offending inputs instead of an opaque error from the fingerprint call.
    unparsed = [s for s, mol in zip(smiles_list, mols) if mol is None]
    if unparsed:
        raise ValueError(
            f"Could not parse {len(unparsed)} SMILES string(s): {unparsed[:5]!r}"
        )

    # Collect every fingerprint first and build the frame once; concatenating
    # inside the loop copies the growing frame on every iteration.
    fingerprints = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
        for mol in mols
    ]
    return pd.DataFrame(np.vstack(fingerprints))


In [None]:
# Build the fingerprint table for all SMILES strings; the bare name on the
# last line makes the notebook render the resulting frame.
df_molfps = generate_features(smiles=df_smiles)
df_molfps


In [None]:
# Exploratory fit, used only for determining the number of components.
# random_state is pinned because PCA's automatic solver selection can choose
# the randomized SVD for large inputs, which would otherwise make the scree
# plot and the reported variance differ from run to run.
pca_c = PCA(n_components=200, whiten=True, random_state=0)
molpfs_pca_c = pca_c.fit_transform(df_molfps)

# 1-based component numbers for the x-axis.
PC_values = np.arange(pca_c.n_components_) + 1

# Scree plot: fraction of variance explained by each component.
fig, ax = plt.subplots()
ax.plot(PC_values, pca_c.explained_variance_ratio_,
        'o-', linewidth=2, color='blue')
ax.set_title('PCA_components')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
plt.show()

print("Components = ", pca_c.n_components_, ";\nTotal explained variance = ",
      round(pca_c.explained_variance_ratio_.sum(), 5))

# Keep the first `ndims` principal components (100, to compare against the
# 100 mol2vec features). `ndims` drives both the PCA and the output file
# name, so the two cannot drift apart.
ndims = 100

# random_state is pinned so the exported features are reproducible even when
# PCA's automatic solver selection picks the randomized SVD.
pca = PCA(n_components=ndims, whiten=True, random_state=0)
# fit the model to our data and extract the results
molpfs_pca = pca.fit_transform(df_molfps)

print("Components = ", pca.n_components_, ";\nTotal explained variance = ",
      round(pca.explained_variance_ratio_.sum(), 5))

df_molpfs_pca = pd.DataFrame(molpfs_pca)

# Attach the target by position rather than by index label: a label-based
# assignment would silently insert NaNs if the two indexes ever differed.
if len(df_esol) != len(df_molpfs_pca):
    raise ValueError(
        f"Row count mismatch: {len(df_molpfs_pca)} PCA rows vs "
        f"{len(df_esol)} target values"
    )
df_molpfs_pca["y"] = df_esol["y"].to_numpy()

df_molpfs_pca.to_csv(f"../data/processed/esol_pca_{ndims}dims.csv")

# Preview goes last so the notebook actually renders it.
df_molpfs_pca.head(5)
