In [1]:
import pandas as pd

In [2]:
from src.features.smiles_to_feature_vector import generate_feature_vectors_from_smiles

In [3]:
import numpy as np

# ESOL-Features

In [3]:
# Load the raw ESOL table; the path is relative to the notebook's working directory.
df_esol = pd.read_csv("../data/raw/esol.csv")

In [8]:
# Remove the identifier, the precomputed ESOL prediction and the SMILES string,
# leaving only the descriptor columns and the measured target.
df_features = df_esol.drop(
    columns=[
        'Compound ID',
        "ESOL predicted log solubility in mols per litre",
        "smiles",
    ]
)

In [12]:
# Give the measured solubility the generic target name "y".
df_features = df_features.rename(
    {'measured log solubility in mols per litre': 'y'},
    axis="columns",
)

In [14]:
# Persist the ESOL feature table; to_csv() also writes the index as an unnamed first column.
# NOTE(review): this file lands in the current working directory, whereas the other
# datasets in this notebook are written to ../data/processed/ — confirm this is intended.
df_features.to_csv("esol_features_6dims.csv")

In [13]:
# Display the ESOL feature table (descriptor columns plus the target "y").
df_features

Unnamed: 0,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,y
0,1,457.432,7,3,7,202.32,-0.770
1,1,201.225,1,2,2,42.24,-3.300
2,1,152.237,0,0,4,17.07,-2.060
3,2,278.354,0,5,0,0.00,-7.870
4,2,84.143,0,1,0,0.00,-1.330
...,...,...,...,...,...,...,...
1123,1,197.381,0,0,0,0.00,-1.710
1124,1,219.266,1,0,1,71.00,0.106
1125,1,246.359,0,0,7,18.46,-3.091
1126,1,72.151,0,0,1,0.00,-3.180


In [7]:
import os
from rdkit import Chem
from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence

In [56]:
def generate_mol_feature_vector(smiles, model, threshold):
    """Embed one molecule as the mean vector of its substructures known to the model.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    model : gensim Word2Vec model
        Pretrained embedding model; only ``model.wv`` is used.
    threshold : float
        Minimum fraction (0..1) of the molecule's substructure identifiers that
        must be present in ``model.wv`` for the molecule to be embedded.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.wv.vector_size``. The vector is all-NaN
        when the SMILES cannot be parsed, when the molecule yields no usable
        identifiers, or when the known fraction is below ``threshold`` -- so
        callers can stack the results into a frame and drop such rows with
        ``dropna()``.
    """
    # Same 1-D shape as a real embedding, sized from the model rather than a
    # hard-coded 100, so placeholder rows and real rows form a rectangular table.
    nan_vector = np.full(model.wv.vector_size, np.nan)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse.
        return nan_vector

    # Identifiers of the substructures (radius 1) that make up the molecule.
    all_identifiers = mol2alt_sentence(mol, 1)
    existing_identifiers = [
        identifier for identifier in all_identifiers if identifier in model.wv
    ]

    # Nothing to average: also avoids dividing by zero for empty molecules and
    # calling get_mean_vector() with no keys when threshold <= 0.
    if not existing_identifiers:
        return nan_vector

    ratio_existing = len(existing_identifiers) / len(all_identifiers)
    if ratio_existing < threshold:
        return nan_vector

    # get_mean_vector returns the mean of the vectors of the given identifiers.
    return model.wv.get_mean_vector(existing_identifiers)

In [54]:
# Load the pretrained Word2Vec model used for the mol2vec embeddings.
# NOTE(review): the model file is a pickle (.pkl) and gensim's load() unpickles it --
# only load model files that come from a trusted source.
model = word2vec.Word2Vec.load("../models/model_100dim.pkl")

# Lipophilicity

In [4]:
# Load the raw Lipophilicity table; the path is relative to the notebook's working directory.
df_lip = pd.read_csv("../data/raw/Lipophilicity.csv")

In [76]:
# Embed every Lipophilicity molecule; threshold=1 requires all of a molecule's
# identifiers to be known to the model.
vecs = df_lip["smiles"].apply(
    lambda smiles: generate_mol_feature_vector(smiles, model=model, threshold=1)
)

In [77]:
# Stack the per-molecule vectors into a table: one row per molecule.
df_lipo_mol2vec = pd.DataFrame(data=vecs.tolist())

In [78]:
# Discard molecules that could not be embedded (rows containing NaN).
df_lipo_mol2vec = df_lipo_mol2vec.dropna(axis=0, how="any")

In [79]:
# Attach the measured target by index and expose it as "y". join() is a left join
# by default, so rows already dropped from the feature table stay dropped.
# `columns=` is required here: a bare mapping passed to rename() relabels the
# index, which would leave the target column named "exp".
df_lipo_mol2vec = df_lipo_mol2vec.join(df_lip["exp"]).rename(columns={"exp": "y"})

In [81]:
# Persist the Lipophilicity mol2vec table; to_csv() also writes the index as an unnamed first column.
df_lipo_mol2vec.to_csv("../data/processed/lipo_mol2vec_100dims.csv")

## Morgan PCA

In [5]:
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from sklearn.decomposition import PCA

In [9]:
def get_morgan(smiles, dims=100):
    """Return the radius-2 Morgan bit-vector fingerprint of a molecule as a NumPy array.

    ``dims`` is accepted but not used: the fingerprint length is whatever
    ``GetMorganFingerprintAsBitVect`` produces by default.
    """
    molecule = Chem.MolFromSmiles(smiles)
    fingerprint = GetMorganFingerprintAsBitVect(molecule, 2)
    return np.array(fingerprint)
# Fingerprint every Lipophilicity molecule: one row per molecule, one column per bit.
morgan_df = pd.DataFrame(df_lip.smiles.apply(get_morgan).to_list())
# Reduce the fingerprints to 100 principal components. PCA's default solver policy
# can select a randomized SVD for inputs of this size, so the seed is pinned to
# make reruns produce the same components.
pca = PCA(n_components=100, random_state=0)
morgan_pca = pca.fit_transform(morgan_df.to_numpy())

KeyError: 'expt'

In [11]:

# Attach the measured target to the PCA components (aligned on the row index),
# name it "y", then persist the table.
df_morgan_pca = pd.DataFrame(morgan_pca).join(df_lip["exp"])
df_morgan_pca = df_morgan_pca.rename(columns={"exp": "y"})
df_morgan_pca.to_csv("../data/processed/lipo_pca_100dims.csv")

# SAMPL

## mol2vec

In [84]:
# Load the raw SAMPL table; the path is relative to the notebook's working directory.
df_sampl = pd.read_csv("../data/raw/SAMPL.csv")

In [100]:
# Embed every SAMPL molecule; threshold=1 requires all of a molecule's
# identifiers to be known to the model.
vecs = df_sampl["smiles"].apply(
    lambda smiles: generate_mol_feature_vector(smiles, model=model, threshold=1)
)

In [101]:
# Stack the per-molecule vectors into a table: one row per molecule.
df_sampl_mol2vec = pd.DataFrame(data=vecs.tolist())

In [102]:
# Discard molecules that could not be embedded (rows containing NaN).
df_sampl_mol2vec = df_sampl_mol2vec.dropna(axis=0, how="any")

In [103]:
# Attach the measured target by index (left join keeps only the surviving rows),
# then expose it under the generic name "y".
df_sampl_mol2vec = df_sampl_mol2vec.join(df_sampl["expt"])
df_sampl_mol2vec = df_sampl_mol2vec.rename(columns={"expt": "y"})

In [104]:
# Display the SAMPL mol2vec table (embedding columns plus the target "y").
df_sampl_mol2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,y
0,0.016214,0.012790,-0.017872,-0.028390,-0.042108,-0.051491,-0.083414,-0.109482,-0.073367,-0.094714,...,0.026665,0.011694,-0.092414,0.101662,-0.168516,0.016477,0.157401,-0.094769,-0.019061,-11.01
2,0.049608,0.014997,-0.059075,0.041484,-0.029408,-0.113015,-0.027773,-0.047708,0.033426,-0.056274,...,-0.018149,0.083356,-0.091566,0.065267,-0.072673,0.042162,0.140899,-0.200474,0.017231,1.83
3,0.011491,0.012471,-0.064186,0.009589,-0.065068,-0.038959,-0.057275,-0.074527,-0.049159,-0.123039,...,0.072520,-0.048585,-0.091947,0.071382,-0.155135,0.003637,0.107502,-0.094570,-0.009354,-5.45
4,0.092693,0.010417,0.003927,0.010841,-0.055614,-0.079625,-0.094156,-0.074355,-0.032279,-0.102059,...,0.082309,0.069291,-0.234453,0.067601,-0.075477,-0.034403,0.072965,-0.149033,-0.062987,-4.21
5,0.002142,0.046737,-0.057976,-0.044081,-0.075069,-0.038647,-0.084356,-0.068500,-0.059625,-0.100704,...,0.048821,-0.056462,-0.084724,0.152715,-0.205727,-0.027942,0.184688,-0.082949,-0.009053,-6.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,0.082068,-0.002458,0.006856,0.005336,-0.028064,-0.098318,-0.087953,-0.099401,-0.054404,-0.062015,...,0.052204,0.067428,-0.172817,0.070968,-0.091630,0.026654,0.105211,-0.125781,-0.038776,-2.04
638,0.061097,0.045054,0.065500,-0.085900,-0.070617,-0.025826,-0.033477,-0.046722,0.000595,-0.089824,...,0.008996,-0.101969,-0.062587,0.024075,-0.152772,0.063236,0.056456,-0.162120,0.038816,-5.48
639,-0.003687,0.029495,-0.086191,-0.033442,-0.086723,-0.040470,-0.066362,-0.068041,-0.040690,-0.093902,...,0.026486,-0.032759,-0.128811,0.096059,-0.210208,-0.023171,0.170117,-0.095462,-0.040933,-8.83
640,0.084672,0.014277,0.013805,0.028286,-0.040380,-0.078853,-0.082111,-0.050337,-0.053138,-0.109502,...,0.092574,0.048336,-0.205517,0.063352,-0.088814,-0.010362,0.075952,-0.122459,-0.053626,0.29


In [105]:
# Persist the SAMPL mol2vec table; to_csv() also writes the index as an unnamed first column.
df_sampl_mol2vec.to_csv("../data/processed/sampl_mol2vec_100dims.csv")

## Morgan PCA

In [114]:
# Fingerprint every SAMPL molecule: one row per molecule, one column per bit.
morgan_df = pd.DataFrame(df_sampl.smiles.apply(get_morgan).to_list())
# Reduce the fingerprints to 100 principal components. PCA's default solver policy
# can select a randomized SVD for inputs of this size, so the seed is pinned to
# make reruns produce the same components.
pca = PCA(n_components=100, random_state=0)
morgan_pca = pca.fit_transform(morgan_df.to_numpy())
# Attach the measured target (aligned on the row index), name it "y", and persist.
df_morgan_pca = pd.DataFrame(morgan_pca).join(df_sampl["expt"]).rename(columns={"expt": "y"})
df_morgan_pca.to_csv("../data/processed/sampl_pca_100dims.csv")