In [16]:
import pandas as pd
import sqlite3
import numpy as np
from plotnine import *

# Connect to Database and Extract Dataset

In [17]:
# Load every assay row that has both IC50 readouts, joined with the compound's
# SMILES string and fingerprint.
conn = sqlite3.connect("sabs_moonshot.db")
try:
    data = pd.read_sql_query("""
SELECT smiles, fingerprint, r_avg_IC50, f_avg_IC50
FROM assays
INNER JOIN compounds ON compounds.id = assays.compound_id
WHERE assays.r_avg_IC50 != "" AND assays.f_avg_IC50 != "";
""", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# pd.to_numeric returns a new Series, so the result must be assigned back for the
# conversion to take effect. errors="coerce" turns anything unparseable
# (including empty strings) into NaN, which makes a separate "" -> NaN replace
# step unnecessary.
data["r_avg_IC50"] = pd.to_numeric(data["r_avg_IC50"], errors="coerce")
data["f_avg_IC50"] = pd.to_numeric(data["f_avg_IC50"], errors="coerce")

# Keep only rows where both IC50 values are below the 99 cutoff.
# NaN compares False, so rows with a missing value are dropped here as well.
data = data[(data["f_avg_IC50"] < 99) & (data["r_avg_IC50"] < 99)]
data

Unnamed: 0,smiles,fingerprint,r_avg_IC50,f_avg_IC50
0,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,0000000000000000000000000000000000000000010000...,14.641091,26.680129
1,O=C(Cc1cncc2ccccc12)Nc1ccccc1,0000000000000000000000000000000000000000010000...,45.077469,57.469670
2,Cc1c(N)cncc1NC(=O)Cc1cccc(Cl)c1,0000000000000000000000000000000000000000010000...,8.201698,4.222340
3,Cc1ccncc1NC(=O)Cc1cccc(Cl)c1,0000000000000000000000000000000000000000010000...,13.895165,24.566619
4,Cc1ccncc1NC(=O)Nc1cccc(Cl)c1,0000000000000000000000000000000000000000000000...,45.689263,64.413611
...,...,...,...,...
656,Cc1ccc(C)c(S(=O)(=O)N2CCN(C(=O)CCl)CC2)c1,0000000000000000000000000000000000000000000000...,1.623369,14.216485
657,O=C(Nc1cncc2ccccc12)C1CCOc2cc(Cl)c(Cl)cc21,0000000000000000000000000000000000000000000000...,0.309337,0.205428
658,O=C(CCl)N1CCN(Cc2cccc(Cl)c2)CC1,0000000000000000000000000000000000000000000000...,0.545448,2.536267
659,O=C(CCl)N1CCN(S(=O)(=O)c2cccc(F)c2)CC1,0000000000000000000000000000000000000000000000...,2.023561,3.666429


## Create Input (X) and Output (Y) NumPy Arrays

In [18]:
# Feature matrix: one row per entry of data["fingerprint"], one integer per
# character of that fingerprint string.
X = np.array([[int(bit) for bit in bits] for bits in data["fingerprint"]])
# Target vector: row-wise mean of the two IC50 columns.
Y = data[["r_avg_IC50", "f_avg_IC50"]].mean(axis=1).to_numpy()

# Perform Feature Selection

### Remove Low Variance Features

In [19]:
from sklearn.feature_selection import VarianceThreshold
# A 0/1 feature that takes the same value in a fraction p of the samples has
# variance p * (1 - p); with p = 0.8 this removes features that are the same
# in more than 80% of the data.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
sel.fit(X)
X = sel.transform(X)
X.shape

(424, 42)

### Remove Highly Correlated Features

In [20]:
# Drop one feature out of every pair whose absolute correlation exceeds 0.95.
X = pd.DataFrame(X)
corr_matrix = X.corr().abs()
# Mask everything except the strict upper triangle (k=1 excludes the diagonal)
# so each feature pair is examined exactly once and self-correlation is ignored.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is dropped when it is highly correlated with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
X = X.drop(columns=to_drop).to_numpy()
X.shape


(424, 40)

### Final Datasets X and Y

In [21]:
# Display the final arrays as the cell output.
X, Y # X is the feature matrix; Y is the per-row mean of r_avg_IC50 and f_avg_IC50

(array([[1, 0, 1, ..., 0, 0, 0],
        [1, 0, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 1],
        ...,
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 1, 0]]),
 array([2.06606100e+01, 5.12735694e+01, 6.21201891e+00, 1.92308921e+01,
        5.50514367e+01, 1.95736654e+01, 2.41433120e+01, 5.25886615e+01,
        5.01270136e+01, 3.17851430e+00, 4.09895500e+00, 6.93546884e+00,
        1.12129633e+01, 1.44338049e+01, 6.49360433e-01, 1.41117183e+00,
        1.03219702e+01, 1.30655514e+00, 1.21195759e+00, 2.40495734e+00,
        2.35152632e+01, 2.44485424e+01, 4.95683803e-01, 2.42818953e+00,
        2.88429016e+00, 2.18294887e+00, 2.42818953e+00, 1.12071953e+01,
        2.38599256e+00, 1.69268759e+01, 9.60926899e+00, 3.92923277e+01,
        1.18245094e+00, 2.42818953e+00, 4.67341847e+00, 1.61230004e+01,
        2.53898763e+01, 1.18061211e+00, 1.65141195e+01, 2.60097438e+01,
        7.91992718e+00, 1.77537601e+00, 2.33451033e+01, 1.41997650e