In [1]:
import pandas as pd
import polars as pl
from rdkit import Chem

In [2]:
training_df = pl.read_csv('data/training_smiles.csv')
test_df = pl.read_csv('data/test_smiles.csv')

## Feature engineering

In [3]:
from rdkit.Chem import rdMolDescriptors as d
import rdkit.Chem.Fragments as f
from rdkit.Chem import Lipinski as l
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import numpy as np
# from rdkit.Chem import rdFingerprintGenerator

# Define a function to calculate all features
def calculate_all_features(smiles):
    generator = GetMorganGenerator(radius=2, fpSize=1024)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None, None# , None
    
    # fingerprint = np.array(generator.GetFingerprint(mol))
    mol_weight = d.CalcExactMolWt(mol)
    lipinski = l.HeavyAtomCount(mol)
    fragments = f.fr_Al_COO(mol)
    
    return mol_weight, fragments, lipinski # , fingerprint

# Apply the function to the SMILES column and unpack the results
training_df = training_df.with_columns([
    pl.col("SMILES").map_elements(lambda x: calculate_all_features(x)[0], return_dtype=pl.Float64).alias("MolWeight"),
    pl.col("SMILES").map_elements(lambda x: calculate_all_features(x)[1], return_dtype=pl.Int64).alias("Fragments"),
    pl.col("SMILES").map_elements(lambda x: calculate_all_features(x)[2], return_dtype=pl.Int64).alias("Lipinski-HAcount"),
    # pl.col("SMILES").map_elements(lambda x: calculate_all_features(x)[3], return_dtype=pl.String).alias("MorganFingerprints")
])



In [4]:
training_df.select("MolWeight", "Fragments", "Lipinski-HAcount")

MolWeight,Fragments,Lipinski-HAcount
f64,i64,i64
484.136155,0,31
484.163436,0,36
253.991676,0,16
383.184506,0,28
320.114791,0,22
…,…,…
412.120526,0,29
429.137636,0,32
507.130091,0,34
524.209341,0,37


## Make prediction

In [5]:
from sklearn.model_selection import train_test_split

X = training_df['MolWeight', 'Fragments', 'Lipinski-HAcount']
y = training_df[['ACTIVE']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [6]:
value_counts = training_df['ACTIVE'].value_counts()
print("Raw counts:")
print(value_counts)

# Actual ratio
ratio = (value_counts.filter(pl.col("ACTIVE") == 0.0)["count"] / value_counts.filter(pl.col("ACTIVE") == 1.0)["count"]).item()
print("\nCalculated ratio (negative/positive):")
print(ratio)

Raw counts:
shape: (2, 2)
┌────────┬────────┐
│ ACTIVE ┆ count  │
│ ---    ┆ ---    │
│ f64    ┆ u32    │
╞════════╪════════╡
│ 0.0    ┆ 201005 │
│ 1.0    ┆ 7933   │
└────────┴────────┘

Calculated ratio (negative/positive):
25.33782932055969


In [7]:
from xgboost import XGBClassifier

# Taken from https://xgboost.readthedocs.io/en/stable/get_started.html
bst = XGBClassifier(n_estimators=5, max_depth=11, learning_rate=1, scale_pos_weight=ratio, objective='binary:logistic')

In [8]:
X_train, y_train

(shape: (167_150, 3)
 ┌────────────┬───────────┬──────────────────┐
 │ MolWeight  ┆ Fragments ┆ Lipinski-HAcount │
 │ ---        ┆ ---       ┆ ---              │
 │ f64        ┆ i64       ┆ i64              │
 ╞════════════╪═══════════╪══════════════════╡
 │ 432.229204 ┆ 0         ┆ 30               │
 │ 349.055483 ┆ 0         ┆ 23               │
 │ 343.178358 ┆ 0         ┆ 25               │
 │ 391.054611 ┆ 0         ┆ 27               │
 │ 399.169525 ┆ 0         ┆ 30               │
 │ …          ┆ …         ┆ …                │
 │ 397.202048 ┆ 0         ┆ 29               │
 │ 395.130363 ┆ 0         ┆ 28               │
 │ 274.095357 ┆ 0         ┆ 20               │
 │ 298.19328  ┆ 0         ┆ 22               │
 │ 508.200965 ┆ 0         ┆ 37               │
 └────────────┴───────────┴──────────────────┘,
 shape: (167_150, 1)
 ┌────────┐
 │ ACTIVE │
 │ ---    │
 │ f64    │
 ╞════════╡
 │ 0.0    │
 │ 0.0    │
 │ 0.0    │
 │ 0.0    │
 │ 0.0    │
 │ …      │
 │ 0.0    │
 │ 0.0    │
 │

In [9]:
bst.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

class_weights = {0.0: 1, 1.0: ratio}

clf = LogisticRegression(random_state=42, class_weight=class_weights)

In [36]:
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


## Evaluation

### Random Forest Classifier

In [37]:
clf_train_preds = clf.predict(X_train)

In [38]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_train, clf_train_preds)
metrics.auc(fpr, tpr)

0.4999968899670337

In [39]:
val_preds = rf = clf.predict(X_test)

In [40]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, val_preds)
metrics.auc(fpr, tpr)

0.4999875730085746

### XGBoost

In [31]:
train_preds = bst.predict(X_train)

In [32]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_train, train_preds)
metrics.auc(fpr, tpr)

0.6410364072743184

In [33]:
val_preds = bst.predict(X_test)

In [34]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_test, val_preds)
metrics.auc(fpr, tpr)

0.5901755950692451