In [None]:
import pandas as pd
import polars as pl
from rdkit import Chem

In [None]:
# Read the training and test SMILES tables from their CSV files.
training_df, test_df = (
    pl.read_csv('data/training_smiles.csv'),
    pl.read_csv('data/test_smiles.csv'),
)

## Sanity check

In [None]:
# Distinct values of the ACTIVE label; inspecting them confirmed the label is binary.
training_df.get_column('ACTIVE').unique()

In [None]:
# Display the training frame for a quick visual inspection.
training_df

In [None]:
# Parse the first SMILES string of the training set into an RDKit molecule
# and show it as the cell output.
first_entry = training_df.get_column('SMILES').first()
m1 = Chem.MolFromSmiles(first_entry)
m1

In [None]:
# Atom count that RDKit reports for the first molecule.
m1.GetNumAtoms()

In [None]:
import rdkit.Chem.rdMolDescriptors as d

# Exact molecular weight of the first molecule, as computed by RDKit.
d.CalcExactMolWt(m1)

## Feature engineering

In [None]:
# Row count per ACTIVE class — the classes are unbalanced.
training_df.get_column('ACTIVE').value_counts()

In [None]:
def calc_mol_descriptors(smiles):
    """Return the exact molecular weight for a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The value of RDKit's ``CalcExactMolWt`` for the parsed molecule, or
        ``None`` when RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else d.CalcExactMolWt(parsed)

# Add a Float64 "MolWeight" column computed row by row from the SMILES column.
training_df = training_df.with_columns(
    MolWeight=pl.col("SMILES").map_elements(
        calc_mol_descriptors,
        return_dtype=pl.Float64,
    )
)

In [None]:
import rdkit.Chem.Fragments as f

def fragments(smiles):
    """Count aliphatic carboxylic-acid fragments in a molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The integer count returned by RDKit's ``fr_Al_COO`` for the parsed
        molecule, or ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # The RDKit name uses a lowercase "l" ("Al" = aliphatic), not a capital "I".
    return f.fr_Al_COO(mol)

# Add the per-molecule fragment count as the "Fragments" column.
# The counts are integers, hence Int64.
training_df = training_df.with_columns(
    pl.col("SMILES").map_elements(fragments, return_dtype=pl.Int64).alias("Fragments")
)

In [None]:
import rdkit.Chem.Lipinski as l

def calc_lipinski(smiles):
    """Return the heavy-atom count for a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The value of RDKit's ``Lipinski.HeavyAtomCount`` for the parsed
        molecule, or ``None`` when RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else l.HeavyAtomCount(parsed)

# Add an Int64 "Lipinski-HAcount" column computed row by row from the SMILES column.
training_df = training_df.with_columns(
    [
        pl.col("SMILES")
        .map_elements(calc_lipinski, return_dtype=pl.Int64)
        .alias("Lipinski-HAcount"),
    ]
)

In [None]:
from rdkit.Chem import rdFingerprintGenerator

# Build the Morgan generator (radius 2, 1024 bits) once instead of once per molecule.
_MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

def calc_fingerprints(smiles):
    """Return the Morgan fingerprint of a molecule as a bit string.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A ``str`` of ``'0'``/``'1'`` characters, one per fingerprint bit, or
        ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # GetFingerprint() yields an RDKit bit-vector object; the caller stores the
    # result in a String column, so convert it to a plain Python str.
    return _MORGAN_GENERATOR.GetFingerprint(mol).ToBitString()

# Add a String "MorganFingerprints" column computed row by row from the SMILES column.
training_df = training_df.with_columns(
    MorganFingerprints=pl.col("SMILES").map_elements(
        calc_fingerprints,
        return_dtype=pl.String,
    )
)

In [None]:
# Preview the engineered feature columns.
training_df.select(
    ["MolWeight", "Fragments", "Lipinski-HAcount", "MorganFingerprints"]
)

## Make prediction

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and label column for the classifier. `select` makes the column
# selection explicit instead of relying on tuple-style indexing of the frame.
# NOTE(review): "MorganFingerprints" is created as a String column; XGBoost
# presumably needs numeric features, so it likely has to be expanded into
# numeric columns before fitting — TODO confirm.
X = training_df.select('MolWeight', 'Fragments', 'Lipinski-HAcount', 'MorganFingerprints')
y = training_df[['ACTIVE']]

# Hold out 20% of the rows. The ACTIVE classes are unbalanced, so stratify on
# the label to keep the class ratio equal in both splits, and fix the seed so
# the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=training_df['ACTIVE'].to_numpy(),
    random_state=42,
)
# Earlier name of the held-out labels, kept as an alias for any cell still using it.
y_val = y_test

In [None]:
from xgboost import XGBClassifier

# Starter hyper-parameters copied from the XGBoost quick-start guide:
# https://xgboost.readthedocs.io/en/stable/get_started.html
bst = XGBClassifier(
    objective='binary:logistic',
    n_estimators=2,
    max_depth=2,
    learning_rate=1,
)

In [None]:
# Display the training features and labels produced by the split.
X_train, y_train

In [None]:
# Fit the classifier on the training split.
bst.fit(X_train, y_train)

In [None]:
# Predict on the held-out features and show the smallest and largest predicted
# value as a quick check on the range of the output.
preds = bst.predict(X_test)
preds.min(), preds.max()