In [None]:
import pandas as pd
import polars as pl
from rdkit import Chem

In [None]:
# Read the training and test SMILES tables from CSV into polars DataFrames.
training_df = pl.read_csv('data/training_smiles.csv')
test_df = pl.read_csv('data/test_smiles.csv')

## Sanity check

In [None]:
# Is this binary? Yes
# (lists the distinct values present in the ACTIVE label column)
training_df['ACTIVE'].unique()

In [None]:
# Display the training frame for a visual check of its columns and rows.
training_df

In [None]:
# Take the first SMILES string of the training set as a worked example.
first_entry = training_df['SMILES'].first()

# Parse it into an RDKit molecule and display it.
# NOTE: MolFromSmiles returns None when the string cannot be parsed.
m1 = Chem.MolFromSmiles(first_entry)
m1

In [None]:
# Atom count of the example molecule as reported by RDKit.
m1.GetNumAtoms()

In [None]:
import rdkit.Chem.rdMolDescriptors as d

# Exact molecular weight of the example molecule.
d.CalcExactMolWt(m1)

## Feature engineering

In [None]:
from rdkit.Chem import rdMolDescriptors as d
import rdkit.Chem.Fragments as f
from rdkit.Chem import Lipinski as l
# from rdkit.Chem import rdFingerprintGenerator

# Define a function to calculate all features
def calculate_all_features(smiles):
    """Compute the numeric descriptors used as model features for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    tuple
        ``(mol_weight, fragments, lipinski)`` where

        * ``mol_weight`` is ``rdMolDescriptors.CalcExactMolWt`` of the molecule,
        * ``fragments`` is ``Fragments.fr_Al_COO`` of the molecule,
        * ``lipinski`` is ``Lipinski.HeavyAtomCount`` of the molecule.

        ``(None, None, None)`` is returned when RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is not None:
        # Order matters: callers index into this tuple positionally.
        # (A Morgan-fingerprint feature via rdFingerprintGenerator was sketched
        # here as a fourth element but is currently disabled.)
        return (
            d.CalcExactMolWt(molecule),
            f.fr_Al_COO(molecule),
            l.HeavyAtomCount(molecule),
        )

    # Unparseable SMILES: one placeholder per feature.
    return None, None, None

# Compute every descriptor in a single pass over the SMILES column.
# Calling calculate_all_features once per output column would parse each
# molecule (and evaluate all descriptors) three times per row.
feature_dtypes = {
    "MolWeight": pl.Float64,
    "Fragments": pl.Int64,
    "Lipinski-HAcount": pl.Int64,
}

# One (mol_weight, fragments, lipinski) tuple per row; null SMILES are skipped
# (not passed to RDKit) and yield nulls, as do unparseable SMILES.
feature_rows = [
    calculate_all_features(smiles) if smiles is not None else (None, None, None)
    for smiles in training_df["SMILES"]
]

# Tuple position i maps to the i-th entry of feature_dtypes. with_columns
# overwrites same-named columns, so re-running this cell is safe.
training_df = training_df.with_columns([
    pl.Series(name, [row[position] for row in feature_rows], dtype=dtype)
    for position, (name, dtype) in enumerate(feature_dtypes.items())
])

In [None]:
# Inspect the three engineered feature columns.
training_df.select("MolWeight", "Fragments", "Lipinski-HAcount")

## Make prediction

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix (engineered descriptors) and label frame.
feature_columns = ['MolWeight', 'Fragments', 'Lipinski-HAcount']
X = training_df.select(feature_columns)
y = training_df[['ACTIVE']]

# Hold out 20% for validation.
# - random_state pins the shuffle so a Restart & Run All reproduces the same
#   split (and therefore the same metrics).
# - stratify keeps the ACTIVE class proportions identical in both splits,
#   which matters because the label is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
    stratify=training_df['ACTIVE'].to_numpy(),
)

In [None]:
# Class balance of the ACTIVE label over the whole training frame.
value_counts = training_df['ACTIVE'].value_counts()
print("Raw counts:")
print(value_counts)

# Negative-to-positive ratio. Each side is a one-element Series, so the
# quotient is too; `.item()` extracts the scalar.
negative_count = value_counts.filter(pl.col("ACTIVE") == 0.0)["count"]
positive_count = value_counts.filter(pl.col("ACTIVE") == 1.0)["count"]
ratio = negative_count / positive_count
print("\nCalculated ratio (negative/positive):")
print(ratio.item())

In [None]:
from xgboost import XGBClassifier

# Taken from https://xgboost.readthedocs.io/en/stable/get_started.html
# scale_pos_weight is set to the negative/positive class ratio (`ratio`) to
# compensate for the label imbalance.
# NOTE(review): n_estimators=2 / max_depth=2 / learning_rate=1 are the
# quick-start values, presumably placeholders; tune before relying on results.
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, scale_pos_weight=ratio.item(), objective='binary:logistic')

In [None]:
# Inspect the training split (features, labels).
X_train, y_train

In [None]:
# Fit the classifier on the training split.
bst.fit(X_train, y_train)

## Evaluation

In [None]:
# Class-label predictions of the fitted model on the training split.
train_preds = bst.predict(X_train)

In [None]:
from sklearn import metrics

# ROC analysis sweeps a decision threshold over a continuous score. Feeding it
# the hard 0/1 labels from `predict` collapses the curve to a single operating
# point, so the "AUC" would not measure ranking quality. Score with the
# predicted probability of the positive class (column 1) instead.
train_scores = bst.predict_proba(X_train)[:, 1]

fpr, tpr, thresholds = metrics.roc_curve(y_train, train_scores)
metrics.auc(fpr, tpr)

In [None]:
# Class-label predictions of the fitted model on the held-out split.
val_preds = bst.predict(X_test)

In [None]:
from sklearn import metrics

# ROC analysis sweeps a decision threshold over a continuous score. Feeding it
# the hard 0/1 labels from `predict` collapses the curve to a single operating
# point, so the "AUC" would not measure ranking quality. Score with the
# predicted probability of the positive class (column 1) instead.
val_scores = bst.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = metrics.roc_curve(y_test, val_scores)
metrics.auc(fpr, tpr)