In [None]:
import pandas as pd
import polars as pl
from rdkit import Chem

In [None]:
# Read the training and test tables from CSV files under data/ into polars DataFrames.
training_df = pl.read_csv('data/training_smiles.csv')
test_df = pl.read_csv('data/test_smiles.csv')

## Feature engineering

In [None]:
from rdkit.Chem import rdMolDescriptors as d
import rdkit.Chem.Fragments as f
from rdkit.Chem import Lipinski as l
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import numpy as np
# from rdkit.Chem import rdFingerprintGenerator

# Define a function to calculate all features
def calculate_all_features(smiles):
    """Compute the scalar RDKit descriptors used as model features for one molecule.

    Parameters
    ----------
    smiles : str or None
        SMILES string describing the molecule.

    Returns
    -------
    tuple
        ``(mol_weight, fragments, lipinski)`` where ``mol_weight`` is the value of
        ``rdMolDescriptors.CalcExactMolWt``, ``fragments`` is the value of
        ``Fragments.fr_Al_COO`` and ``lipinski`` is the value of
        ``Lipinski.HeavyAtomCount``. ``(None, None, None)`` is returned when
        ``smiles`` is ``None`` or RDKit cannot parse it.
    """
    # A missing value cannot be parsed; report it the same way as an invalid SMILES.
    if smiles is None:
        return None, None, None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None, None, None

    # No fingerprint generator is built here: the fingerprint feature is not
    # returned, so constructing one per call was pure overhead.
    mol_weight = d.CalcExactMolWt(mol)
    lipinski = l.HeavyAtomCount(mol)
    fragments = f.fr_Al_COO(mol)

    return mol_weight, fragments, lipinski

# Featurise every molecule exactly once and then split the resulting tuples into
# columns. Calling calculate_all_features separately for each output column
# would parse every SMILES string three times.
feature_rows = [
    # Null SMILES entries become null features instead of being passed to RDKit.
    (None, None, None) if smiles is None else calculate_all_features(smiles)
    for smiles in training_df["SMILES"]
]

training_df = training_df.with_columns(
    pl.Series("MolWeight", [row[0] for row in feature_rows], dtype=pl.Float64),
    pl.Series("Fragments", [row[1] for row in feature_rows], dtype=pl.Int64),
    pl.Series("Lipinski-HAcount", [row[2] for row in feature_rows], dtype=pl.Int64),
)

In [None]:
# Display only the three engineered feature columns to check the featurisation output.
training_df.select("MolWeight", "Fragments", "Lipinski-HAcount")

## Make prediction

In [None]:
from sklearn.model_selection import train_test_split

# Select the feature columns by name. Tuple-style indexing on a polars DataFrame
# is ambiguous (a 2-tuple means row/column selection), so `select` is explicit.
X = training_df.select(['MolWeight', 'Fragments', 'Lipinski-HAcount'])
y = training_df[['ACTIVE']]

# Hold out 20% for validation.
# - random_state makes the split reproducible, matching the seed used by the models.
# - stratify keeps the ACTIVE class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
    stratify=training_df['ACTIVE'].to_numpy(),
)

In [None]:
# Tabulate how often each ACTIVE value occurs.
value_counts = training_df['ACTIVE'].value_counts()
print("Raw counts:", value_counts, sep="\n")

# Ratio of negative (0.0) to positive (1.0) examples, extracted as a Python scalar.
negative_count = value_counts.filter(pl.col("ACTIVE") == 0.0)["count"]
positive_count = value_counts.filter(pl.col("ACTIVE") == 1.0)["count"]
ratio = (negative_count / positive_count).item()
print("\nCalculated ratio (negative/positive):", ratio, sep="\n")

In [None]:
# Use GridSearchCV to find the best parameters for XGBoost
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Hyper-parameter grid explored by the search (every combination is fitted
# once per cross-validation fold).
# NOTE(review): per the XGBoost parameter docs, 'binary:hinge' predicts hard
# 0/1 values rather than probabilities, so its 'roc_auc' score is computed from
# labels only - confirm it should stay in the grid.
params = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [5, 10, 20, 50, 100],
    'objective': ['binary:logistic', 'binary:hinge', 'binary:logitraw']
}

# The final model is fitted with scale_pos_weight=ratio, so the search uses the
# same class weighting; otherwise the selected parameters would be tuned for a
# different model than the one that is finally trained. The weight is fixed on
# the estimator (not added to the grid) so best_params keeps the same keys.
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='auc', scale_pos_weight=ratio),
    param_grid=params,
    cv=5,
    scoring='roc_auc',
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

In [None]:
from xgboost import XGBClassifier

# Build the final XGBoost classifier from the grid-search winners, weighting the
# positive class by `ratio` and fixing the seed.
# Constructor usage taken from https://xgboost.readthedocs.io/en/stable/get_started.html
bst = XGBClassifier(random_state=42, eval_metric='auc', scale_pos_weight=ratio, **best_params)

In [None]:
# Show the training features and labels as the cell output (a tuple of both).
X_train, y_train

In [None]:
# Fit the XGBoost classifier `bst` on the training split.
bst.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Per-class weights: label 0.0 gets weight 1 and label 1.0 gets `ratio`
# (the negative/positive count ratio).
class_weights = {0.0: 1, 1.0: ratio}

# Logistic-regression model with a fixed seed and the class weights above.
clf = LogisticRegression(random_state=42, class_weight=class_weights)

In [None]:
# Fit the logistic-regression classifier `clf` on the training split.
clf.fit(X_train, y_train)

## Evaluation

### Logistic Regression

In [None]:
# Score the training split with the positive-class probability (column 1 of
# predict_proba). ROC/AUC needs a continuous score: hard labels from `predict`
# collapse the curve to a single operating point.
clf_train_preds = clf.predict_proba(X_train)[:, 1]

In [None]:
from sklearn import metrics

# ROC curve points for `clf_train_preds` against the training labels, and the
# area under that curve; the trailing expression displays the AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_train, clf_train_preds)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve held in `fpr`/`tpr`, labelled with `auc`, through the
# object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc="lower right")
plt.show()

In [None]:
# Score the held-out split with the positive-class probability (column 1 of
# predict_proba). ROC/AUC needs a continuous score: hard labels from `predict`
# collapse the curve to a single operating point.
# NOTE(review): `rf` is kept bound to the same array as before; it looks like a
# leftover alias - confirm nothing uses it before removing it.
val_preds = rf = clf.predict_proba(X_test)[:, 1]

In [None]:
from sklearn import metrics
# ROC curve points for `val_preds` against the held-out labels, and the area
# under that curve; the trailing expression displays the AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_test, val_preds)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve held in `fpr`/`tpr`, labelled with `auc`, through the
# object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc="lower right")
plt.show()

### XGBoost

In [None]:
# Score the training split with the positive-class output (column 1 of
# predict_proba). ROC/AUC needs a continuous score: hard labels from `predict`
# collapse the curve to a single operating point.
train_preds = bst.predict_proba(X_train)[:, 1]

In [None]:
from sklearn import metrics

# ROC curve points for `train_preds` against the training labels, and the area
# under that curve; the trailing expression displays the AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_preds)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve held in `fpr`/`tpr`, labelled with `auc`, through the
# object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc="lower right")
plt.show()

In [None]:
# Score the held-out split with the positive-class output (column 1 of
# predict_proba). ROC/AUC needs a continuous score: hard labels from `predict`
# collapse the curve to a single operating point.
val_preds = bst.predict_proba(X_test)[:, 1]

In [None]:
from sklearn import metrics

# ROC curve points for `val_preds` against the held-out labels, and the area
# under that curve; the trailing expression displays the AUC.
fpr, tpr, thresholds = metrics.roc_curve(y_test, val_preds)
auc = metrics.auc(fpr, tpr)
auc

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve held in `fpr`/`tpr`, labelled with `auc`, through the
# object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate (FPR)',
    ylabel='True Positive Rate (TPR)',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc="lower right")
plt.show()