# Set up the environment for Google Colab

In [3]:
# Check if directory exists and remove it if it does
import os
import shutil
if os.path.exists('crosstalk-q1-2025'):
    shutil.rmtree('crosstalk-q1-2025')
!git clone https://github.com/cottascience/crosstalk-q1-2025.git
%cd crosstalk-q1-2025
#!pip install -r requirements.txt

Cloning into 'crosstalk-q1-2025'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 178 (delta 7), reused 7 (delta 3), pack-reused 155 (from 1)[K
Receiving objects: 100% (178/178), 33.46 MiB | 23.04 MiB/s, done.
Resolving deltas: 100% (81/81), done.
/content/crosstalk-q1-2025


In [5]:
!pip install -r requirements.txt



# Download the training and test inputs data

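### Download the training and test-input files and upload them to your Google Drive (root folder). The notebook expects them to be named `crosstalk_train.parquet` and `crosstalk_test_inputs.parquet`.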
https://drive.google.com/file/d/11S5p0QgP1X9rOFiIjNSLydLenJwm7hle/view?usp=drive_link
https://drive.google.com/file/d/1qM2ikVMKQZsN_WKJc5w7iAulYWpj2rPB/view?usp=drive_link

In [6]:
import os

from google.colab import drive

# Mount Google Drive so files uploaded to its root folder are visible to the kernel.
drive.mount('/content/drive')

# Location of the training parquet file inside the mounted Drive.
file_path = '/content/drive/My Drive/crosstalk_train.parquet'

# Fail here, with an actionable message, rather than later inside the dataset
# loader if the file was not uploaded or was saved under a different name.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Training file not found at {file_path!r}. Upload crosstalk_train.parquet "
        "to the root folder of your Google Drive or update file_path."
    )

Mounted at /content/drive


# Load the training dataset

In [7]:
from dataset import Dataset

# Fingerprint column used as the feature matrix. Fingerprints available:
# 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
train_dataset = Dataset(
    filename=file_path,
    x_col="AVALON",
)

# Get a smaller subset to make it faster to debug

In [8]:
import numpy as np

# Size of the debugging subset and the seed that makes the draw repeatable.
SUBSET_SIZE = 1000
SUBSET_SEED = 1234

# A seeded generator gives the same subset, and therefore the same downstream
# metrics, on every Restart & Run All.
rng = np.random.default_rng(SUBSET_SEED)

# Never ask for more rows than exist: sampling without replacement raises a
# ValueError when size exceeds the population.
n_available = len(train_dataset.X)
random_indices = rng.choice(
    n_available,
    size=min(SUBSET_SIZE, n_available),
    replace=False,
)

# NOTE: this overwrites the dataset in place; reload it to get the full data back.
train_dataset.X = train_dataset.X[random_indices]
train_dataset.y = train_dataset.y[random_indices]

# Let's train a CatBoost classifier and see how well it fits the training data

In [9]:
import catboost as cb
from eval import BinaryEvaluator

# CatBoost settings, reused by later cells to build fresh classifiers.
params = dict(
    random_strength=2,  # only non-default hyperparam, default is 1
    random_seed=1234,
    verbose=0,
    loss_function='Logloss',
    task_type='GPU',  # requires a GPU runtime
    devices='0',
)

X_train = train_dataset.X
y_train = train_dataset.y

model = cb.CatBoostClassifier(**params)

# NOTE: the name `eval` shadows Python's builtin eval(); it is kept because
# later cells refer to the evaluator by this name.
eval = BinaryEvaluator(X_train, y_train)

model.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the score passed to the evaluator.
# The model is scored here on the same data it was fitted on; swap in a
# validation split to measure generalization.
yp = model.predict_proba(X_train)[:, 1]
train_metrics = eval.compute_metrics(yt=y_train, yp=yp)
print(train_metrics)

{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'roc_auc': 1.0, 'precision': 1.0, 'recall': 1.0, 'mean_reciprocal_rank': 0.071482871751884, 'positives': 67, 'predicted_positives': 67, 'hits_at_5': 0.07462686567164178, 'precision_at_5': 1.0, 'hits_at_10': 0.14925373134328357, 'precision_at_10': 1.0, 'hits_at_30': 0.44776119402985076, 'precision_at_30': 1.0, 'hits_at_67': 1.0, 'precision_at_67': 1.0}


# How well does it generalize though? Let's try 5-fold cross-validation

In [10]:
# Build a fresh classifier from the shared `params` so the cross-validation
# does not reuse the instance that the previous cell fitted.
model = cb.CatBoostClassifier(**params)
# `eval` is the BinaryEvaluator created in the previous cell (it shadows the
# builtin eval). CV_model's internals are not visible here; judging by the
# printed output it returns a dict with per-metric 'mean' and 'std' entries.
res = eval.CV_model(model)
print(res)

{'mean': {'accuracy': 0.929, 'balanced_accuracy': 0.5116208585848054, 'roc_auc': 0.7053522694888918, 'precision': 0.2, 'recall': 0.02967032967032967, 'mrr': 0.08687235408371514, 'precision_at_k_5': 0.36, 'hits_at_k_5': 0.13626373626373628, 'precision_at_k_10': 0.34, 'hits_at_k_10': 0.2549450549450549, 'precision_at_k_30': 0.18666666666666668, 'hits_at_k_30': 0.4186813186813187}, 'std': {'accuracy': 0.0037416573867739447, 'balanced_accuracy': 0.018650667988979207, 'roc_auc': 0.04793981475496594, 'precision': 0.24494897427831783, 'recall': 0.03638009968625681, 'mrr': 0.026016201857305046, 'precision_at_k_5': 0.14966629547095764, 'hits_at_k_5': 0.059927904195323, 'precision_at_k_10': 0.1019803902718557, 'hits_at_k_10': 0.07936470367150676, 'precision_at_k_30': 0.04521553322083512, 'hits_at_k_30': 0.10275753606646996}}


# Let's compare it against simpler sklearn baselines

In [11]:
from eval import get_baseline_models

# Re-create the evaluator on the current training subset (this name shadows
# the builtin eval(), matching the earlier cells).
eval = BinaryEvaluator(train_dataset.X, train_dataset.y)

# Baseline models, indexed by name.
baselines = get_baseline_models()

# Cross-validate every baseline and collect the results under the same name.
baselines_res = {
    name: eval.CV_model(baselines[name])
    for name in baselines
}

print(baselines_res)

{'stratified_dummy': {'mean': {'accuracy': 0.885, 'balanced_accuracy': 0.48859202151422265, 'roc_auc': 0.48859202151422265, 'precision': 0.0375, 'recall': 0.03076923076923077, 'mrr': 0.1203235979706568, 'precision_at_k_5': 0.0, 'hits_at_k_5': 0.03076923076923077, 'precision_at_k_10': 0.02, 'hits_at_k_10': 0.6153846153846153, 'precision_at_k_30': 0.06000000000000001, 'hits_at_k_30': 1.0}, 'std': {'accuracy': 0.013038404810405309, 'balanced_accuracy': 0.018049376776483373, 'roc_auc': 0.018049376776483328, 'precision': 0.049999999999999996, 'recall': 0.03768445758127967, 'mrr': 0.03258959626756458, 'precision_at_k_5': 0.0, 'hits_at_k_5': 0.03768445758127967, 'precision_at_k_10': 0.04, 'hits_at_k_10': 0.4716833759001813, 'precision_at_k_30': 0.032659863237109045, 'hits_at_k_30': 0.0}}, 'most_frequent_dummy': {'mean': {'accuracy': 0.933, 'balanced_accuracy': 0.5, 'roc_auc': 0.5, 'precision': 0.0, 'recall': 0.0, 'mrr': 1.0, 'precision_at_k_5': 0.08, 'hits_at_k_5': 1.0, 'precision_at_k_10': 0

# Our Approach


In [None]:
# Location of the test inputs inside the mounted Google Drive.
test_file_path = '/content/drive/My Drive/crosstalk_test_inputs.parquet' #inputs

# Load the test inputs with the same fingerprint column as the training data.
# The original call passed `file_path` (the training file), so the training set
# was silently loaded as the test set.
# Fingerprints available:
# 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'
test_dataset = Dataset(filename=test_file_path, x_col="AVALON")

print(test_dataset)

