# Training Gradient Boosting
Use this notebook for quick experimenting

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from pathlib import Path
import os

def change_directory_to_repo():
    """Changes working directory to the repository root folder."""
    current_dir = Path.cwd()
    for parent in current_dir.parents:
        # Repository is the first folder with the .git folder
        files = list(parent.glob(".git"))
        if files:
            os.chdir(str(parent))

change_directory_to_repo()

from scripts.training import train_gb

In [16]:
def get_dataset(type: str, is_train=True):
    if is_train:
        name = 'train.npz'
    else:
        name = 'val.npz'
    
    if type == 'maccs':
        dataset = np.load('data/features/fingerprints/maccs/' + name)
    elif type == 'morgan':
        dataset = np.load('data/features/fingerprints/morgan/' + name)
    elif type == 'topological':
        dataset = np.load('data/features/fingerprints/topological/' + name)
    else:
        raise NotImplementedError()
    
    X = dataset['X']
    y = dataset['y']
    weights = train_gb.get_weight(y)
    return X, y, weights

In [17]:
def get_classifier(subsample: float, max_depth: int, n_estimators: int):
    gb = GradientBoostingClassifier(
        subsample=subsample,
        max_depth=max_depth,
        n_estimators=n_estimators,
    )
    return gb

In [18]:
def evaluate(classifier, X, y_true):
    y_pred = classifier.predict(X)
    score = roc_auc_score(y_true, y_pred)
    return score

In [19]:
def train_and_eval(parameters):
    X_train, y_train, weights = get_dataset(parameters['fingerprint'])
    classifier = get_classifier(parameters['subsample'], parameters['max_depth'], parameters['n_estimators'])
    classifier.fit(X_train, y_train, sample_weight=weights)
    train_score = evaluate(classifier, X_train, y_train)
    print(f'Train: {train_score}')

    X_val, y_val, _ = get_dataset(parameters['fingerprint'], is_train=False)
    val_score = evaluate(classifier, X_val, y_val)
    print(f'Val: {val_score}')

In [14]:
parameters = {
    'fingerprint': 'topological',
    'subsample': 0.3,
    'max_depth': 3,
    'n_estimators': 300
}

train_and_eval(parameters)

Train: 0.9620964435014022
Val: 0.8268455602902223
