In [1]:
import numpy as np
import pandas as pd

In [2]:
import consts

In [3]:
import xgboost as xgb

In [4]:
# Read the CSV into a DataFrame, using its first column as the row index,
# then display the frame as the cell output.
DATASET_PATH = 'data/arc_red_zero.csv'
dataset = pd.read_csv(DATASET_PATH, index_col=0)
dataset

Unnamed: 0,Floral,Fruity,Herbal,Green,Woody,Sweet,Balsamic,Earth,Spicy,Animalic,...,SZ,n5ARing,ETA_dEpsilon_A,NtN,n11AHRing,n8FHRing,n11ARing,n3HRing,n8FARing,n11FHRing
107,0,0,0,0,0,1,1,0,0,0,...,13.333333,0,0.149206,0,0,0,0,0,0,0
126,0,0,0,0,1,0,1,0,0,0,...,10.666667,0,0.208889,0,0,0,0,0,0,0
174,0,0,0,0,0,0,0,1,0,0,...,5.666667,0,0.199048,0,0,0,0,0,0,0
177,0,0,0,0,0,1,0,0,0,0,...,4.000000,0,0.171861,0,0,0,0,0,0,0
179,0,0,0,0,0,1,0,0,0,0,...,8.000000,0,0.160952,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143411647,0,0,1,0,0,0,0,0,1,0,...,14.666667,0,0.000000,0,0,0,0,0,0,0
143963103,0,0,0,0,0,0,0,0,0,1,...,18.666667,0,0.060000,0,0,0,0,0,0,0
144116082,1,0,0,1,1,0,0,0,0,0,...,18.333333,0,0.034211,0,0,0,0,0,0,0
145793235,1,1,0,0,0,0,0,0,0,0,...,19.333333,0,0.045845,0,0,0,0,0,0,0


In [5]:
# Pull both label-column lists from the project constants module:
#   possible_target_cols - used below to locate where the feature columns start
#   target_cols          - the label columns selected for training/evaluation
possible_target_cols, target_cols = consts.POSSIBLE_TARGET_COLS, consts.TARGET_COLS
target_cols

array(['Herbal', 'Green', 'Woody', 'Sweet', 'Balsamic', 'Earth', 'Spicy'],
      dtype='<U8')

In [10]:
# Feature columns are selected by position: skip one column per entry in
# possible_target_cols, then take the next consts.ZERO_NUM_FEATURE_COLS_999 columns.
# NOTE(review): this assumes the label columns come first in `dataset` -- confirm
# against the CSV layout.
n_label_cols = possible_target_cols.shape[0]
feature_cols = dataset.columns[n_label_cols:n_label_cols + consts.ZERO_NUM_FEATURE_COLS_999]
feature_cols

Index(['LabuteASA', 'AATS1dv', 'Xpc-4dv', 'AMID_C', 'ETA_shape_p', 'ATSC5dv',
       'MATS7Z', 'MATS8Z', 'MATS6pe', 'AATSC8pe',
       ...
       'ATS3s', 'GATS8v', 'ATS4s', 'SLogP', 'MATS1dv', 'Xpc-6d', 'GATS6p',
       'ATSC4c', 'AATSC6i', 'StN'],
      dtype='object', length=311)

In [11]:
# Keep only the rows that have at least one non-zero value among the target columns.
has_any_label = dataset[target_cols].ne(0).any(axis=1)
positive_dataset = dataset.loc[has_any_label]

# Plain arrays for the model: one row per kept sample.
features = positive_dataset[feature_cols].values
labels = positive_dataset[target_cols].values

# Print both shapes, one per line.
print(features.shape, labels.shape, sep='\n')

(1811, 311)
(1811, 7)


In [12]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, hamming_loss
from sklearn.model_selection import KFold


def _micro_recall(y_true, y_pred):
    """Recall computed with average='micro'."""
    return recall_score(y_true, y_pred, average='micro')


def _micro_f1(y_true, y_pred):
    """F1 score computed with average='micro'."""
    return f1_score(y_true, y_pred, average='micro')


# Metric name -> scorer, each called as scorer(true_labels, predicted_labels).
# NOTE(review): with 2-D (multilabel) targets, sklearn's accuracy_score is
# exact-match (subset) accuracy -- see the sklearn docs.
METRIC_FUNCTIONS = {
    'acc': accuracy_score,
    'hamm': hamming_loss,
    'sens': _micro_recall,
    'f1': _micro_f1,
}
# Order in which the metrics are evaluated and printed.
METRICS = ['acc', 'hamm', 'sens', 'f1']

# Shuffled K-fold splitter; fold count and seed come from the constants module.
kf = KFold(n_splits=consts.KFOLDS, shuffle=True, random_state=consts.KFOLD_SEED)

# One list of per-fold scores for every metric name.
val_stats = {metric: [] for metric in METRICS}

for fold, (train_idx, val_idx) in enumerate(kf.split(features, labels)):
    # Slice out this fold's training and validation rows.
    train_features = features[train_idx]
    train_labels = labels[train_idx]
    val_features = features[val_idx]
    val_labels = labels[val_idx]

    # A fresh classifier is fitted for every fold.
    clf = xgb.XGBClassifier(tree_method='hist')
    clf.fit(train_features, train_labels)

    # val_preds is overwritten each iteration, so after the loop it holds
    # the predictions of the final fold only.
    val_preds = clf.predict(val_features)

    print(f'----- Fold {fold} -----')
    for metric in METRICS:
        scorer = METRIC_FUNCTIONS[metric]
        val_metric = scorer(val_labels, val_preds)
        val_stats[metric].append(val_metric)
        print(f'{metric}:\t', val_metric)

----- Fold 0 -----
acc:	 0.29476584022038566
hamm:	 0.17394726485635575
sens:	 0.4338919925512104
f1:	 0.513215859030837
----- Fold 1 -----
acc:	 0.2596685082872928
hamm:	 0.18389897395422258
sens:	 0.4059233449477352
f1:	 0.4999999999999999
----- Fold 2 -----
acc:	 0.2596685082872928
hamm:	 0.18153117600631413
sens:	 0.41563055062166965
f1:	 0.5043103448275863
----- Fold 3 -----
acc:	 0.24861878453038674
hamm:	 0.1850828729281768
sens:	 0.3956442831215971
f1:	 0.481767955801105
----- Fold 4 -----
acc:	 0.2430939226519337
hamm:	 0.17916337805840568
sens:	 0.39377289377289376
f1:	 0.48642533936651583


In [13]:
# Report the across-fold average of every metric collected in val_stats.
print('----- Mean -----')
for metric in METRICS:
    print(f'{metric}:\t', np.mean(val_stats[metric]))

----- Mean -----
acc:	 0.26116311279545834
hamm:	 0.18072473316069498
sens:	 0.40897261300302123
f1:	 0.4971438998052088


In [None]:
# Reference numbers pasted as notes (kept as comments so this cell is valid Python).
# RED MEDIAN
# ----- Mean -----
# acc:	 0.2600596624202852
# hamm:	 0.18127743677718564
# sens:	 0.41501479804822017
# f1:	 0.5001107014669101

# RED ZERO
# ----- Mean -----
# acc:	 0.26834543323744725
# hamm:	 0.17859588929403092
# sens:	 0.4186999733273475
# f1:	 0.5060110832598637

In [37]:
# Mean of each column of val_preds, printed one per line in column order.
# Looping over val_preds.shape[1] replaces seven hard-coded column indices,
# so the cell keeps working if the number of label columns changes.
# Note: val_preds is reassigned on every fold of the cross-validation loop,
# so these values describe the last fold only.
for col_idx in range(val_preds.shape[1]):
    print(val_preds[:, col_idx].mean())

0.2569060773480663
0.26243093922651933
0.17679558011049723
0.12154696132596685
0.07458563535911603
0.016574585635359115
0.03867403314917127
