In [4]:
import pandas as pd
import numpy as np
import warnings
import yaml
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn import preprocessing
warnings.filterwarnings("ignore")

with open("configuration.yaml", "r") as yml_file:
    config = yaml.load(yml_file, yaml.Loader)

In [5]:
data = pd.read_csv('data/final_dataset.csv')

for col in data.columns[data.isna().any()].tolist():
    data[col].fillna(0, inplace=True)

# data['TF_binding_site_agg'] = np.logical_or(data['TF_binding_site'], data['TF_binding_site_variant']).astype(int)

# data['TF_loss_add'] = data['TF_binding_site_agg'] + data['TF_loss']
# data['TF_gain_add'] = data['TF_binding_site_agg'] + data['TF_gain']
# data['TF_loss_diff_add'] = data['TF_binding_site_agg'] + data['TF_loss_diff']
# data['TF_gain_diff_add'] = data['TF_binding_site_agg'] + data['TF_gain_diff']

data['SpliceAI_pred_DP_AG'] = abs(data['SpliceAI_pred_DP_AG'])
data['SpliceAI_pred_DP_AL'] = abs(data['SpliceAI_pred_DP_AL'])
data['SpliceAI_pred_DP_DG'] = abs(data['SpliceAI_pred_DP_DG'])
data['SpliceAI_pred_DP_DL'] = abs(data['SpliceAI_pred_DP_DL'])


data_test = data[(data['data_source'] == 'Rheinbay et al 2020') | (data['data_source'] == 'Dr.Nod 2023')]
len_test_data = len(data_test)
data_test = pd.concat([data_test, data[data['data_source'] == 'COSMIC'].sample(n=len_test_data)]).reset_index(drop=True)   # get an equal amount of negative data
data = data.drop(data_test.index, inplace=False).reset_index(drop=True, inplace=False)

In [30]:
XGB_PARAMS = {                                            # CODE SOURCE: containers_build\boostdm\config.py
        "objective": "binary:logistic",
        "reg_lambda": 1,
        "random_state": 42,
        "scale_pos_weight": 1,
        "subsample": 0.7,        # fraction of observations to be random samples for each tree
        "reg_alpha": 0,          # L1 regularization term on weight
        "max_delta_step": 0,    # positive value can help make the update step more conservative. generally not used
        "min_child_weight": 1,
        "learning_rate": 1e-03,
        "colsample_bylevel": 1.0,
        "gamma": 0,     # specifies the minimum loss reduction required to make a split. Makes the algorithm conservative
        "colsample_bytree": 1.0,        # fraction of columns to be random samples for each tree
        "booster": "gbtree",
        "max_depth": 4, # Used to control over-fitting as higher depth will allow the model to learn relations very specific to a particular sample
        "silent": 1,
        "seed": 21,
        # "eval_metric": 'logloss',
        # "early_stopping_rounds": 2000
        # "reg_lambda": 1,  # explore this further

}

COLUMNS_TRAINING = config['COLUMNS_TRAINING']

BIASED_COLUMNS = ['chr', 'ref_x', 'IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_J_pseudogene']

COLUMNS_TRAINING = [x for x in COLUMNS_TRAINING if x not in BIASED_COLUMNS]

COLUMNS_SHAP = [f'my_shap_{x}' for x in COLUMNS_TRAINING]
COLUMNS_TRAINING = COLUMNS_TRAINING[:10]

for col in list(set(COLUMNS_TRAINING) - set(data.columns)):
    data[col] = 0

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

data[COLUMNS_TRAINING] = min_max_scaler.fit_transform(data[COLUMNS_TRAINING])

for col in list(set(COLUMNS_TRAINING) - set(data_test.columns)):
    data_test[col] = 0

data_test[COLUMNS_TRAINING] = min_max_scaler.fit_transform(data_test[COLUMNS_TRAINING])

In [31]:
x_train, x_test, y_train, y_test = train_test_split(data[COLUMNS_TRAINING], data['driver'],
                                                    random_state=104, 
                                                    test_size=0.25, 
                                                    shuffle=True)

In [33]:
params = XGB_PARAMS.copy()                                          
params['n_estimators'] = 20000  # set it high enough to allow "early stopping" events below
params['base_score'] = y_train.mean()
params['silent'] = True
# params['n_jobs'] = 1
params['seed'] = 104
model = XGBClassifier(**params)

# Build step forward feature selection
sfs1 = sfs(model,
           k_features=(5, 10),
           forward=True,
           floating=False,
           verbose=1,
           scoring='accuracy',
           cv=5,
           n_jobs=4)

sfs1 = sfs1.fit(x_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.2min finished
Features: 1/10[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:  1.2min finished
Features: 2/10[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:  1.5min finished
Features: 3/10[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   7 out of   7 | elapsed:  1.3min finished
Features: 4/10[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed:  1.1min finished
Features: 5/10[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   54.4s finished
Features: 6/10[Parallel(n_jobs=4)]: Using backend LokyBacken

In [34]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[0, 1, 2, 3, 4]


In [41]:
x_train.iloc[:, feat_cols]

Unnamed: 0,ada_score,rf_score,ENSP,UNIPARC,GO
297,0.000000,0.000000,0.016393,0.020833,0.351852
179,0.000000,0.000000,0.114754,0.145833,0.216049
554,0.000000,0.000000,0.016393,0.020833,0.006173
583,0.000000,0.000000,0.163934,0.166667,0.055556
137,0.000000,0.000000,0.081967,0.104167,0.049383
...,...,...,...,...,...
654,0.000000,0.000000,0.098361,0.125000,0.086420
251,0.000000,0.000000,0.016393,0.020833,0.611111
729,0.000000,0.000000,0.016393,0.020833,0.061728
705,0.000000,0.000000,0.147541,0.166667,0.166667
