In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr,spearmanr
from matplotlib.patches import Patch
from sklearn import metrics
from lightgbm import LGBMRegressor

# Model development below
The code below is still scattered, but the goal is to build a `run_model` function that takes in the data, the data type(s), and a drug, and returns a single table comparing all models. The next goal is to iterate over data-type combinations and drugs to produce a complete analysis output.

In [None]:
from load_data import AMLData, cluster_colors

In [None]:
# Single point of access for all data: later cells read their tables
# (`auc_table`, `meta`, `drug_names`) and training sets
# (`get_trainable_data`) from this object.
data = AMLData()

In [None]:
# Drugs of primary interest. They are used to slice the AUC table and are
# always added to `good_drugs`, even when they fail the occupancy/response
# filters applied there.
drugs_to_focus = [
    'Panobinostat',
    'Gilteritinib',
    'Venetoclax',
    'Sorafenib',
    'Quizartinib (AC220)',
    'Trametinib (GSK1120212)',
]

In [None]:
# AUC columns for the focus drugs, keeping only rows that have a value for
# all three of Venetoclax, Panobinostat and Sorafenib (dropna drops a row if
# any of the listed columns is missing).
data.auc_table[drugs_to_focus].dropna(subset=['Venetoclax', 'Panobinostat', 'Sorafenib'])

In [None]:
# Cluster label per row of the metadata table; rows without a cluster are
# dropped. `.dropna()` returns a new Series instead of mutating the column
# selected from `data.meta` in place, so `data.meta` is left untouched and
# re-running this cell always gives the same result.
node_labels = data.meta['Cluster'].dropna()

# Cluster label -> colour, pairing the sorted labels with `cluster_colors`.
# NOTE: zip() stops at the shorter input, so any cluster beyond
# len(cluster_colors) would silently get no colour.
node_lut = dict(zip(sorted(node_labels.unique()), cluster_colors))

# Re-align to the full metadata index; rows with no cluster map to NaN.
node_colors = pd.Series(node_labels, index=data.meta.index.values, name='Cluster').map(node_lut)
# One legend patch per cluster, in `node_lut` order.
handles = [Patch(facecolor=color) for color in node_lut.values()]

In [None]:
# AUC table restricted to the known drug columns (copied so later edits
# cannot touch the source table).
table = data.auc_table[data.drug_names].copy()

# Occupancy filter: drugs with more than 100 non-missing measurements.
drug_counts = table.describe().loc['count']
high_occ_drugs = drug_counts.loc[drug_counts > 100].index.values

# Response filter: drugs with more than 10 measurements below 100.
counts = (table < 100).sum()
responsive_drugs = counts.loc[counts > 10].index.values

# only run single drugs for now (names containing ' - ' are skipped)
drug_solo = [name for name in responsive_drugs if ' - ' not in name]

good_drugs = set(drug_solo) & set(high_occ_drugs)

# Always keep every drug in `drugs_to_focus`, even if it failed a filter.
good_drugs.update(drugs_to_focus)

In [None]:
# Number of drugs kept for modelling (filtered drugs plus the focus drugs).
print(len(good_drugs))

In [None]:
def return_string(x):
    """Serialise an iterable as a single pipe-delimited string.

    Every element is converted with ``str`` and the pieces are joined with
    ``'|'``. An empty iterable yields ``''``.
    """
    return '|'.join(map(str, x))

If we wanted to train the model on each cluster, we would have 7-16 samples per cluster.

In [None]:
def score_all(y_test, preds):
    """Score predictions against observed values.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    preds : array-like
        Predicted values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(rmse, r2, pearson_r, spearman_rho, pearson_p, spearman_p)`` in
        that order: the two correlation coefficients come before their
        p-values.
    """
    # Root of the mean squared error, i.e. RMSE.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))
    r_squared = metrics.r2_score(y_test, preds)
    pearson_r, pearson_p = pearsonr(y_test, preds)
    spearman_rho, spearman_p = spearmanr(y_test, preds)
    return rmse, r_squared, pearson_r, spearman_rho, pearson_p, spearman_p


def run_subset_model(d_sets, drug_name, flt3_only=False,
                     non_flt3_only=False, cluster=None):
    """Fit the gradient-boosted model for one drug on one data-source combo.

    Parameters
    ----------
    d_sets : str or iterable of str
        Data source name, or several names to combine.
    drug_name : str
        Drug whose response is the regression target.
    flt3_only, non_flt3_only, cluster
        Passed straight through to ``data.get_trainable_data``.

    Returns
    -------
    pandas.DataFrame
        One row holding the ``run_lgmb`` result, plus ``source_data`` (the
        source name, or the sorted names joined with ``'_'``) and
        ``drug_name`` columns.
    """
    ds = data.get_trainable_data(
        d_sets,
        drug_name,
        new_format=True,
        flt3_only=flt3_only,
        non_flt3_only=non_flt3_only,
        cluster=cluster,
    )

    print(f"Feature size : {ds.features.shape}")

    # The second argument switches on the diagnostic plot in run_lgmb.
    model_row = run_lgmb(ds, True)

    # A single source keeps its own name; several are sorted so the label
    # does not depend on the order in which they were listed.
    out_name = d_sets if isinstance(d_sets, str) else '_'.join(sorted(d_sets))

    return pd.DataFrame([model_row]).assign(
        source_data=out_name,
        drug_name=drug_name,
    )

In [None]:
# Keyword arguments for LGBMRegressor, shared by every model fit.
# Several options appear twice: once under the native LightGBM name with a
# value and once under another name set to None (e.g. lambda_l1 / reg_alpha,
# bagging_fraction / subsample). Presumably these are alias pairs and the
# None entries keep the wrapper defaults out of the way; confirm against the
# LightGBM parameter documentation before changing either side.
parameters = {
    # --- runtime ---
    'device_type': 'cpu',
    'boosting_type': 'gbdt',
    'num_threads': 8,
    'n_jobs': None,
    # --- objective / evaluation ---
    'objective': 'regression',
    'metric': 'rmse',
    # --- regularisation ---
    'lambda_l1': 1000,
    'lambda_l2': 1,
    'reg_alpha': None,
    'reg_lambda': None,
    # --- tree shape and learning ---
    'learning_rate': 0.1,
    'tree_learner': 'serial',
    'max_bin': 128,
    'num_leaves': 5,
    'max_depth': -1,

    'feature_fraction': 1,  # .8

    # --- row subsampling ---
    'bagging_freq': 1,
    'bagging_fraction': 0.8,
    'subsample': None,
    'subsample_freq': None,

    # --- split constraints ---
    'min_child_weight': 0.2,
    'min_data_in_leaf': 2,
    'min_child_samples': None,
    'min_gain_to_split': None,
    'colsample_bytree': None,
    'min_split_gain': None,
    # --- ensemble size, logging, reproducibility ---
    'n_estimators': 10000,
    'verbose': -1,
    'deterministic': True,
    'random_state': 10,
}

In [None]:
def run_lgmb(ds, plot=False):
    """Fit a LightGBM regressor on ``ds`` and score it on its test split.

    Parameters
    ----------
    ds : object
        Must provide ``features`` (a DataFrame), ``target`` and
        ``train_test_split()`` returning
        ``(x_train, x_test, y_train, y_test)``.
    plot : bool, default False
        When true, draw predicted-vs-actual regression plots for the test
        split and for the training data.

    Returns
    -------
    dict
        ``model``, ``feature_names`` / ``feature_scores`` (features with
        non-zero importance, most important first), ``n_feat``, ``mae``,
        ``r2``, ``pearson``, ``spearman``, ``pr`` and ``sr``.

    Notes
    -----
    * The model is fitted on the FULL ``ds.features`` / ``ds.target``, not
      on the training part of the split. If the test rows are part of
      ``ds.features`` (presumably they are; confirm), the reported scores
      are in-sample and optimistic.
    * The ``'mae'`` entry holds the first value returned by ``score_all``,
      which is computed there as a root mean squared error. The key name is
      kept so existing result tables keep their columns.
    """
    # Only the held-out part of the split is used, for scoring.
    _, x_test, _, y_test = ds.train_test_split()
    x_train, y_train = ds.features, ds.target

    # Keep the column order: feature_importances_ is positional, so names
    # must line up with the columns the model was fitted on. (Going through
    # a set would scramble the order and mislabel every importance.)
    feature_names = list(x_train.columns)
    model_name = 'gbt2'

    lgb_model = LGBMRegressor(**parameters)
    lgb_model.fit(x_train, y_train)

    feats = pd.Series(lgb_model.feature_importances_, index=feature_names)
    feats = feats.sort_values(ascending=False)

    # Features the trees actually used.
    scores = feats[feats > 0]
    selected_feat = scores.index.values

    preds = lgb_model.predict(x_test, num_iteration=lgb_model.best_iteration_)
    error, r2, pearson, spearman, pr, sr = score_all(y_test, preds)

    if plot:
        train_pred = lgb_model.predict(x_train, num_iteration=lgb_model.best_iteration_)
        # x/y are passed by keyword: newer seaborn releases reject them as
        # positional arguments.
        sns.regplot(x=y_test, y=preds, label='test')
        sns.regplot(x=y_train, y=train_pred, label='train')
        plt.title("GBT")
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.legend()
        plt.show()

    return {
        'model': model_name,
        'feature_names': list(selected_feat),
        'feature_scores': list(scores.values),
        'n_feat': len(selected_feat),
        'mae': error,
        'r2': r2,
        'pearson': pearson,
        'spearman': spearman,
        'pr': pr,
        'sr': sr,
    }

In [None]:
# Smoke test: one model for Venetoclax on proteomics + phospho features.
# The bare `x` on the last line displays the one-row result table.
x = run_subset_model(['proteomics', 'phospho'], 'Venetoclax')
x

In [None]:
def run_all(drugs=None, data_sources=None,
            out_path='gbt_all_drugs_full_data_model_features_final.csv'):
    """Run ``run_subset_model`` for every drug / data-source combination.

    Parameters
    ----------
    drugs : iterable of str, optional
        Drugs to model. When omitted, the notebook-level ``drug_subset`` is
        used (the original behaviour), so that variable must already be
        defined when the function is called.
    data_sources : list, optional
        Each entry is a source name or a list of names to combine. When
        omitted, every combination of proteomics, rna_seq, phospho and wes
        is used.
    out_path : str
        CSV file the combined table is written to.

    Returns
    -------
    pandas.DataFrame
        One row per (drug, data source), with ``feature_names`` and
        ``feature_scores`` flattened to ``'|'``-separated strings.
    """
    if data_sources is None:
        data_sources = [
            'proteomics',
            'rna_seq',
            'phospho',
            'wes',
            ['proteomics', 'phospho'],
            ['proteomics', 'rna_seq'],
            ['proteomics', 'wes'],

            ['phospho', 'rna_seq'],
            ['phospho', 'wes'],
            ['rna_seq', 'wes'],

            ['proteomics', 'phospho', 'wes'],
            ['rna_seq', 'phospho', 'wes'],
            ['proteomics', 'phospho', 'rna_seq'],
            ['proteomics', 'wes', 'rna_seq'],

            ['proteomics', 'phospho', 'rna_seq', 'wes'],
        ]
    if drugs is None:
        # Resolved at call time so the function still works when the
        # variable is defined in a later cell.
        drugs = drug_subset

    frames = []
    # Drugs are processed in reverse alphabetical order.
    for drug in sorted(drugs, reverse=True):
        print(f"Working on {drug}")
        for source in data_sources:
            frames.append(run_subset_model(source, drug))

    all_results = pd.concat(frames, ignore_index=True)
    # Lists do not survive a CSV round trip; store them as delimited text.
    all_results['feature_names'] = all_results['feature_names'].str.join('|')
    all_results['feature_scores'] = all_results['feature_scores'].apply(return_string)
    all_results.to_csv(out_path)
    # Return the table so the caller can inspect or display it.
    return all_results
# run_all()

In [None]:
# Focused run: proteomics + phospho features for three drugs only.
data_sources = [['proteomics', 'phospho']]

drug_subset = ['Sorafenib', 'Panobinostat', 'Venetoclax']

# One single-row result frame per (drug, data source); drugs are visited in
# reverse alphabetical order.
per_run_frames = []
for drug in sorted(drug_subset, reverse=True):
    print(f"Working on {drug}")
    per_run_frames.extend(
        run_subset_model(source, drug) for source in data_sources
    )

all_results = pd.concat(per_run_frames, ignore_index=True)
# Flatten the list-valued columns to '|'-separated text for the CSV.
all_results['feature_names'] = all_results['feature_names'].str.join('|')
all_results['feature_scores'] = all_results['feature_scores'].apply(return_string)
all_results.to_csv('gbt_full_data_select_drugs.csv')
all_results.head()