# Ensemble Learning

## Weighted Mean Ensemble

In [None]:
from models.ensemble_weighted_mean import EnsembleWeightedMean

# Three configurations of the same EnsembleWeightedMean class are built here:
#   optimization=True                      -> ensemble_direct_optimization
#   optimization=True, with_features=True  -> ensemble_direct_optimization_with_features
#   optimization=False                     -> ensemble_weigthed_mean
# NOTE(review): what the two flags do is defined in models.ensemble_weighted_mean,
# not in this notebook; the variable names suggest "direct optimization" of the
# weights versus a plain weighted mean -- confirm against the class.
ensemble_direct_optimization = EnsembleWeightedMean(optimization=True)
ensemble_direct_optimization_with_features = EnsembleWeightedMean(optimization=True, with_features=True)
# NOTE(review): 'weigthed' is a misspelling of 'weighted'; it is only used inside this cell.
ensemble_weigthed_mean = EnsembleWeightedMean(optimization=False)

# file name handed unchanged to every fit()/test() call below
data = 'ensemble-pd-hek293t-pe2.csv'

# Each ensemble is fitted and then tested with the same `data` argument.
# The value returned by test() is kept: the next cell unpacks each of them
# into a (pearson, spearman) pair.
ensemble_direct_optimization.fit(data)
direct_optimization_performance = ensemble_direct_optimization.test(data) 

ensemble_weigthed_mean.fit(data)
weighted_mean_performance = ensemble_weigthed_mean.test(data)

ensemble_direct_optimization_with_features.fit(data)
with_features_performance = ensemble_direct_optimization_with_features.test(data)

In [None]:
import numpy as np
import pandas as pd
from os.path import join as pjoin

# Every test() result is unpacked into its Pearson and Spearman part.
direct_op_pearson, direct_op_spearman = direct_optimization_performance
performance_weighted_pearson, performance_weighted_spearman = weighted_mean_performance
performance_with_features_pearson, performance_with_features_spearman = with_features_performance

# Fold the other two results into the direct-optimization mappings, in place.
# update() overwrites the value of any key that is already present, so for
# keys shared between the runs the last merged result wins.
# NOTE(review): presumably the shared keys are the base models with identical
# values in every run, which would make the overwrite harmless -- confirm.
for extra_pearson, extra_spearman in (
    (performance_weighted_pearson, performance_weighted_spearman),
    (performance_with_features_pearson, performance_with_features_spearman),
):
    direct_op_pearson.update(extra_pearson)
    direct_op_spearman.update(extra_spearman)

# The merged mappings are aliased (not copied) under the names used for plotting.
performance_pearson, performance_spearman = direct_op_pearson, direct_op_spearman

# Plot every correlation metric as a bar plot with the individual values
# overlaid as a strip plot, then test each ensemble against each other model.
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

alpha = 0.5
f_size = 12


def _is_ensemble(model):
    """Return True when the model key contains 'opt' or 'pwm'."""
    return 'opt' in model or 'pwm' in model


for name, performance in zip(['Pearson', 'Spearman'], [performance_pearson, performance_spearman]):
    print(name)
    fig, ax = plt.subplots(figsize=(5, 2.5))
    # the y axis is cut to 0.65-0.9 so the bars are shortened and the
    # differences between the models stand out
    ax.set_ylim(0.65, 0.9)

    # ensembles take the next palette colour, every other model is gray;
    # the palette holds three colours, so at most three ensemble keys are supported
    colour_palette = iter(sns.color_palette('icefire', n_colors=3))
    colours = [next(colour_palette) if _is_ensemble(model) else 'gray' for model in performance]

    # the strip plot shows the individual values, the bar plot is drawn on the same axes
    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)
    ax.set_xlabel('Ensemble', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    ax.tick_params(axis='both', which='major', labelsize=f_size)
    ax.tick_params(axis='both', which='minor', labelsize=f_size)
    # remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # dashed horizontal line at the mean of each ensemble, in the matching colour.
    # axhline() takes a single y value, so the per-model values are reduced
    # with np.mean first (as the other plotting cells of this notebook do).
    for ind, model in enumerate(performance):
        if _is_ensemble(model):
            ax.axhline(y=np.mean(performance[model]), color=colours[ind], linestyle='--', linewidth=0.5)
    plt.show()

    # Independent two-sample t-test (scipy.stats.ttest_ind) of every ensemble
    # against every remaining model.
    # NOTE(review): an earlier comment called this a "paired" t-test; if the
    # values are per-fold scores on identical folds, ttest_rel would be the
    # paired variant -- confirm which one is intended before reporting p-values.
    ensemble_values = {model: values for model, values in performance.items() if _is_ensemble(model)}
    rest_values = {model: values for model, values in performance.items() if not _is_ensemble(model)}

    for ensemble, performance_ensemble in ensemble_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')

    # save the figure
    fig.savefig(pjoin('dissertation', 'figures', f'ensemble_{name.lower()}.pdf'), bbox_inches='tight')

In [1]:
# train the performance weighted mean ensemble (optimization and features both switched off)
from models.ensemble_weighted_mean import EnsembleWeightedMean

# performance weighted mean ensemble: both flags are passed explicitly as False
ensemble_pwm = EnsembleWeightedMean(optimization=False, with_features=False)

# NOTE(review): this file name starts with 'ensemble-dp-' while the other cells
# of this notebook use 'ensemble-pd-' -- confirm that a different data set is intended.
data = 'ensemble-dp-hek293t-pe2.csv'

# only fit() is called in this cell; the ensemble is not tested here
ensemble_pwm.fit(data)

Training ridge
Training xgb
Training rf


## Bagging



In [None]:
from models.ensemble_bagging import EnsembleBagging

data = 'ensemble-pd-hek293t-pe2.csv'

# numbers of bagging rounds to compare
n_rounds = [1, 2, 3, 5, 10, 15]

# results of all runs, merged into one mapping per correlation metric
performances_pearson, performances_spearman = {}, {}
for round_count in n_rounds:
    print(f'Bagging with {round_count} rounds')
    ensemble_bagging = EnsembleBagging(n_rounds=round_count)
    ensemble_bagging.fit(data)
    performance_pearson, performance_spearman = ensemble_bagging.test(data)

    bag_key = f'bag-{round_count}'
    for scores, merged in (
        (performance_pearson, performances_pearson),
        (performance_spearman, performances_spearman),
    ):
        # Store the 'bag' entry under a key that carries the round count, so the
        # runs do not overwrite each other when merged. With a plain dict the
        # pop-then-assign also moves the entry to the end of the mapping.
        scores[bag_key] = scores.pop('bag')
        # keys shared between runs are overwritten by the latest run
        merged.update(scores)

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind

# Bar plot per model with the individual values overlaid as a strip plot,
# followed by t-tests of every bagging entry against every other model.
import matplotlib.pyplot as plt
import seaborn as sns

alpha = 0.5
f_size = 12

metric_tables = {'Pearson': performances_pearson, 'Spearman': performances_spearman}
for name, performance in metric_tables.items():
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # one palette colour per bagging entry, gray for every other model
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(n_rounds)))
    colours = [next(colour_palette) if 'bag' in model else 'gray' for model in performance]

    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)
    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for tick_kind in ('major', 'minor'):
        ax.tick_params(axis='both', which=tick_kind, labelsize=f_size)
    # remove top and right spines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # dashed horizontal line at the mean of each bagging entry, in its own colour
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'bag' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)

    # rotate the x tick labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
    plt.show()

    # save the figure
    figure_path = pjoin('dissertation', 'figures', f'ensemble_bagging_{name.lower()}_round.png')
    fig.savefig(figure_path, bbox_inches='tight', dpi=300)

    # Independent two-sample t-test (ttest_ind) between each bagging entry
    # and each of the remaining models.
    # NOTE(review): ttest_ind is not a paired test; use ttest_rel if the
    # values are matched per fold -- confirm.
    bagging_values = {model: scores for model, scores in performance.items() if 'bag' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'bag' not in model}

    for ensemble, performance_ensemble in bagging_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')

In [None]:
from models.ensemble_bagging import EnsembleBagging

data = 'ensemble-pd-hek293t-pe2.csv'

# values passed as `sample_percentage`, one bagging run per value
percentages = [0.3, 0.5, 0.7, 0.9]

# results of all runs, merged into one mapping per correlation metric
performances_pearson, performances_spearman = {}, {}
for sample_percentage in percentages:
    # the number of rounds is fixed at 3; only the sample percentage varies
    ensemble_bagging = EnsembleBagging(n_rounds=3, sample_percentage=sample_percentage)
    ensemble_bagging.fit(data)
    performance_pearson, performance_spearman = ensemble_bagging.test(data)

    bag_key = f'bag-{sample_percentage}'
    for scores, merged in (
        (performance_pearson, performances_pearson),
        (performance_spearman, performances_spearman),
    ):
        # Store the 'bag' entry under a key that carries the percentage, so the
        # runs do not overwrite each other when merged. With a plain dict the
        # pop-then-assign also moves the entry to the end of the mapping.
        scores[bag_key] = scores.pop('bag')
        # keys shared between runs are overwritten by the latest run
        merged.update(scores)

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind

# Bar plot per model with the individual values overlaid as a strip plot,
# followed by t-tests of every bagging entry against every other model.
import matplotlib.pyplot as plt
import seaborn as sns

alpha = 0.5
f_size = 12

metric_tables = {'Pearson': performances_pearson, 'Spearman': performances_spearman}
for name, performance in metric_tables.items():
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # one palette colour per bagging entry, gray for every other model
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(percentages)))
    colours = [next(colour_palette) if 'bag' in model else 'gray' for model in performance]

    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)
    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for tick_kind in ('major', 'minor'):
        ax.tick_params(axis='both', which=tick_kind, labelsize=f_size)
    # remove top and right spines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # dashed horizontal line at the mean of each bagging entry, in its own colour
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'bag' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)

    # rotate the x tick labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
    plt.show()

    # save the figure
    figure_path = pjoin('dissertation', 'figures', f'ensemble_{name.lower()}_bagging_percentage.pdf')
    fig.savefig(figure_path, bbox_inches='tight', dpi=300)

    # Independent two-sample t-test (ttest_ind) between each bagging entry
    # and each of the remaining models.
    # NOTE(review): ttest_ind is not a paired test; use ttest_rel if the
    # values are matched per fold -- confirm.
    bagging_values = {model: scores for model, scores in performance.items() if 'bag' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'bag' not in model}

    for ensemble, performance_ensemble in bagging_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')

## AdaBoost Ensemble

In [1]:
# tune adaboost models
from models.ensemble_adaboost import EnsembleAdaBoost
# Silence every Python warning from here on. The filter is process-wide,
# so it also hides warnings raised by later cells of the same session.
import warnings
warnings.filterwarnings('ignore')

# file name handed to tune()
data = 'ensemble-pd-hek293t-pe2.csv'

# The value returned by tune() is kept in `params`; the following cell turns it
# into a DataFrame and writes it to a CSV file.
# NOTE(review): the recorded output suggests tune() sweeps power, threshold and
# number of rounds and reports Pearson/Spearman for each combination -- confirm
# against models.ensemble_adaboost.
ensemble_adaboost = EnsembleAdaBoost()
params = ensemble_adaboost.tune(data)

Power: 1, Threshold: 0.05, Rounds: 1, Pearson: 0.8362684472728854, Spearman: 0.8618529173303514
Power: 1, Threshold: 0.1, Rounds: 1, Pearson: 0.8360670814416642, Spearman: 0.8614859544750895
Power: 1, Threshold: 0.2, Rounds: 1, Pearson: 0.8347947021759317, Spearman: 0.8610169140900504
Power: 1, Threshold: 0.3, Rounds: 1, Pearson: 0.8369271022734002, Spearman: 0.862071598752438
Power: 1, Threshold: 0.5, Rounds: 1, Pearson: 0.8393064682045722, Spearman: 0.8634157160382048
Power: 1, Threshold: 0.7, Rounds: 1, Pearson: 0.8371736899899517, Spearman: 0.8624066932620561
Power: 1, Threshold: 0.05, Rounds: 3, Pearson: 0.8296377485949542, Spearman: 0.8567639926328968
Power: 1, Threshold: 0.1, Rounds: 3, Pearson: 0.835181385758919, Spearman: 0.8602522584804607
Power: 1, Threshold: 0.2, Rounds: 3, Pearson: 0.8355581018442533, Spearman: 0.8600356652521848
Power: 1, Threshold: 0.3, Rounds: 3, Pearson: 0.8373111795262218, Spearman: 0.863030095370604
Power: 1, Threshold: 0.5, Rounds: 3, Pearson: 0.829

In [2]:
import pandas as pd
from os.path import join as pjoin

# Turn the tuning result into a DataFrame; the name `params` is rebound to it.
params = pd.DataFrame(params)

# Write the table to a CSV file, leaving out the index column.
params_csv_path = pjoin('models', 'data', 'performance', 'ensemble_adaboost_params.csv')
params.to_csv(params_csv_path, index=False)

In [1]:
from models.ensemble_adaboost import EnsembleAdaBoost

data = 'ensemble-pd-hek293t-pe2.csv'

# results of all runs, merged into one mapping per correlation metric
ensemble_adaboost_performances_pearson = {}
ensemble_adaboost_performances_spearman = {}

# numbers of AdaBoost rounds to compare
rounds = [1, 2, 3, 5, 10]

# The loop variable is deliberately not called `round`: at notebook top level
# that name would replace the builtin round() for every later cell.
for n_round in rounds:
    ensemble_adaboost = EnsembleAdaBoost(n_rounds=n_round)
    ensemble_adaboost.fit(data)
    ensemble_adaboost_performance_pearson, ensemble_adaboost_performance_spearman = ensemble_adaboost.test(data)

    # Rename the 'ada' key to include the round number, so the runs do not
    # overwrite each other when merged. With a plain dict the pop-then-assign
    # also moves the entry to the end of the mapping.
    ada_key = f'ada-{n_round}'
    ensemble_adaboost_performance_pearson[ada_key] = ensemble_adaboost_performance_pearson.pop('ada')
    ensemble_adaboost_performance_spearman[ada_key] = ensemble_adaboost_performance_spearman.pop('ada')

    # keys shared between runs are overwritten by the latest run
    ensemble_adaboost_performances_pearson.update(ensemble_adaboost_performance_pearson)
    ensemble_adaboost_performances_spearman.update(ensemble_adaboost_performance_spearman)

Round 1 xgb
[0.23073747 0.23073747 0.23073747 ... 0.23073747 1.73895825 0.23073747]
Round 1 mlp




[0.27196443 0.08620591 0.08620591 ... 0.08620591 2.04966614 0.08620591]
Round 1 ridge
[0.16396746 0.09010845 0.05197357 ... 0.09010845 2.14245452 0.05197357]
Round 1 rf
[0.02836673 0.01558896 0.00899154 ... 0.01558896 0.37064937 0.00899154]
Round 1 dp
99


  return F.conv1d(input, weight, bias, self.stride,


[0.02855051 0.01568995 0.00904979 ... 0.01568995 0.37305061 0.00904979]
Round 1 xgb
[0.23388482 0.23388482 0.23388482 ... 0.23388482 0.23388482 0.23388482]
Round 1 mlp


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



[0.09980612 0.09980612 0.09980612 ... 0.09980612 0.27184386 0.09980612]
Round 1 ridge
[0.05699213 0.10445325 0.05699213 ... 0.10445325 0.28450133 0.05699213]
Round 1 rf
[0.00972456 0.01782285 0.00972456 ... 0.01782285 0.34121809 0.00972456]
Round 1 dp
99


  return F.conv1d(input, weight, bias, self.stride,


[0.00242624 0.00444673 0.01014391 ... 0.00444673 0.35593247 0.00242624]
Round 1 xgb
[0.23279516 0.23279516 0.23279516 ... 0.23279516 1.72993943 0.23279516]
Round 1 mlp
[0.08392788 0.27506989 0.08392788 ... 0.08392788 0.62368201 0.08392788]
Round 1 ridge
[0.05071687 0.16622227 0.08757203 ... 0.08757203 0.65076228 0.05071687]
Round 1 rf
[0.00869156 0.02848619 0.01500758 ... 0.01500758 0.11152378 0.00869156]
Round 1 dp
99


  return F.conv1d(input, weight, bias, self.stride,


[0.00437543 0.01434028 0.0153813  ... 0.0153813  0.11430096 0.00437543]
Round 1 xgb
[0.23708532 0.23708532 0.23708532 ... 0.23708532 0.23708532 0.23708532]
Round 1 mlp
[0.08409219 0.08409219 0.08409219 ... 0.08409219 0.2775151  0.2775151 ]
Round 1 ridge
[0.05157333 0.0875814  0.0875814  ... 0.0875814  0.28902994 0.28902994]
Round 1 rf
[0.00982972 0.01669275 0.01669275 ... 0.01669275 0.05508823 0.34174219]
Round 1 dp
99
[0.00290597 0.00493489 0.00493489 ... 0.00493489 0.05705448 0.35393992]
Round 1 xgb
[0.23216122 0.23216122 0.23216122 ... 0.23216122 0.23216122 0.23216122]
Round 1 mlp




[0.0800001  0.27420349 0.0800001  ... 0.0800001  0.27420349 0.0800001 ]
Round 1 ridge
[0.04711346 0.16148323 0.08326874 ... 0.08326874 0.28540686 0.04711346]
Round 1 rf
[0.00805859 0.02762113 0.01424282 ... 0.01424282 0.33553022 0.00805859]
Round 1 dp
99


  return F.conv1d(input, weight, bias, self.stride,


[0.00186763 0.00640139 0.00330087 ... 0.00330087 0.35263381 0.00186763]


FileNotFoundError: [Errno 2] No such file or directory: 'models/trained-models/ensemble/weighted-mean/dp-pd-hek293t-pe2-fold-1.pt'

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind
# Bar plot per model with the individual values overlaid as a strip plot,
# followed by t-tests of every AdaBoost entry against every other model.
import matplotlib.pyplot as plt
import seaborn as sns

alpha = 0.5
f_size = 12

metric_tables = {
    'Pearson': ensemble_adaboost_performances_pearson,
    'Spearman': ensemble_adaboost_performances_spearman,
}
for name, performance in metric_tables.items():
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # one palette colour per AdaBoost entry, gray for every other model
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(rounds)))
    colours = [next(colour_palette) if 'ada' in model else 'gray' for model in performance]

    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)
    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for tick_kind in ('major', 'minor'):
        ax.tick_params(axis='both', which=tick_kind, labelsize=f_size)
    # remove top and right spines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # dashed horizontal line at the mean of each AdaBoost entry, in its own colour
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'ada' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)
    plt.show()

    # Independent two-sample t-test (ttest_ind) between each AdaBoost entry
    # and each of the remaining models.
    # NOTE(review): ttest_ind is not a paired test; use ttest_rel if the
    # values are matched per fold -- confirm.
    adaboost_values = {model: scores for model, scores in performance.items() if 'ada' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'ada' not in model}

    for ensemble, performance_ensemble in adaboost_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')