# Ensemble Learning

## Weighted Mean Ensemble

In [None]:
from models.ensemble_weighted_mean import EnsembleWeightedMean

# Dataset file name handed to both fit() and test() of every variant below.
data = 'ensemble-pd-hek293t-pe2.csv'

# Three configurations of the weighted-mean ensemble, differing only in the
# constructor flags shown here.
ensemble_direct_optimization = EnsembleWeightedMean(optimization=True)
ensemble_direct_optimization_with_features = EnsembleWeightedMean(
    optimization=True,
    with_features=True,
)
ensemble_weigthed_mean = EnsembleWeightedMean(optimization=False)


def _fit_then_test(ensemble, dataset):
    """Fit ``ensemble`` on ``dataset`` and return what its ``test`` call yields for the same dataset."""
    ensemble.fit(dataset)
    return ensemble.test(dataset)


# Each variant is fitted and tested in turn; the results are consumed by the
# plotting cell that follows.
direct_optimization_performance = _fit_then_test(ensemble_direct_optimization, data)
weighted_mean_performance = _fit_then_test(ensemble_weigthed_mean, data)
with_features_performance = _fit_then_test(ensemble_direct_optimization_with_features, data)

In [None]:
import numpy as np
import pandas as pd
from os.path import join as pjoin

# Each test() result unpacks into a (Pearson, Spearman) pair of dicts keyed by
# model name.
direct_op_pearson, direct_op_spearman = direct_optimization_performance
performance_weighted_pearson, performance_weighted_spearman = weighted_mean_performance
performance_with_features_pearson, performance_with_features_spearman = with_features_performance

# Merge the three runs in place. Keys shared between runs are overwritten by
# the later update, so every model name ends up with a single entry.
direct_op_pearson.update(performance_weighted_pearson)
direct_op_spearman.update(performance_weighted_spearman)

direct_op_pearson.update(performance_with_features_pearson)
direct_op_spearman.update(performance_with_features_spearman)

performance_pearson = direct_op_pearson
performance_spearman = direct_op_spearman

# plot the performance as bar plot
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

alpha = 0.5
f_size = 12

# For each correlation metric: draw the individual scores (strip plot) under a
# translucent bar plot, mark the mean of every ensemble variant with a dashed
# line, compare ensembles against the remaining models, and save the figure.
# NOTE(review): each dict value is presumably a collection of per-fold scores
# -- confirm against EnsembleWeightedMean.test.
for name, performance in zip(['Pearson', 'Spearman'], [performance_pearson, performance_spearman]):
    print(name)

    # Ensemble variants are recognised by 'opt' or 'pwm' in their key; all
    # other models are treated as baselines.
    ensemble_flags = [('opt' in model or 'pwm' in model) for model in performance]

    fig, ax = plt.subplots(figsize=(5, 2.5))
    # The y-axis is cropped so the bars are shortened and the differences
    # between models stand out.
    ax.set_ylim(0.65, 0.9)

    # Baselines are gray; each ensemble variant takes the next palette colour.
    colour_palette = iter(sns.color_palette('icefire', n_colors=3))
    colours = [next(colour_palette) if is_ensemble else 'gray' for is_ensemble in ensemble_flags]

    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)
    ax.set_xlabel('Ensemble', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    ax.tick_params(axis='both', which='major', labelsize=f_size)
    ax.tick_params(axis='both', which='minor', labelsize=f_size)
    # remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Dashed horizontal line at the mean score of each ensemble variant, in the
    # matching colour. axhline needs a scalar y, so the scores are reduced with
    # np.mean (as the bagging and AdaBoost cells do).
    for model, colour, is_ensemble in zip(performance, colours, ensemble_flags):
        if is_ensemble:
            ax.axhline(y=np.mean(performance[model]), color=colour, linestyle='--', linewidth=0.5)
    plt.show()

    # Independent two-sample t-test (scipy.stats.ttest_ind) between every
    # ensemble variant and every baseline model.
    # NOTE(review): if the scores come from identical folds, a paired test
    # (scipy.stats.ttest_rel) may be more appropriate -- confirm.
    ensemble_values = {
        model: scores
        for (model, scores), is_ensemble in zip(performance.items(), ensemble_flags)
        if is_ensemble
    }
    rest_values = {
        model: scores
        for (model, scores), is_ensemble in zip(performance.items(), ensemble_flags)
        if not is_ensemble
    }

    for ensemble, performance_ensemble in ensemble_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            if p_value < 0.05:
                print('Significant')
            else:
                print('Not significant')

    # save the figure
    fig.savefig(pjoin('dissertation', 'figures', f'ensemble_{name.lower()}.pdf'), bbox_inches='tight')

## Bagging



In [None]:
from models.ensemble_bagging import EnsembleBagging

# Dataset file name handed to both fit() and test().
data = 'ensemble-pd-hek293t-pe2.csv'

# Values of the n_rounds constructor argument to compare.
n_rounds = [1, 2, 3, 5, 10, 15]

# Results accumulated over all runs, keyed by model name.
performances_pearson, performances_spearman = {}, {}

for i in n_rounds:
    print(f'Bagging with {i} rounds')
    ensemble_bagging = EnsembleBagging(n_rounds=i)
    ensemble_bagging.fit(data)
    performance_pearson, performance_spearman = ensemble_bagging.test(data)

    # Move the generic 'bag' entry to a key that carries the round count so
    # each configuration keeps its own entry; every other key is overwritten
    # by the most recent run when merged into the accumulator.
    for collected, latest in (
        (performances_pearson, performance_pearson),
        (performances_spearman, performance_spearman),
    ):
        latest[f'bag-{i}'] = latest.pop('bag')
        collected.update(latest)

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind

# plot the performance as bar plot
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind

alpha = 0.5
f_size = 12

# For each correlation metric: plot all models, highlight the bagging
# configurations, save the figure, then compare every bagging configuration
# against every non-bagging model.
for name, performance in (('Pearson', performances_pearson), ('Spearman', performances_spearman)):
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # Non-bagging models are gray; each bagging entry takes the next palette colour.
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(n_rounds)))
    colours = []
    for model in performance:
        colours.append(next(colour_palette) if 'bag' in model else 'gray')

    # Individual scores as points, with a translucent bar plot drawn on top.
    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)

    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=f_size)

    # Hide the top and right spines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Dashed horizontal line at the mean score of each bagging configuration.
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'bag' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)

    # Rotate the x tick labels.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
    plt.show()

    # Save the figure.
    fig.savefig(pjoin('dissertation', 'figures', f'ensemble_bagging_{name.lower()}_round.png'), bbox_inches='tight', dpi=300)

    # Independent two-sample t-test (ttest_ind) between each bagging
    # configuration and each remaining model.
    bagging_values = {model: scores for model, scores in performance.items() if 'bag' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'bag' not in model}

    for ensemble, performance_ensemble in bagging_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')

In [None]:
from models.ensemble_bagging import EnsembleBagging

# Dataset file name handed to both fit() and test().
data = 'ensemble-pd-hek293t-pe2.csv'

# Values of the sample_percentage constructor argument to compare.
percentages = [0.3, 0.5, 0.7, 0.9]

# Results accumulated over all runs, keyed by model name.
performances_pearson, performances_spearman = {}, {}

for i in percentages:
    # The round count is fixed at 3; only the sample percentage varies.
    ensemble_bagging = EnsembleBagging(n_rounds=3, sample_percentage=i)
    ensemble_bagging.fit(data)
    performance_pearson, performance_spearman = ensemble_bagging.test(data)

    # Move the generic 'bag' entry to a key that carries the percentage so
    # each configuration keeps its own entry; every other key is overwritten
    # by the most recent run when merged into the accumulator.
    for collected, latest in (
        (performances_pearson, performance_pearson),
        (performances_spearman, performance_spearman),
    ):
        latest[f'bag-{i}'] = latest.pop('bag')
        collected.update(latest)

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind

# plot the performance as bar plot
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind

alpha = 0.5
f_size = 12

# For each correlation metric: plot all models, highlight the bagging
# configurations, save the figure, then compare every bagging configuration
# against every non-bagging model.
for name, performance in (('Pearson', performances_pearson), ('Spearman', performances_spearman)):
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # Non-bagging models are gray; each bagging entry takes the next palette colour.
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(percentages)))
    colours = []
    for model in performance:
        colours.append(next(colour_palette) if 'bag' in model else 'gray')

    # Individual scores as points, with a translucent bar plot drawn on top.
    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)

    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=f_size)

    # Hide the top and right spines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Dashed horizontal line at the mean score of each bagging configuration.
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'bag' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)

    # Rotate the x tick labels.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, horizontalalignment='right')
    plt.show()

    # Save the figure.
    fig.savefig(pjoin('dissertation', 'figures', f'ensemble_{name.lower()}_bagging_percentage.pdf'), bbox_inches='tight', dpi=300)

    # Independent two-sample t-test (ttest_ind) between each bagging
    # configuration and each remaining model.
    bagging_values = {model: scores for model, scores in performance.items() if 'bag' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'bag' not in model}

    for ensemble, performance_ensemble in bagging_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')

## AdaBoost Ensemble

In [None]:
from models.ensemble_adaboost import EnsembleAdaBoost

# Dataset file name handed to both fit() and test().
data = 'ensemble-pd-hek293t-pe2.csv'

# Results accumulated over all runs, keyed by model name.
ensemble_adaboost_performances_pearson = {}
ensemble_adaboost_performances_spearman = {}

# Values of the n_rounds constructor argument to compare.
rounds = [1, 2, 3, 5, 10, 15]

# The loop variable is deliberately not called `round`: that would rebind the
# builtin for the rest of the session.
for round_count in rounds:
    ensemble_adaboost = EnsembleAdaBoost(n_rounds=round_count)
    ensemble_adaboost.fit(data)
    ensemble_adaboost_performance_pearson, ensemble_adaboost_performance_spearman = ensemble_adaboost.test(data)
    # Rename the generic 'ada' key to include the round number so each
    # configuration keeps its own entry when merged below.
    ensemble_adaboost_performance_pearson[f'ada-{round_count}'] = ensemble_adaboost_performance_pearson.pop('ada')
    ensemble_adaboost_performance_spearman[f'ada-{round_count}'] = ensemble_adaboost_performance_spearman.pop('ada')

    # Keys shared between runs are overwritten by the most recent run.
    ensemble_adaboost_performances_pearson.update(ensemble_adaboost_performance_pearson)
    ensemble_adaboost_performances_spearman.update(ensemble_adaboost_performance_spearman)

In [None]:
import  numpy as np
import pandas as pd
from os.path import join as pjoin
from scipy.stats import ttest_ind
# plot the performance as bar plot
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import ttest_ind

alpha = 0.5
f_size = 12

# For each correlation metric: plot all models, highlight the AdaBoost
# configurations, then compare every AdaBoost configuration against every
# other model.
metric_tables = (
    ('Pearson', ensemble_adaboost_performances_pearson),
    ('Spearman', ensemble_adaboost_performances_spearman),
)
for name, performance in metric_tables:
    fig, ax = plt.subplots(figsize=(5, 2.5))
    ax.set_ylim(0.65, 0.9)

    # Non-AdaBoost models are gray; each AdaBoost entry takes the next palette colour.
    colour_palette = iter(sns.color_palette('icefire', n_colors=len(rounds)))
    colours = []
    for model in performance:
        colours.append(next(colour_palette) if 'ada' in model else 'gray')

    # Individual scores as points, with a translucent bar plot drawn on top.
    sns.stripplot(data=performance, ax=ax, alpha=1, jitter=0.1, size=3, palette=colours)
    sns.barplot(data=performance, ax=ax, alpha=alpha, palette=colours, errorbar=None)

    ax.set_xlabel('Model', fontsize=f_size)
    ax.set_ylabel(f'{name} correlation', fontsize=f_size)
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=f_size)

    # Hide the top and right spines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Dashed horizontal line at the mean score of each AdaBoost configuration.
    for colour, (model, scores) in zip(colours, performance.items()):
        if 'ada' in model:
            ax.axhline(y=np.mean(scores), color=colour, linestyle='--', linewidth=0.5)
    plt.show()

    # Independent two-sample t-test (ttest_ind) between each AdaBoost
    # configuration and each remaining model.
    adaboost_values = {model: scores for model, scores in performance.items() if 'ada' in model}
    rest_values = {model: scores for model, scores in performance.items() if 'ada' not in model}

    for ensemble, performance_ensemble in adaboost_values.items():
        for rest, performance_rest in rest_values.items():
            t_stat, p_value = ttest_ind(performance_ensemble, performance_rest)
            print(f'{ensemble} vs {rest} t-statistic: {t_stat}, p-value: {p_value}')
            print('Significant' if p_value < 0.05 else 'Not significant')