# Unencoded baseline plots, devel version 0.4

## Setup

In [None]:
# Run-wide labels reused in figure titles and output file names.
rslts_dir = "results_unenc"       # directory the figures are saved into
v = "v0.4"                        # version tag appended to file names
encdg_stts_nam = "unencoded"      # encoding status, file-name form
encdg_stts_ttl = ", unencoded"    # encoding status, title-suffix form

In [None]:
import pandas as pd
import numpy as np

import umap
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

from scipy.spatial.distance import euclidean
from matplotlib.colors import ListedColormap
from joypy import joyplot

In [None]:
# Greyscale colour table; the first TSV column is used as the index.
grey_colors = pd.read_csv(
    "plot_color_files/grey_scale.tsv",
    sep="\t",
    index_col=0,
)

In [None]:
# Full cancer-type name -> cohort abbreviation used to look up plot colours.
cancer_type_abbreviation_mapping = {
    # combined-in chromophobe
    "Clear cell renal cell carcinoma": "KIRCKICH",
    # combined-in rectal adeno
    "Colon adenocarcinoma": "COADREAD",
    "Pancreatic ductal adenocarcinoma": "PAAD",
    "Breast carcinoma": "BRCA",
    # combined-in low grade glioma
    "Glioblastoma": "LGGGBM",
    "Lung squamous cell carcinoma": "LUSC",
    "Lung adenocarcinoma": "LUAD",
    "Endometrial carcinoma": "UCEC",
    "Head and neck squamous cell carcinoma": "HNSC",
    "Ovarian carcinoma": "OV",
}

In [None]:
# Both values are kept as strings; they are handed to the plot functions below.
latent_dim, epochs = "NaN", "0"

In [None]:
# Echo the two settings as the cell output.
(latent_dim, epochs)

In [None]:
# Model-system label: file-name form first, then the figure-title form.
mdls = "cptac_+_cell_line"
mdls_ttl = "Cell line + CPTAC"

In [None]:
# Name of the input table; it is joined onto 'data/' when the table is read.
file = "cl_cp_prot_850.tsv"

In [None]:
# Data-type label: title form and file-name form.
dta_ttl = 'proteomics'
dta_typ = 'prot'
# Build the path from `file` (set in an earlier cell) instead of repeating the
# file name, so this load and the later 'data/'+file reload always read the
# same table.
dta_typ_obj = pd.read_csv('data/' + file,
                          sep='\t', index_col=0)
# Echo (rows, columns) as the cell output.
dta_typ_obj.shape

## UMAP #1

In [None]:
# Preview the first row; sample IDs are the index.
dta_typ_obj.head(n=1)

In [None]:
# Standardize every column from position 2 onward (the first two columns are
# not passed to the scaler).
scaled_data = StandardScaler().fit_transform(dta_typ_obj.iloc[:, 2:])

# Two-component UMAP of the scaled matrix.
reducer = umap.UMAP(n_components=2)
embedding = reducer.fit_transform(scaled_data)

# Re-attach the sample index, then the two label columns, and name the result.
emb_df = pd.DataFrame(embedding, index=dta_typ_obj.index)
emb_lbld = pd.concat(
    [emb_df, dta_typ_obj[['cancer_type', 'model_type']]],
    axis=1,
)
emb_lbld.columns = ['UMAP_1', 'UMAP_2', 'cancer_type', 'model_type']

In [None]:
# Echo the title and file-name variables before plotting.
(mdls_ttl, dta_ttl, dta_typ, mdls)

In [None]:
def umap_plot_to_disk(emb_lbld, mdls_ttl, dta_ttl, dta_typ, mdls):
    """Draw the labelled UMAP scatter plot and save it as a PNG.

    Parameters
    ----------
    emb_lbld : pandas.DataFrame
        Exactly four columns, in this order: first UMAP coordinate, second
        UMAP coordinate, cancer type, model type. The cancer-type column must
        be named 'cancer_type'. The frame is not modified.
    mdls_ttl, dta_ttl : str
        Joined with ', ' to form the figure suptitle.
    dta_typ, mdls : str
        Components of the output file name.

    Returns
    -------
    str
        The fixed message 'UMAP written to disk'.

    Also reads the module-level names cancer_type_abbreviation_mapping,
    encdg_stts_ttl, encdg_stts_nam, rslts_dir and v.
    """
    tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep='\t', index_col=0)
    unique_cancer_types = emb_lbld['cancer_type'].unique()
    # Cancer types missing from the mapping are looked up under 'Unknown'.
    custom_palette = {
        cancer_type: tcga_colors.loc[
            cancer_type_abbreviation_mapping.get(cancer_type, 'Unknown'),
            'cohort_color']
        for cancer_type in unique_cancer_types}

    # Relabel a copy for display. Renaming the caller's frame in place made a
    # second call fail on emb_lbld['cancer_type'] until the embedding was rebuilt.
    plot_df = emb_lbld.copy()
    plot_df.columns = ['UMAP_1', 'UMAP_2', 'Cancer type', 'Model type']

    plt.figure(figsize=(5, 5))
    marker_dict = {'Tumor': '^', 'cell line': 'o'}
    sns.scatterplot(data=plot_df, x='UMAP_1', y='UMAP_2',
                    hue='Cancer type', style='Model type', markers=marker_dict,
                    palette=custom_palette, legend='full',
                    s=200)
    # The x axis shows the first component (it was mislabelled 'UMAP_2').
    plt.xlabel('UMAP_1', fontsize=16)
    plt.ylabel('UMAP_2', fontsize=16)
    plt.suptitle(mdls_ttl + ', ' + dta_ttl, y=1.002, fontsize=20)
    plt.title('n = ' + str(len(plot_df)) + encdg_stts_ttl, fontsize=18)

    # Single legend call: an earlier plt.legend(...) here was replaced by this
    # one and had no effect on the saved figure.
    legend = plt.legend(title='Sample attributes', title_fontsize='14', loc='upper left',
                        bbox_to_anchor=(1, 1), fontsize=12)
    # Embolden the legend entries whose text is a section header.
    headers_to_bold = ['Cancer type', 'Model type']
    for text in legend.texts:
        if text.get_text() in headers_to_bold:
            text.set_weight('bold')

    # NOTE(review): this changes a process-wide matplotlib setting after the
    # text above has been created; confirm which figures it is meant to affect.
    plt.rcParams['text.usetex'] = True
    plt.savefig(rslts_dir + '/umap_' + dta_typ + '_' + mdls + '_' + encdg_stts_nam + '_' + v + '.png',
                bbox_inches='tight', dpi=300)
    return 'UMAP written to disk'


umap_plot_to_disk(emb_lbld, mdls_ttl, dta_ttl, dta_typ, mdls)

## LogReg function

In [None]:
# Mode is the target column: model type or cancer type
def log_reg(dta_typ_obj, mode, n_repeats=15):
    """Repeat a random 80/20 split + logistic regression and collect per-class F1.

    Parameters
    ----------
    dta_typ_obj : pandas.DataFrame
        Columns from position 2 onward are used as features; `mode` names the
        target column.
    mode : str
        Name of the target column (e.g. 'model_type' or 'cancer_type').
    n_repeats : int, default 15
        Number of independent split/fit/score rounds.

    Returns
    -------
    pandas.DataFrame
        Columns 'Label' and 'F1_Score', one row per class present in each
        round's test set, rounds stacked on top of each other.
    """
    col_X_strt = 2 # <-- Skip label columns
    f1_frames = []

    # Logistic regression repeat loop
    for _ in range(n_repeats): # <-- Error control

        # Train test split: 80% sampled for training, the remaining index
        # labels form the test set.
        trn = dta_typ_obj.sample(round(len(dta_typ_obj) * .8))
        tst = dta_typ_obj.loc[~dta_typ_obj.index.isin(trn.index)]

        X_trn = trn.iloc[:, col_X_strt:]
        X_tst = tst.iloc[:, col_X_strt:]

        # Prediction targets, y is either model system or cancer type
        y_trn = trn[mode]
        y_tst = tst[mode]

        clf = LogisticRegression().fit(X_trn, y_trn)
        y_pred = clf.predict(X_tst)

        # Pass the label list explicitly: with average=None the scores come
        # back in the order of `labels`, so each score is paired with its own
        # class. Pairing the default (sorted) output with y_tst.unique()
        # (order of appearance) could attach scores to the wrong class.
        labels = sorted(y_tst.unique())
        f1_by_class = f1_score(y_tst, y_pred, labels=labels, average=None)
        f1_frames.append(pd.DataFrame({'Label': labels,
                                       'F1_Score': f1_by_class}))

    if not f1_frames:
        # pd.concat rejects an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['Label', 'F1_Score'])
    return pd.concat(f1_frames, axis=0)

## LogReg model-type plot - greyscale #2

In [None]:
mode_ttl = 'model type'
mode = 'model_type'
f1_stor_frm = log_reg(dta_typ_obj, mode)
# Samples per model type, used for the "n = ..." tick labels.
sample_counts = dict(dta_typ_obj.model_type.value_counts())


def logreg_model_plot(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode):
    """Plot per-class F1 scores for the model-type classifier and save a PNG.

    Parameters
    ----------
    f1_stor_frm : pandas.DataFrame
        Columns 'Label' and 'F1_Score'.
    dta_typ, mode : str
        Components of the output file name.
    mdls, latent_dim, epochs
        Accepted but not used by this function.

    Also reads the module-level names sample_counts, mode_ttl, dta_ttl,
    mdls_ttl, encdg_stts_ttl, encdg_stts_nam, rslts_dir and v.
    """
    plt.figure(figsize=(8, 4.5))
    # One call: sns.set(font_scale=...) on its own re-applies the default
    # style and would discard a preceding set_style("whitegrid").
    sns.set(style="whitegrid", font_scale=1.5)

    # Fix the bar order to the sample_counts order, so the "n = ..." tick
    # labels built below land under the bars they describe.
    label_order = list(sample_counts)

    sns.barplot(x='Label', y='F1_Score', data=f1_stor_frm, order=label_order,
                palette=['#666666', '#999999'], errorbar=None)
    sns.swarmplot(x='Label', y='F1_Score', data=f1_stor_frm, order=label_order,
                  color='#333333', size=14)

    plt.suptitle('Logistic regression, '+mode_ttl+', '+dta_ttl,
                 fontsize=24, y = 1.03)
    plt.title(mdls_ttl+encdg_stts_ttl, fontsize=20)
    plt.xlabel('Model Type', fontsize=20)
    plt.ylabel('F1 Score', fontsize=20)

    # Sample counts is global var
    new_labels = [f"{label}, n = {sample_counts[label]}" for label in label_order]
    plt.xticks(ticks=range(len(new_labels)), labels=new_labels, fontsize=20)

    plt.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
    plt.rcParams['text.usetex'] = True
    plt.savefig(rslts_dir+'/log_reg_'+mode+'_'+dta_typ+'_'+encdg_stts_nam+'_'+v+'.png',
                bbox_inches='tight')


logreg_model_plot(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode)

## LogReg cancer-type plot - TCGA colors #3

In [None]:
mode_ttl = 'cancer type'
mode = 'cancer_type'
f1_stor_frm = log_reg(dta_typ_obj, mode)

# Cohort abbreviation -> colour lookup built from the TCGA colour table.
tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep='\t')
tcga_color_mapping = dict(
    zip(tcga_colors['tcga_cohorts'], tcga_colors['cohort_color']))

# Bar colour per class label; labels without a colour are left out.
unique_labels = f1_stor_frm['Label'].unique()
palette_dict = {}
for label in unique_labels:
    tcga_abbreviation = cancer_type_abbreviation_mapping.get(label)
    color = tcga_color_mapping.get(tcga_abbreviation)
    if not color:
        continue
    palette_dict[label] = color


def lgrg_plt_fnc(f1_stor_frm, mdls, data_name, latent_dim, epochs, mode):
    """Plot per-class F1 scores for the cancer-type classifier and save a PNG.

    Uses `f1_stor_frm` (columns 'Label', 'F1_Score') and `mode`; the other
    parameters are accepted but not used. Also reads the module-level names
    palette_dict, mode_ttl, dta_ttl, mdls_ttl, dta_typ, rslts_dir and v.
    """
    plt.figure(figsize=(8, 4))
    sns.set_style("whitegrid")

    # Bars show the mean score per label; the swarm overlays each round.
    sns.barplot(data=f1_stor_frm, x='Label', y='F1_Score',
                palette=palette_dict, errorbar=None)
    sns.swarmplot(data=f1_stor_frm, x='Label', y='F1_Score',
                  color='#333333', size=7)

    heading = 'Logistic regression, '+mode_ttl+', '+dta_ttl
    plt.suptitle(heading, fontsize=24, y=1.04)
    plt.title(mdls_ttl, fontsize=20)
    plt.xlabel('Cancer type', fontsize=20)
    plt.ylabel('F1 Score', fontsize=20)

    plt.xticks(rotation=45, ha='right', fontsize=16)
    plt.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
    plt.rcParams['text.usetex'] = True

    out_path = rslts_dir+'/log_reg_'+mode+'_'+dta_typ+'_'+v+'.png'
    plt.savefig(out_path, bbox_inches='tight')


lgrg_plt_fnc(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode)

## Euclidean setup, #4 & 5

In [None]:
# Euclidean distance, model type
def mdl_typ_dist(sample, features, df):
    """Distance from `sample` to the mean of rows with a different model type.

    `sample` is one row, `features` the feature column labels, and `df` the
    full table with a 'model_type' column. Returns the Euclidean distance
    between the sample's feature values and the per-feature mean of all rows
    whose 'model_type' differs from the sample's.
    """
    differs = df['model_type'].ne(sample['model_type'])
    centroid = df.loc[differs, features].mean()
    return euclidean(sample[features], centroid)

In [None]:
# Euclidean distance, cancer type
def cncr_typ_dist(sample, features, df):
    """Distance from `sample` to the mean of rows with a different cancer type.

    `sample` is one row, `features` the feature column labels, and `df` the
    full table with a 'cancer_type' column. Returns the Euclidean distance
    between the sample's feature values and the per-feature mean of all rows
    whose 'cancer_type' differs from the sample's.
    """
    differs = df['cancer_type'].ne(sample['cancer_type'])
    centroid = df.loc[differs, features].mean()
    return euclidean(sample[features], centroid)

In [None]:
# Reload the table so the feature columns are taken from a fresh frame.
dta_typ_obj = pd.read_csv(f'data/{file}', sep='\t', index_col=0)
feature_columns = dta_typ_obj.columns[2:]

# Per-sample distance to the mean of the other model types, then of the
# other cancer types. feature_columns was fixed above, so the newly added
# distance columns are never treated as features.
dta_typ_obj['mdl_typ_dstncs'] = dta_typ_obj.apply(
    mdl_typ_dist, axis=1, args=(feature_columns, dta_typ_obj))
dta_typ_obj['cncr_typ_dstncs'] = dta_typ_obj.apply(
    cncr_typ_dist, axis=1, args=(feature_columns, dta_typ_obj))

# Column order: labels, distances, then features.
new_cols = ['cancer_type', 'model_type', 'cncr_typ_dstncs', 'mdl_typ_dstncs']
new_cols = new_cols + list(feature_columns)
dta_typ_obj = dta_typ_obj[new_cols]

## Euclidean distance ridgeline, cancer type - TCGA colors, #4

In [None]:
mode_ttl = 'cancer type'
mode = 'cancer_type'
dstnc_typ = 'cncr_typ_dstncs'

# Mean distance per cancer type, largest first; this fixes the ridge order.
average_distances = dta_typ_obj.groupby(
    mode)[dstnc_typ].mean().sort_values(ascending=False)

# Take an explicit copy: the next statement assigns a column, and doing that
# on a .loc slice of dta_typ_obj triggers pandas' SettingWithCopy warning.
sorted_df = dta_typ_obj.loc[dta_typ_obj[mode].isin(average_distances.index)].copy()
# Ordered categorical so sorting follows the mean-distance ranking.
sorted_df[mode] = pd.Categorical(
    sorted_df[mode], categories=average_distances.index, ordered=True)
sorted_df = sorted_df.sort_values(mode)

# One colour per cancer type, in the same order as the ridges.
tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep = '\t', index_col = 0)
custom_color_list = [
    tcga_colors.loc[cancer_type_abbreviation_mapping[cancer_type], 'cohort_color']
    for cancer_type in average_distances.index]
custom_colormap = ListedColormap(custom_color_list)

In [None]:
# One annotation height per ridge. The count was hard-coded to 7, so zip()
# silently dropped the sample counts of any further cancer types.
ridge_groups = sorted_df[mode].unique()
y_values = np.linspace(0.75, 0.068, len(ridge_groups))
plt.figure()
joyplot(data=sorted_df[[mode, dstnc_typ]], by=mode,
    figsize=(10, 6.5), colormap=custom_colormap,
    fade=True)

# Loop-invariant lookups, computed once: samples per group and the x position
# (largest distance) where the counts are written.
group_counts = dict(sorted_df[mode].value_counts())
x_position = sorted_df[dstnc_typ].max()
for y_value, cancer_type in zip(y_values, ridge_groups):
    plt.annotate(f"n={group_counts[cancer_type]}", xy=(x_position, y_value),
                 verticalalignment='center')

plt.suptitle('Euclidean Distances, '+mode_ttl+', '+dta_ttl,
             fontsize=30, y = 1.01)
plt.title(mdls_ttl+ ', n = '+str(dta_typ_obj.shape[0])+encdg_stts_ttl,
          y = .92, x = .22, fontsize = 26)

plt.rcParams['text.usetex'] = True

# Variance of the per-group mean distances, with the number in bold.
plt.annotate(
    r'Variance of means: $\mathbf{' + f'{average_distances.var():.3f}' + '}$',
    xy=(0.01, 0.87), xycoords='axes fraction',
    ha='right', va='top')

plt.savefig(rslts_dir+'/euc-rdgln_'+mode+'_'+mdls+'_'+dta_typ+'_'+v+'.png',
            bbox_inches = 'tight', dpi = 300)

## Euclidean distance ridgeline, model type - greyscale, #5

### Build grey Euc ridge obj

In [None]:
mode_ttl = 'model type'
mode = 'model_type'
dstnc_typ = 'mdl_typ_dstncs'

# Model-type label -> row label in the grey colour table.
abbreviation_mapping = {
    'cell line': 'cell line',
    'Tumor': 'Tumor',
    'HCMI': 'HCMI', # devel
}

# Mean distance per model type, largest first; this fixes the ridge order.
average_distances = dta_typ_obj.groupby(
    mode)[dstnc_typ].mean().sort_values(ascending=False)

# Take an explicit copy: the next statement assigns a column, and doing that
# on a .loc slice of dta_typ_obj triggers pandas' SettingWithCopy warning.
sorted_df = dta_typ_obj.loc[dta_typ_obj[mode].isin(average_distances.index)].copy()
# Ordered categorical so sorting follows the mean-distance ranking.
sorted_df[mode] = pd.Categorical(
    sorted_df[mode], categories=average_distances.index, ordered=True)
sorted_df = sorted_df.sort_values(mode)

# One grey per model type, in the same order as the ridges.
custom_color_list = [
    grey_colors.loc[abbreviation_mapping[model_type], 'quant_mode_color']
    for model_type in average_distances.index]
custom_colormap = ListedColormap(custom_color_list)

### Euc ridge grey plot

In [None]:
# One annotation height per ridge. The count was hard-coded to 2, so zip()
# silently dropped the sample count of any further model type.
ridge_groups = sorted_df[mode].unique()
y_values = np.linspace(0.52, 0.15, len(ridge_groups))
plt.figure()
joyplot(data=sorted_df[[mode, dstnc_typ]], by=mode,
    figsize=(10, 6.5), colormap=custom_colormap,
    fade=True)

# Loop-invariant lookups, computed once: samples per group and the x position
# (largest distance) where the counts are written.
group_counts = dict(sorted_df[mode].value_counts())
x_position = sorted_df[dstnc_typ].max()
for y_value, model_type in zip(y_values, ridge_groups):
    plt.annotate(f"n={group_counts[model_type]}", xy=(x_position, y_value),
                 verticalalignment='center', fontsize = 24)

plt.suptitle('Euclidean Distances, '+mode_ttl+', '+dta_ttl,
             fontsize=30, y = 1.06)
plt.title(mdls_ttl+ ', n = '+str(dta_typ_obj.shape[0])+encdg_stts_ttl,
          y = .97, x = .4, fontsize = 26)

plt.rcParams['text.usetex'] = True

# Variance of the per-group mean distances, with the number in bold.
plt.annotate(
    r'Variance of means: $\mathbf{' + f'{average_distances.var():.3f}' + '}$',
    xy=(0.2, 0.87), xycoords='axes fraction',
    ha='right', va='top')

plt.savefig(rslts_dir+'/euc-rdgln_'+mode+'_'+mdls+'_'+dta_typ+'_'+v+'.png',
            bbox_inches = 'tight', dpi = 300)