## Setup

In [None]:
# encdd_v0.3.9.ipynb, corresponding to unenc 0.4
# From encdd_v0.2.ipynb

# template for encdd_v0.4, where there will be four loss terms
    # can do y-mapping with two cancer types
    # or fix the distance functions to take multiple model and cancer types
    # want multi-model and multi-cancer-type versions of both the cosine similarity and the MMD

In [None]:
# 2023-11-09 restart
# Submit abstract per current ops - 
    # add note on digital twin - 
# clear outputs and re-check code - 
# logs/v0.3.9_prot_20231102-080655
    # last output dir, find latest unenc figs
# PUT NON-CLEARED OUTPUT VERSION TO UNCONTROLLED PARENT DIR

# ended with loss plots on 11/02

In [None]:
%whos

In [None]:
v = 'v0.3.9' # results_encdd dir created

In [None]:
v

## log_dir devel record

In [None]:
from pathlib import Path
import datetime
log_dir = Path("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
log_dir

In [None]:
log_dir = Path("logs", v + "_prot_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
log_dir

In [None]:
# NOTE(review): `log_dta` is only assigned in a later cell ('_prot_'); in a fresh
# kernel that cell must be run first or this line raises NameError.
log_dir = Path("logs", v + log_dta + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
log_dir

In [None]:
# Plot-title text, file-name tag and log-dir tag for the proteomics data set.
# NOTE(review): `pd` is imported in a later cell ("Build VAE"); run the imports first.
dta_ttl = 'proteomics'
dta_typ = 'prot'
log_dta = '_prot_'
dta_typ_unenc = pd.read_csv('data/cl_cp_prot_850.tsv',
                   sep = '\t', index_col = 0)

In [None]:
# ^sent inline to script
    # three parts - imports, set path, set plot label / file naming vars

## Build VAE

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import datetime # Nested in Path func
from pathlib import Path # log dir

# from argparse import ArgumentParser
from scipy.spatial import distance  # for cosine similarity calc
import numpy as np
from sklearn.preprocessing import LabelEncoder

model_type_encoder = LabelEncoder()
cancer_type_encoder = LabelEncoder()

def calculate_cosine_similarity(df: pd.DataFrame, model_type_column='model_type') -> tuple:
    """Return per-sample cosine distances to the own-class and other-class centroids.

    Despite the name, the values come from ``scipy.spatial.distance.cosine`` and are
    therefore cosine *distances* (0 means same direction as the centroid).

    Args:
        df: One row per sample; numeric feature columns plus ``model_type_column``
            holding the labels 'cell line' or 'Tumor'.
        model_type_column: Name of the label column.

    Returns:
        ``(intra_cluster_tensor, inter_cluster_tensor)``: two float32 tensors of
        length ``len(df)``, ordered by row position. ``intra`` is the distance of
        each sample to the mean vector of its own model type, ``inter`` the distance
        to the mean vector of the other model type. Rows whose label is neither
        'cell line' nor 'Tumor' are left at 0.
    """
    labels = df[model_type_column]
    features = df.drop(columns=[model_type_column])
    # Only numeric columns take part in the mean / distance calculation
    features = features[features.select_dtypes(include=[np.number]).columns]

    # Mean (centroid) vector of each class
    centroids = {
        'cell line': features[labels == 'cell line'].mean(axis=0).values,
        'Tumor': features[labels == 'Tumor'].mean(axis=0).values,
    }
    other_class = {'cell line': 'Tumor', 'Tumor': 'cell line'}

    # Zero-filled buffers; unknown labels keep 0 instead of a leftover placeholder
    n_samples = len(df)
    intra_cluster = [0.0] * n_samples
    inter_cluster = [0.0] * n_samples

    for position, (label, sample_vector) in enumerate(zip(labels.values, features.values)):
        if label not in centroids:
            continue
        # "intra" is always measured against the sample's OWN class centroid and
        # "inter" against the other one, for cell-line and tumor rows alike.
        intra_cluster[position] = distance.cosine(sample_vector, centroids[label])
        inter_cluster[position] = distance.cosine(sample_vector, centroids[other_class[label]])

    # Convert lists to tensors
    intra_cluster_tensor = tf.convert_to_tensor(intra_cluster, dtype=np.float32)
    inter_cluster_tensor = tf.convert_to_tensor(inter_cluster, dtype=np.float32)

    return intra_cluster_tensor, inter_cluster_tensor


# Sampling layer (reparameterization step of the VAE)
class Sampling(layers.Layer):
    """Sample z = z_mean + exp(0.5 * z_log_var) * eps, with eps drawn from a standard normal."""

    def call(self, inputs):
        mean, log_var = inputs
        # Noise has the same (batch, latent) shape as the mean tensor
        n_rows = tf.shape(mean)[0]
        n_cols = tf.shape(mean)[1]
        noise = tf.random.normal(shape=(n_rows, n_cols))
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise


# Define the VAE class
class VAE(keras.Model):
    """VAE wrapper with a custom training step: reconstruction + KL + distance loss.

    `columns` are the column names of the training table (features plus the
    label-encoded 'model_type' and 'cancer_type' columns); they are used to turn
    each incoming batch tensor back into a labelled DataFrame.
    """

    def __init__(self, encoder, decoder, columns, **kwargs):
        super().__init__(**kwargs)
        self.encoder: keras.Model = encoder
        self.decoder: keras.Model = decoder
        # Running means of each loss component, reported by fit()
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
        self.distance_loss_tracker = keras.metrics.Mean(name="distance_loss")
        self.columns = columns

    @property
    def metrics(self):
        """Metric trackers exposed to Keras."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
            self.distance_loss_tracker,
        ] 

    def train_step(self, data):
        """Run one optimisation step on a batch and return the running loss means.

        The batch is converted with `.numpy()`, so this only works when the model
        is compiled with `run_eagerly=True`.
        """
        with tf.GradientTape() as tape:
            # Rebuild a labelled DataFrame from the raw batch tensor
            converted_data: pd.DataFrame = pd.DataFrame(data.numpy(), columns=self.columns)
            # Decode the integer-encoded model type back to its string label
            model_type = converted_data["model_type"]
            model_type = model_type.astype(int)
            model_type = model_type_encoder.inverse_transform(model_type)
            data = converted_data.drop(columns=["model_type"])
            assert "model_type" not in data.columns, "model_type should not be in data"

            # cancer_type is decoded and removed from the features; the decoded
            # labels are not used further in this method.
            cancer_type = converted_data["cancer_type"]
            cancer_type = cancer_type.astype(int)
            cancer_type = cancer_type_encoder.inverse_transform(cancer_type)
            data = data.drop(columns=["cancer_type"])
            assert "cancer_type" not in data.columns, "cancer_type should not be in data"

            data = tf.convert_to_tensor(data)

            z_mean, z_log_var, z = self.encoder(data)

            # NOTE(review): the embeddings leave TensorFlow here (`z.numpy()`), and the
            # distances come back as tensors built from Python lists, so the distance
            # term looks like a constant to the gradient tape and would contribute no
            # gradient - confirm whether that is intended.
            labeled_embeddings: pd.DataFrame = pd.DataFrame(z.numpy())
            labeled_embeddings["model_type"] = model_type

            intra_cluster_distance, inter_cluster_distance = calculate_cosine_similarity(
                df=labeled_embeddings,
                model_type_column='model_type')

            reconstruction = self.decoder(z)
            # Binary cross-entropy scaled by 0.1 * number of features
            reconstruction_loss = .1 * data.shape[1] * keras.losses.binary_crossentropy(data, reconstruction)
            # KL term, summed over the last axis
            kl_loss = - 0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
            # Only the inter-cluster value enters the loss (negated and scaled by 100);
            # intra_cluster_distance is currently unused.
            distance_loss = - 100 * inter_cluster_distance
            total_loss = reconstruction_loss + kl_loss + distance_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        self.distance_loss_tracker.update_state(distance_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
            "distance_loss": self.distance_loss_tracker.result(),
        }


# Encoder: dense layer + batch norm feeding the z_mean / z_log_var heads
def build_encoder(feature_dim, latent_dim) -> keras.Model:
    """Build the encoder mapping `feature_dim` inputs to [z_mean, z_log_var, z]."""
    inputs = keras.Input(shape=(feature_dim,), name="input_1")

    hidden = keras.layers.Dense(
        latent_dim,
        kernel_initializer='glorot_uniform',
        name="encoder_dense_1",
    )(inputs)
    hidden = keras.layers.BatchNormalization(name="batchnorm")(hidden)

    # Two heads share the hidden representation
    z_mean = layers.Dense(latent_dim, name="z_mean")(hidden)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(hidden)
    z = Sampling()([z_mean, z_log_var])

    return keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")


# Decoder: a single sigmoid dense layer back to feature space
def build_decoder(feature_dim, latent_dim) -> keras.Model:
    """Build the decoder mapping a `latent_dim` vector to `feature_dim` sigmoid outputs."""
    latent_inputs = keras.Input(shape=(latent_dim,))
    output_layer = keras.layers.Dense(
        feature_dim,
        kernel_initializer='glorot_uniform',
        activation='sigmoid',
    )
    return keras.Model(latent_inputs, output_layer(latent_inputs), name="decoder")

# Training hyperparameters
latent_dim = 50
learning_rate = 0.001

# Value in effect; the earlier assignments of 30 and 5 were immediately overwritten.
epochs = 10

batch_size = 128

## Run VAE

In [None]:
dta_ttl = 'proteomics'
dta_typ = 'prot'
log_dta = '_prot_'
dta_typ_unenc = pd.read_csv('data/cl_cp_prot_850.tsv',
                   sep = '\t', index_col = 0)

In [None]:
# Train the VAE on one data table and write logs, checkpoints, history and the
# trained encoder/decoder below a time-stamped directory in logs/.
log_dir = Path("logs", v + log_dta + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# if __name__ == '__main__':
# parser = ArgumentParser()
# parser.add_argument("-f", "--file", action="store", type=str, required=True)

# args = parser.parse_args()

# file: str = args.file
file = 'data/cl_cp_prot_850.tsv'

# d_typ_obj = pd.read_csv('data/cl_cp_cnvr_996.tsv', sep='\t', index_col=0)
df = pd.read_csv(file, sep='\t', index_col=0)

# v0 - scale data for baseline encoding
# assumes the first two columns are the label columns (cancer_type / model_type)
# and everything after them is a numeric feature - TODO confirm against the TSV
selected_df = df.iloc[:, 2:]

# Count features, to encoder build
# (taken before the label columns are appended below)
feature_count = selected_df.shape[1]

# Scale every feature to [0, 1]
scaler = MinMaxScaler()
selected_df = pd.DataFrame(
    scaler.fit_transform(selected_df),
    columns=selected_df.columns,
    index=selected_df.index)

# Re-attach the labels so that VAE.train_step can read them from each batch
selected_df["model_type"] = df["model_type"]
selected_df["cancer_type"] = df["cancer_type"]

selected_df["model_type"] = model_type_encoder.fit_transform(selected_df["model_type"])
# convert model type to int
selected_df["model_type"] = selected_df["model_type"].astype(int)
assert selected_df["model_type"].nunique() == 2, "There should be two classes"

selected_df["cancer_type"] = cancer_type_encoder.fit_transform(selected_df["cancer_type"])
# convert cancer type to int
selected_df["cancer_type"] = selected_df["cancer_type"].astype(int)

print(selected_df.shape)

# Build VAE
encoder = build_encoder(feature_count, latent_dim)  # feat count set above, lat dim is a var
decoder = build_decoder(feature_count, latent_dim)
vae = VAE(encoder, decoder, columns=selected_df.columns)
# run_eagerly=True because VAE.train_step calls .numpy() on its tensors
vae.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), run_eagerly=True)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=Path(log_dir, "weights"),
                                                 save_weights_only=True,
                                                 verbose=1)

history = vae.fit(selected_df, epochs=epochs, batch_size=batch_size, shuffle=True,
                  callbacks=[tensorboard_callback, cp_callback])

# save history
history_df = pd.DataFrame(history.history)
# history_df.to_csv(Path(log_dir, "history.tsv", sep='\t'), index=False) # sep in bracket, comma delim?
history_df.to_csv(Path(log_dir, "history.tsv"), sep='\t', index=False)
# save model
vae.encoder.save(Path(log_dir, "encoder"))
vae.decoder.save(Path(log_dir, "decoder"))

## Loss curve plot, #0

In [None]:
# add cosine similarity label to distance loss curve sub component

In [None]:
import pandas as pd
import glob
import matplotlib.pyplot as plt

In [None]:
pwd

In [None]:
ls

In [None]:
ls logs

In [None]:
ls logs/v0.3.9_prot_20231102-080655/

In [None]:
prot_hist = pd.read_csv('logs/v0.3.9_prot_20231102-080655/history.tsv', sep = '\t')

In [None]:
plot_title = 'proteomics'

In [None]:
dta_ttl

In [None]:
prot_hist # Observations
            # 2023-11-01, same as previous test run on 5 epochs:
            # recon flatlines, KL goes down, distance becomes increasingly negative
            # Thoughts
                # Add constant like +100 to the distance loss

In [None]:
Path(log_dir, "test")

In [None]:
type(Path(log_dir, "test"))

In [None]:
log_dir

In [None]:
# log_dir is built as Path("logs", <run name>), i.e. two parts, so index 2 is out of
# range; the run directory name is the last part.
loss_dir_name = log_dir.parts[-1]

In [None]:
log_dir.parts

In [None]:
print(log_dir)

In [None]:
log_dir.parts[1]

In [None]:
loss_plot_dir_matching_the_VAE_output_dir = log_dir.parts[1]

In [None]:
history_df

In [None]:
# Composite loss figure: total loss on the main axes, the three components as insets
df = history_df
xlab = 'Epoch'
axis_font_size = 18

fig, main_ax = plt.subplots(figsize=(10, 8))
main_ax.plot(df['loss'], label='Total Loss', color='blue')
main_ax.set_title('Overall Loss and Individual Loss Components, ' + dta_ttl, fontsize = 20)
main_ax.set_xlabel(xlab, fontsize = axis_font_size)
main_ax.set_ylabel('Total Loss', fontsize = axis_font_size)
main_ax.legend()

# Inset size and anchor coordinates, passed to main_ax.inset_axes below
width, height = .25, .25
a, b = .3, .65
inset_title_font_size = 14

# Reconstruction loss, upper left
ax1 = main_ax.inset_axes([a, b, width, height])  # x, y, width, height
ax1.plot(df['reconstruction_loss'], label='Recon Loss', color='green')
ax1.set_title('Reconstruction Loss', fontsize = inset_title_font_size)
ax1.set_xlabel(xlab)
ax1.set_ylabel('Recon Loss')

# KL loss, upper right
ax2 = main_ax.inset_axes([b, b, width, height])
ax2.plot(df['kl_loss'], label='KL Loss', color='red')
ax2.set_title('KL Loss', fontsize = inset_title_font_size)
ax2.set_xlabel(xlab)
ax2.set_ylabel('KL Loss')

# Distance loss, lower right
ax3 = main_ax.inset_axes([b, a-.02, width, height])
ax3.plot(df['distance_loss'], label='Distance Loss', color='orange')
ax3.set_title('Distance Loss', fontsize = inset_title_font_size)
ax3.set_xlabel(xlab)
ax3.set_ylabel('Distance Loss')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
Path('loss_plots').mkdir(parents=True, exist_ok=True)
# Separate the run-directory name from the file name so the two do not run together
plt.savefig('loss_plots/'+loss_plot_dir_matching_the_VAE_output_dir+'_'+'cmposit_loss_plot.png')

## Quant setups

In [None]:
# Non-redundant, all 5 plots in sequential order
import pandas as pd
import numpy as np
import tensorflow as tf

import umap
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

from scipy.spatial.distance import euclidean
from matplotlib.colors import ListedColormap
from joypy import joyplot

In [None]:
grey_colors = pd.read_csv('plot_color_files/grey_scale.tsv', sep = '\t', index_col = 0)

In [None]:
# TCGA colors on plots, build versions for alt y-mappings
# Maps the cancer-type names found in the 'cancer_type' column to the cohort
# abbreviations that the plotting cells look up in plot_color_files/tcga_colors.tsv.
cancer_type_abbreviation_mapping = {
    'Clear cell renal cell carcinoma': 'KIRCKICH', # combined-in chromophobe
    'Colon adenocarcinoma': 'COADREAD', # combined-in rectal adeno
    'Pancreatic ductal adenocarcinoma': 'PAAD',
    'Breast carcinoma': 'BRCA',
    'Glioblastoma': 'LGGGBM', # combined-in low grade glioma
    'Lung squamous cell carcinoma': 'LUSC',
    'Lung adenocarcinoma': 'LUAD',
    'Endometrial carcinoma': 'UCEC',
    'Head and neck squamous cell carcinoma': 'HNSC',
    'Ovarian carcinoma': 'OV'
}

Design goal  
A plot design change should update both the pre and post plots

In [None]:
# From first complete LaTeX version of unencoded plots
    # unenc_v0.4.ipynb

In [None]:
encoder = tf.keras.models.load_model("logs/v0.3.9_prot_20231102-080655/encoder")

## Plot labeling, naming, disk reads

### lat dim / epochs / notes

In [None]:
# For intermediate disk load only, ADD EPOCH count to log dir file naming
latent_dim = 50 # Already in RAM if VAE run in same session
epochs = 10      # ' ' ' ' '

In [None]:
# LATENT FILE LOC:
    # ____________ nope
    # Looks like a trained encoder is saved and loads the file (above) - done
    # what is the encdd plot target dir? - results_encdd - above unenc - subopt
    # how does the version and data type file naming line up with 
    # the unenc plots, - v0.4, just sitting in the results_unenc dir - done
        # kind of a misnomer as far as results go, it's just plots, not VAE results
        # Manual archive process - done
    # save model - in logs/version_._/data_type/ - done
    # loss plots??

    # Starting to come back now, the v0.4 referred to the 4th loss term
    # This will not work for the current 7-cancer y label setup
    # because the model-type loss function is hard-coded for binary cell line vs. human
    # So, there is a mismatch in finishing this run
    # Just get the LaTeX plot formatting from unenc v0.4 into this
    # Then go to y-mapping (file already started?)
    # and pick up the pieces from there

### Toggle zero  
Toggle after plot two, back after plot four

In [None]:
# Flow starts with UMAP then this goes to plot #2 greyscale model logreg bar
    # THIS DOESN'T PERTAIN TO UMAP, SHOULD GO LATER - copy down to point of use - 
mode_ttl = 'model type'
mode = 'model_type'

In [None]:
# save this as notes version, manually copy to uncontrolled parent dir, clear this version

### Toggle one  
Leave for all five plots

In [None]:
mdls_ttl = 'Cell line + CPTAC'
mdls = 'cptac_+_cell_line'

In [None]:
mdls_ttl = 'Cell line + CPTAC + HCMI'
mdls = 'cptac_+_cell_line_+_hcmi'

### Toggle set two  
Leave for all five plots

In [None]:
# Imported here so this cell also works after a restart that skips the training cells
from sklearn.preprocessing import MinMaxScaler

dta_ttl = 'copy number'
dta_typ = 'cnvr'
dta_typ_unenc = pd.read_csv('data/cl_cp_cnvr_996.tsv',
                       sep = '\t', index_col = 0)
# The training cell MinMax-scales the features before fitting the VAE, so apply the
# same scaling before encoding; [0] picks z_mean out of (z_mean, z_log_var, z).
dta_typ_obj = pd.DataFrame(encoder.predict(
    MinMaxScaler().fit_transform(dta_typ_unenc.iloc[:, 2:]))[0])

In [None]:
# Obs - encoder shape error if run on wrong data type

In [None]:
# Imported here so this cell also works after a restart that skips the training cells
from sklearn.preprocessing import MinMaxScaler

dta_ttl = 'gene expression'
dta_typ = 'gexp'
dta_typ_unenc = pd.read_csv('data/cl_cp_gexp_998.tsv',
                       sep = '\t', index_col = 0)
# The training cell MinMax-scales the features before fitting the VAE, so apply the
# same scaling before encoding; [0] picks z_mean out of (z_mean, z_log_var, z).
dta_typ_obj = pd.DataFrame(encoder.predict(
    MinMaxScaler().fit_transform(dta_typ_unenc.iloc[:, 2:]))[0])

In [None]:
# Imported here so this cell also works after a restart that skips the training cells
from sklearn.preprocessing import MinMaxScaler

dta_ttl = 'proteomics'
dta_typ = 'prot'
log_dta = '_prot_'
dta_typ_unenc = pd.read_csv('data/cl_cp_prot_850.tsv',
                   sep = '\t', index_col = 0)
# The training cell MinMax-scales the features before fitting the VAE, so apply the
# same scaling before encoding; [0] picks z_mean out of (z_mean, z_log_var, z).
dta_typ_obj = pd.DataFrame(encoder.predict(
    MinMaxScaler().fit_transform(dta_typ_unenc.iloc[:, 2:]))[0])

### Continue

In [None]:
print(dta_ttl)
dta_typ_obj.head(1)

In [None]:
# Attach the sample labels to the latent frame and move them to the front
feature_columns = dta_typ_obj.columns
for label_col in ('cancer_type', 'model_type'):
    # list(...) drops the source index, so the labels are assigned by row position
    dta_typ_obj[label_col] = list(dta_typ_unenc[label_col])
new_cols = ['cancer_type', 'model_type', *feature_columns]
dta_typ_obj = dta_typ_obj[new_cols]
dta_typ_obj.head(1)

In [None]:
dta_typ_obj.columns # Why does this run with numeric column names the logreg function now?

In [None]:
# Write into results dir, manually at completion of run

In [None]:
# Encoded vars, unenc_v0.4
rslts_dir = 'results_encdd'
encdg_stts_ttl = ', encoded'
encdg_stts_nam = 'encoded'

## UMAP #1

In [None]:
# Two-component UMAP of the standardized latent features (columns from index 2 on),
# with the two label columns joined back on for plotting.
# NOTE(review): no random_state is passed, so the layout may differ between runs - confirm.
reducer = umap.UMAP(n_components=2)
scaled_data = StandardScaler().fit_transform(dta_typ_obj.iloc[:, 2:])
embedding = reducer.fit_transform(scaled_data)
emb_df = pd.DataFrame(embedding, index = dta_typ_obj.index)
emb_lbld = pd.concat([emb_df, dta_typ_obj[['cancer_type', 'model_type']]], axis = 1)
emb_lbld.columns = ['UMAP_1', 'UMAP_2', 'cancer_type', 'model_type']

In [None]:
mdls_ttl, dta_ttl, dta_typ, mdls, latent_dim, encdg_stts_ttl

In [None]:
def umap_plot_to_disk(emb_lbld, mdls_ttl, dta_ttl, dta_typ, mdls):
    """Draw the labelled UMAP scatter plot and save it as a PNG.

    Args:
        emb_lbld: Frame with four columns, in order: UMAP x, UMAP y, cancer type,
            model type. It is not modified, so the function can be called repeatedly
            on the same frame.
        mdls_ttl: Model-set text for the figure title.
        dta_ttl: Data-type text for the figure title.
        dta_typ: Data-type tag for the file name.
        mdls: Model-set tag for the file name.

    Also reads the globals `cancer_type_abbreviation_mapping`, `encdg_stts_ttl`,
    `encdg_stts_nam`, `rslts_dir` and `v`.

    Returns:
        The string 'UMAP written to disk'.
    """
    tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep = '\t', index_col = 0)

    # Rename on a copy: renaming the caller's frame in place made a second call fail
    # when it looked up the 'cancer_type' column.
    plot_df = emb_lbld.copy()
    plot_df.columns = ['UMAP_1', 'UMAP_2', 'Cancer type', 'Model type']

    # One colour per cancer type, looked up through the TCGA cohort abbreviation
    custom_palette = {
        cancer_type: tcga_colors.loc[
            cancer_type_abbreviation_mapping.get(cancer_type, 'Unknown'), 'cohort_color']
        for cancer_type in plot_df['Cancer type'].unique()}

    plt.figure(figsize=(5, 5))
    marker_dict = {'Tumor': '^', 'cell line': 'o'}
    sns.scatterplot(data=plot_df, x='UMAP_1', y='UMAP_2',
                    hue='Cancer type', style='Model type', markers=marker_dict,
                    palette=custom_palette, legend='full',
                    s = 200)
    plt.xlabel('UMAP_1', fontsize=16)  # was mislabelled 'UMAP_2'
    plt.ylabel('UMAP_2', fontsize=16)
    plt.suptitle(mdls_ttl +', '+ dta_ttl, y = 1.002, fontsize = 20)
    plt.title('n = '+str(len(plot_df))+encdg_stts_ttl, fontsize = 18)
    # Single legend call; an earlier 'Cancer Type' legend was replaced by this one anyway
    legend = plt.legend(title='Sample attributes', title_fontsize='14', loc='upper left',
                        bbox_to_anchor=(1, 1), fontsize=12)
    headers_to_bold = ['Cancer type', 'Model type']
    for text in legend.texts:
        if text.get_text() in headers_to_bold:
            text.set_weight('bold')
    # Global matplotlib setting; it stays switched on for later cells as well
    plt.rcParams['text.usetex'] = True
    plt.savefig(rslts_dir+'/umap_'+dta_typ+'_'+mdls+'_'+encdg_stts_nam+'_'+v+'.png',
                bbox_inches = 'tight', dpi = 300)
    return 'UMAP written to disk'
umap_plot_to_disk(emb_lbld, mdls_ttl, dta_ttl, dta_typ, mdls)

## LogReg function

In [None]:
# Model is model type or cancer type
def log_reg(dta_typ_obj, mode):
    """Fit logistic regression on 15 random 80/20 splits and collect per-class F1.

    Args:
        dta_typ_obj: Frame whose first two columns are labels and whose remaining
            columns are the features.
        mode: Name of the label column to predict ('model_type' or 'cancer_type').

    Returns:
        DataFrame with columns 'Label' and 'F1_Score': one row per class per split.
    """
    col_X_strt = 2 # <-- Skip label columns
    f1_frames = []

    # Logistic regression repeat loop
    for _ in range(15): # <-- Error control
        # Train test split
        trn = dta_typ_obj.sample(round(len(dta_typ_obj) * .8))
        tst = dta_typ_obj.loc[~dta_typ_obj.index.isin(trn.index)]

        X_trn = trn.iloc[:, col_X_strt:]
        X_tst = tst.iloc[:, col_X_strt:]

        # Prediction targets, y is either model system or cancer type
        y_trn = trn[mode]
        y_tst = tst[mode]

        clf = LogisticRegression().fit(X_trn, y_trn)
        y_pred = clf.predict(X_tst)

        # f1_score(average=None) reports classes in sorted order over y_true and
        # y_pred together. Build that label list explicitly and pass it in, so each
        # score is paired with its own class (y_tst.unique() is in order of
        # appearance and can also miss classes that were only predicted).
        labels = np.unique(np.concatenate([np.asarray(y_tst), np.asarray(y_pred)]))
        f1_by_class = f1_score(y_tst, y_pred, labels=labels, average=None)
        f1_frames.append(pd.DataFrame({'Label': labels,
                                       'F1_Score': f1_by_class}))

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(f1_frames, axis = 0)

## Logreg model-type plot - greyscale #2

In [None]:
mode_ttl = 'model type'
mode = 'model_type'
f1_stor_frm = log_reg(dta_typ_obj, mode)
sample_counts = dict(dta_typ_obj.model_type.value_counts())
def logreg_model_plot(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode):
    """Save the greyscale bar + swarm plot of per-class F1 scores for model type.

    Reads the globals `sample_counts`, `mode_ttl`, `dta_ttl`, `mdls_ttl`,
    `encdg_stts_ttl`, `encdg_stts_nam`, `rslts_dir` and `v`. The parameters `mdls`,
    `latent_dim` and `epochs` are accepted but not used.
    """
    plt.figure(figsize=(8, 4.5))
    sns.set_style("whitegrid")
    sns.set(font_scale=1.5)

    # Pin one category order for bars, points and tick labels; otherwise the tick
    # text (value_counts order) can end up under a different class's bar.
    label_order = list(sample_counts)

    sns.barplot(x='Label', y='F1_Score', data=f1_stor_frm, order=label_order,
                palette=['#666666', '#999999'], errorbar=None)
    sns.swarmplot(x='Label', y='F1_Score', data=f1_stor_frm, order=label_order,
                  color='#333333', size=14)

    plt.suptitle('Logistic regression, '+mode_ttl+', '+dta_ttl,
                 fontsize=24, y = 1.03)
    plt.title(mdls_ttl+encdg_stts_ttl, fontsize=20)
    plt.xlabel('Model Type', fontsize=20)
    plt.ylabel('F1 Score', fontsize=20)

    # Sample counts is global var
    new_labels = [f"{label}, n = {sample_counts[label]}" for label in label_order]
    plt.xticks(ticks=range(len(new_labels)), labels=new_labels, fontsize=20)

    plt.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
    plt.rcParams['text.usetex'] = True
    plt.savefig(rslts_dir+'/log_reg_'+mode+'_'+dta_typ+'_'+encdg_stts_nam+'_'+v+'.png',
                bbox_inches='tight')
logreg_model_plot(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode)

#### LogReg string column fix

In [None]:
dta_typ_obj.columns = dta_typ_obj.columns.astype(str)

In [None]:
dta_typ_obj.head(1)

In [None]:
dta_typ_obj.columns

In [None]:
f1_stor_frm = log_reg(dta_typ_obj, mode) # Single reuse function, no errors expected (?)

## LogReg cancer-type plot - TCGA colors #3

In [None]:
mode_ttl = 'cancer type'
mode = 'cancer_type'
f1_stor_frm = log_reg(dta_typ_obj, mode)

# Cohort abbreviation -> colour lookup
tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep = '\t')
tcga_color_mapping = dict(zip(tcga_colors['tcga_cohorts'], tcga_colors['cohort_color']))

# Cancer-type label -> colour, keeping only labels that resolve to a colour
unique_labels = f1_stor_frm['Label'].unique()
palette_dict = {}
for label in unique_labels:
    tcga_abbreviation = cancer_type_abbreviation_mapping.get(label)
    color = tcga_color_mapping.get(tcga_abbreviation)
    if not color:
        continue
    palette_dict[label] = color
def lgrg_plt_fnc(f1_stor_frm, mdls, data_name, latent_dim, epochs, mode):
    """Save the TCGA-coloured bar + swarm plot of per-class F1 scores for cancer type.

    Reads the globals `palette_dict`, `mode_ttl`, `dta_ttl`, `mdls_ttl`,
    `encdg_stts_ttl`, `dta_typ`, `rslts_dir` and `v`.
    """
    headline = 'Logistic regression, '+mode_ttl+', '+dta_ttl
    subtitle = mdls_ttl+encdg_stts_ttl
    out_file = rslts_dir+'/log_reg_'+mode+'_'+dta_typ+'_'+v+'.png'
    axes_spec = dict(x='Label', y='F1_Score', data=f1_stor_frm)

    plt.figure(figsize=(8, 4))
    sns.set_style("whitegrid")

    sns.barplot(palette=palette_dict, errorbar=None, **axes_spec)
    sns.swarmplot(color='#333333', size=7, **axes_spec)

    plt.suptitle(headline, fontsize=24, y = 1.04)
    plt.title(subtitle, fontsize=20)
    plt.xlabel('Cancer type', fontsize=20)
    plt.ylabel('F1 Score', fontsize=20)

    plt.xticks(rotation=45, ha = 'right', fontsize = 16)
    plt.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
    plt.rcParams['text.usetex'] = True
    plt.savefig(out_file, bbox_inches='tight')
lgrg_plt_fnc(f1_stor_frm, mdls, dta_typ, latent_dim, epochs, mode)

## Euclidean, #4 & 5

In [None]:
# Euclidean distance, model type
def mdl_typ_dist(sample, features, df):
    """Distance from `sample` to the mean feature vector of the other model types.

    The mean is taken over all rows of `df` whose 'model_type' differs from the
    sample's, restricted to the `features` columns.
    """
    is_other = df['model_type'] != sample['model_type']
    centroid = df.loc[is_other, features].mean()
    return euclidean(sample[features], centroid)

In [None]:
# Euclidean distance, cancer type
def cncr_typ_dist(sample, features, df):
    """Distance from `sample` to the mean feature vector of the other cancer types.

    The mean is taken over all rows of `df` whose 'cancer_type' differs from the
    sample's, restricted to the `features` columns.
    """
    is_other = df['cancer_type'] != sample['cancer_type']
    centroid = df.loc[is_other, features].mean()
    return euclidean(sample[features], centroid)

In [None]:
# dta_typ_obj = pd.read_csv('data/'+file,
#                    sep = '\t', index_col = 0)
# Add two per-sample distance columns (distance to the mean of the other model types
# and of the other cancer types), then move them next to the label columns.
# feature_columns is captured before the new columns are added, so the distance
# columns never enter the feature set on a first run.
# NOTE(review): re-running this cell would put the distance columns inside
# columns[2:] - confirm the cell is only run once per loaded data set.
feature_columns = dta_typ_obj.columns[2:]
dta_typ_obj['mdl_typ_dstncs'] = dta_typ_obj.apply(
    lambda row: mdl_typ_dist(row, feature_columns, dta_typ_obj), axis=1)
dta_typ_obj['cncr_typ_dstncs'] = dta_typ_obj.apply(
    lambda row: cncr_typ_dist(row, feature_columns, dta_typ_obj), axis=1)
new_cols = ['cancer_type', 'model_type', 'cncr_typ_dstncs', 'mdl_typ_dstncs'] + list(feature_columns)
dta_typ_obj = dta_typ_obj[new_cols]

## Eucldn Colrs, #4

In [None]:
# Ridge-plot inputs for cancer type: groups ordered by descending mean distance,
# plus one TCGA colour per group in the same order.
mode_ttl = 'cancer type'
mode = 'cancer_type'
dstnc_typ = 'cncr_typ_dstncs'
average_distances = dta_typ_obj.groupby(
    mode)[dstnc_typ].mean().sort_values(ascending=False)
# .copy() so the categorical conversion below writes to an independent frame rather
# than to a slice of dta_typ_obj
sorted_df = dta_typ_obj.loc[dta_typ_obj[mode].isin(average_distances.index)].copy()
sorted_df[mode] = pd.Categorical(
    sorted_df[mode], categories=average_distances.index, ordered=True)
sorted_df = sorted_df.sort_values(mode)
tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep = '\t', index_col = 0)
custom_color_list = [tcga_colors.loc[cancer_type_abbreviation_mapping[cancer_type],'cohort_color'] for cancer_type in average_distances.index]
custom_colormap = ListedColormap(custom_color_list)

In [None]:
# Ridge (joy) plot of the cancer-type distances, one ridge per cancer type
categories = list(sorted_df[mode].unique())
counts = dict(sorted_df[mode].value_counts())  # computed once, not per annotation
# One "n=" label height per ridge. A fixed count of 7 silently dropped labels when
# more categories were present; with seven categories the heights are unchanged.
y_values = np.linspace(0.75, 0.068, len(categories))
plt.figure()
joyplot(data=sorted_df[[mode, dstnc_typ]], by=mode,
    figsize=(10, 6.5), colormap=custom_colormap,
    fade=True)

x_position = sorted_df[dstnc_typ].max()
for y_value, cancer_type in zip(y_values, categories):
    plt.annotate(f"n={counts[cancer_type]}", xy=(x_position, y_value), verticalalignment='center')

plt.suptitle('Euclidean Distances, '+mode_ttl+', '+dta_ttl,
             fontsize=30, y = 1.01)
plt.title(mdls_ttl+ ', n = '+str(dta_typ_obj.shape[0])+encdg_stts_ttl,
          y = .92, x = .22, fontsize = 26)

plt.rcParams['text.usetex'] = True

plt.annotate(
    r'Variance of means: $\mathbf{' + f'{average_distances.var():.3f}' + '}$',
    xy=(0.01, 0.87), xycoords='axes fraction',
    ha='right', va='top')

plt.savefig(rslts_dir+'/euc-rdgln_'+mode+'_'+mdls+'_'+dta_typ+'_'+v+'.png',
            bbox_inches = 'tight', dpi = 300)

### Build grey Euc ridge obj

In [None]:
# Ridge-plot inputs for model type: groups ordered by descending mean distance,
# plus one grey tone per group in the same order.
mode_ttl = 'model type'
mode = 'model_type'
dstnc_typ = 'mdl_typ_dstncs'

abbreviation_mapping = {
    'cell line': 'cell line',
    'Tumor': 'Tumor',
    'HCMI': 'HCMI', # devel
}

average_distances = dta_typ_obj.groupby(
    mode)[dstnc_typ].mean().sort_values(ascending=False)
# .copy() so the categorical conversion below writes to an independent frame rather
# than to a slice of dta_typ_obj
sorted_df = dta_typ_obj.loc[dta_typ_obj[mode].isin(average_distances.index)].copy()
sorted_df[mode] = pd.Categorical(
    sorted_df[mode], categories=average_distances.index, ordered=True)
sorted_df = sorted_df.sort_values(mode)
custom_color_list = [grey_colors.loc[
                     abbreviation_mapping[
                     model_type],'quant_mode_color'] for model_type in average_distances.index]
custom_colormap = ListedColormap(custom_color_list)

### Euc ridge grey plot

In [None]:
# Ridge (joy) plot of the model-type distances, one ridge per model type
categories = list(sorted_df[mode].unique())
counts = dict(sorted_df[mode].value_counts())  # computed once, not per annotation
# One "n=" label height per ridge. A fixed count of 2 dropped the label of any third
# model type; with two model types the heights are unchanged.
y_values = np.linspace(0.52, 0.15, len(categories))
plt.figure()
joyplot(data=sorted_df[[mode, dstnc_typ]], by=mode,
    figsize=(10, 6.5), colormap=custom_colormap,
    fade=True)

x_position = sorted_df[dstnc_typ].max()
for y_value, cancer_type in zip(y_values, categories):
    plt.annotate(f"n={counts[cancer_type]}", xy=(x_position, y_value), verticalalignment='center', fontsize = 24)

plt.suptitle('Euclidean Distances, '+mode_ttl+', '+dta_ttl,
             fontsize=30, y = 1.06)
plt.title(mdls_ttl+ ', n = '+str(dta_typ_obj.shape[0])+encdg_stts_ttl,
          y = .97, x = .4, fontsize = 26)

plt.rcParams['text.usetex'] = True

plt.annotate(
    r'Variance of means: $\mathbf{' + f'{average_distances.var():.3f}' + '}$',
    xy=(0.2, 0.87), xycoords='axes fraction',
    ha='right', va='top')

plt.savefig(rslts_dir+'/euc-rdgln_'+mode+'_'+mdls+'_'+dta_typ+'_'+v+'.png',
            bbox_inches = 'tight', dpi = 300)

In [None]:
mode_ttl = 'model type'
mode = 'model_type'
# The cells that follow read `dstnc_typ`; `dist_typ` is kept as an alias so that
# anything still using the old name keeps working.
dstnc_typ = dist_typ = 'mdl_typ_dstncs'

In [None]:
# NOTE(review): `avrg_dstnc` is not defined anywhere in the visible notebook; unless it
# is defined elsewhere this raises NameError - confirm where the helper lives.
sorted_df, custom_colormap = avrg_dstnc(mode, dstnc_typ)

In [None]:
# Identity mapping from model-type label to the row key used in `grey_colors`
# (same content as the mapping in the "Build grey Euc ridge obj" cell).
abbreviation_mapping = {
    'cell line': 'cell line',
    'Tumor': 'Tumor',
    'HCMI': 'HCMI', # test
}

In [None]:
# Ridge-plot inputs for the current `mode` / `dstnc_typ`: groups ordered by descending
# mean distance, plus one grey tone per group in the same order.
average_distances = dta_typ_obj.groupby(
    mode)[dstnc_typ].mean().sort_values(ascending=False)
# .copy() so the categorical conversion below writes to an independent frame rather
# than to a slice of dta_typ_obj
sorted_df = dta_typ_obj.loc[dta_typ_obj[mode].isin(average_distances.index)].copy()
sorted_df[mode] = pd.Categorical(
    sorted_df[mode], categories=average_distances.index, ordered=True)
sorted_df = sorted_df.sort_values(mode)
# Loaded but not used in this cell (the colours below come from grey_colors)
tcga_colors = pd.read_csv('plot_color_files/tcga_colors.tsv', sep = '\t', index_col = 0) #
custom_color_list = [grey_colors.loc[
                     abbreviation_mapping[
                     model_type],'quant_mode_color'] for model_type in average_distances.index]
custom_colormap = ListedColormap(custom_color_list)

In [None]:
# Earlier variant of the grey model-type ridge plot.
# NOTE(review): unlike the "Euc ridge grey plot" cell, the title omits encdg_stts_ttl
# and the output path hard-codes 'results_encdd' and 'v0.2' instead of using
# rslts_dir and v - confirm whether this cell is still needed.
# Two label heights, one per ridge; zip() below stops at the shorter sequence.
y_values = np.linspace(0.52, 0.15, 2)
plt.figure()
joyplot(data=sorted_df[[mode, dstnc_typ]], by=mode,
    figsize=(10, 6.5), colormap=custom_colormap,
    fade=True)

# Annotate each ridge with its sample count at the maximum distance value
for y_value, cancer_type in zip(y_values, sorted_df[mode].unique()):
    count = dict(sorted_df[mode].value_counts())[cancer_type]
    x_position = sorted_df[dstnc_typ].max()
    plt.annotate(f"n={count}", xy=(x_position, y_value), verticalalignment='center', fontsize = 24)

plt.suptitle('Euclidean Distances, '+mode_ttl+', '+dta_ttl,
             fontsize=30, y = 1.06)
plt.title(mdls_ttl+ ', n = '+str(dta_typ_obj.shape[0]),
          y = .97, x = .4, fontsize = 26)

plt.rcParams['text.usetex'] = True

# Variance of the per-group mean distances, typeset in bold math
plt.annotate(
    r'Variance of means: $\mathbf{' + f'{average_distances.var():.3f}' + '}$',
    xy=(0.2, 0.87), xycoords='axes fraction',
    ha='right', va='top')

plt.savefig('results_encdd/rdgln_'+mode+'_'+mdls+'_'+dta_typ+'_v0.2.png',
            bbox_inches = 'tight', dpi = 300)