# Spreadsheet - cluster_averages 

### Return a dataframe of averages for each category given a genes x samples dataframe and a samples classification dictionary: 
* **Select your "Input Files" with the dropdown listboxes.**
* **Press "Get Cluster Averages" and the spreadsheet of averages will be written to a file by the same name with "_Clst_Avg" appended.**

In [1]:
# %%html
# <style>
# div.input {
#     display:none;
# }
# div.output_stderr{
#     display:none
# }
# </style>

In [2]:
#                                         single cell to select the input files, compute the cluster averages and write them:
#                                         target directory set for docker run -v `pwd`:...   ==  mount user data
# Directory that is listed to populate the file dropdowns; the selected file
# names are joined onto it to build the input paths.
target_dir = '../../'

import warnings
warnings.filterwarnings('ignore')

import os
import sys
import pandas as pd
import numpy as np

from IPython.display import display
import ipywidgets as widgets

#                                         local function to read files and get common samples and write:
def _average_by_cluster(spreadsheet_df, sample_labels):
    """Return the per-cluster mean of the columns of a spreadsheet.

    Columns are matched to their cluster label by sample name rather than by
    position, so the label file may list the samples in any order and may
    contain more or fewer samples than the spreadsheet. Columns without a
    label are left out of every average.

    Args:
        spreadsheet_df: DataFrame whose column names are sample names.
        sample_labels: Series of cluster labels indexed by sample name.

    Returns:
        DataFrame indexed like ``spreadsheet_df`` with one column per cluster
        label found among the labelled columns (in sorted order). It has no
        columns when no spreadsheet column has a label.
    """
    labels = sample_labels.dropna()
    # Header names read by pandas are strings while an index parsed from a
    # column of numeric sample names is not; compare both as strings.
    labels.index = labels.index.astype(str)
    # The last entry wins for a repeated sample name, as it would in a dict.
    labels = labels[~labels.index.duplicated(keep='last')]

    sample_names = spreadsheet_df.columns.astype(str)
    has_label = sample_names.isin(labels.index)
    labelled_df = spreadsheet_df.loc[:, has_label]
    # One label per remaining column, in column order.
    column_labels = labels.loc[sample_names[has_label]].to_numpy()

    return pd.DataFrame(
        {
            cluster: labelled_df.loc[:, column_labels == cluster].mean(axis=1)
            for cluster in np.unique(column_labels)
        },
        index=spreadsheet_df.index,
    )


def get_cluster_averages(obie_jobie):
    """Button callback: write the per-cluster averages of the selected spreadsheet.

    Reads the spreadsheet chosen in ``flistbx_1`` and the two-column
    (sample, cluster_number) label file chosen in ``flistbx_2``, both
    tab-separated and located under ``target_dir``. The averages are written,
    tab-separated, next to the spreadsheet with ``_Clst_Avg.tsv`` replacing
    its extension, and the output path is printed.

    Does nothing when the file list is empty or only holds the 'No Data'
    placeholder. Nothing is written when the two files share no sample names.

    Args:
        obie_jobie: the object passed by the button's on_click; unused.
    """
    if not my_file_list or my_file_list[0] == 'No Data':
        return

    file_name_1 = os.path.join(target_dir, flistbx_1.value)
    file_name_2 = os.path.join(target_dir, flistbx_2.value)

    spreadsheet_df = pd.read_csv(file_name_1, sep='\t', index_col=0, header=0)
    labels_df = pd.read_csv(file_name_2, sep='\t', index_col=0, names=['sample', 'cluster_number'])

    cluster_ave_df = _average_by_cluster(spreadsheet_df, labels_df['cluster_number'])
    if cluster_ave_df.shape[1] == 0:
        print('No samples in common between\n', file_name_1, '\nand\n', file_name_2)
        return

    name_base_1, _ = os.path.splitext(file_name_1)
    outfile_name = name_base_1 + '_Clst_Avg.tsv'
    cluster_ave_df.to_csv(outfile_name, sep='\t', index=True, header=True)
    print('Output written to\n', outfile_name)

#                                         Get list of (docker run -v) mounted files:
FEXT = ['.tsv', '.txt', '.df']
flist = os.listdir(target_dir)

# Keep only regular files whose extension is one of the supported ones.
my_file_list = [
    entry
    for entry in flist
    if os.path.isfile(os.path.join(target_dir, entry))
    and os.path.splitext(entry)[1] in FEXT
]

#                                         nothing usable was mounted: give the dropdowns a placeholder entry
if not my_file_list:
    my_file_list.append('No Data')

#                                         Create and display the widget controls:
# Both dropdowns offer the same files and start on the first entry.
default_choice = my_file_list[0]

flistbx_1 = widgets.Dropdown(
    description='Select Spreadsheet File:',
    options=my_file_list,
    value=default_choice,
)
display(flistbx_1)

flistbx_2 = widgets.Dropdown(
    description='Select Dictionary File:',
    options=my_file_list,
    value=default_choice,
)
display(flistbx_2)

# Settings for the button that triggers the computation.
button_settings = {
    'description': 'Get Cluster Averages',
    'disabled': False,
    'button_style': '',
    'tooltip': 'get cluster averages button',
    'data_file_key': 'output_file_name',
}
output_file_button = widgets.Button(**button_settings)
# Clicking the button runs get_cluster_averages on the current selections.
output_file_button.on_click(get_cluster_averages)
display(output_file_button)


Output written to
 ./transform_data/gene_samples_small_Clst_Avg.tsv
