# Spreadsheet - select_categorical 

### From a genes x samples spreadsheet and a samples x phenotypes spreadsheet, return both spreadsheets with only the samples corresponding to a category in a phenotype: 
* **Select your "Input Files" in the first 2 dropdown listboxes, then select the phenotype id and the category in the next 2 dropdown listboxes.**
* **Press "Select Categorical" and both spreadsheets with only the samples corresponding to a category in a phenotype will be written to files by the same names with "_Slct_Ctg" appended.**

In [1]:
# %%html
# <style>
# div.input {
#     display:none;
# }
# div.output_stderr{
#     display:none
# }
# </style>

In [2]:
#                                         single cell for select, open and select categorical:
#                                         target directory set for docker run -v `pwd`:...   ==  mount user data
#                                         Input files are listed from this directory, and every input / output
#                                         path in this cell is built by joining a file name onto it.
target_dir = '../../'

import warnings
#                                         Suppress all warnings raised from this point on.
warnings.filterwarnings('ignore')

import os
import sys
import pandas as pd
import numpy

from IPython.display import display
import ipywidgets as widgets

#                                         local function to read files and select categorical and write:
def select_categorical(obie_jobie):
    """Write both spreadsheets restricted to the samples of the selected category.

    Reads the genes x samples file chosen in flistbx_1 and the samples x
    phenotypes file chosen in flistbx_2, keeps only the samples whose value for
    the phenotype chosen in flistbx_3 equals the category chosen in flistbx_4,
    and writes both reduced tables next to their inputs with the file extension
    replaced by '_Slct_Ctg.tsv'.

    Does nothing when no usable input file was found.

    Args:
        obie_jobie: argument supplied by the button click callback; not used.
    """
    if len(my_file_list) == 0 or my_file_list[0] == 'No Data':
        return

    file_name_1 = os.path.join(target_dir, flistbx_1.value)
    file_name_2 = os.path.join(target_dir, flistbx_2.value)
    spreadsheet_df = pd.read_csv(file_name_1, sep='\t', index_col=0, header=0)
    phenotype_df = pd.read_csv(file_name_2, sep='\t', index_col=0, header=0)
    phenotype_id = flistbx_3.value
    select_category = flistbx_4.value

    phenotype_column = phenotype_df[phenotype_id]
    if isinstance(select_category, float) and numpy.isnan(select_category):
        # NaN never compares equal to anything (itself included), so the
        # missing-value category has to be matched with isna() instead of ==.
        in_category = phenotype_column.isna()
    else:
        in_category = phenotype_column == select_category
    samples_list = phenotype_df.index[in_category]

    # Samples are the rows of the phenotype table and the columns of the spreadsheet.
    phenotype_category_df = phenotype_df.loc[samples_list]
    spreadsheet_category_df = spreadsheet_df[samples_list]

    outfile_name_1 = os.path.splitext(file_name_1)[0] + '_Slct_Ctg.tsv'
    outfile_name_2 = os.path.splitext(file_name_2)[0] + '_Slct_Ctg.tsv'
    spreadsheet_category_df.to_csv(outfile_name_1, sep='\t', index=True, header=True)
    phenotype_category_df.to_csv(outfile_name_2, sep='\t', index=True, header=True)
    print('Outputs written to\n', outfile_name_1,'\nand\n',outfile_name_2)

#                                         Get list of (docker run -v) mounted files:
flist = os.listdir(target_dir)
FEXT = ['.tsv', '.txt', '.df']
#                                         Keep regular files only, and only those with a recognised extension:
my_file_list = [
    entry
    for entry in flist
    if os.path.isfile(os.path.join(target_dir, entry))
    and os.path.splitext(entry)[1] in FEXT
]

#                                         Nothing usable was mounted: fall back to a placeholder entry.
if not my_file_list:
    my_file_list.append('No Data')
    
    
def all_phenotypes(file_rel_path):
    """Return the phenotype names, i.e. the column labels, of a samples x phenotypes file.

    Args:
        file_rel_path: file name relative to target_dir. The file is read as a
            tab-separated table whose first column is the index and whose
            first row is the header.

    Returns:
        A list of the column labels, or ['No Data or Invalid File'] when the
        file cannot be read or has no phenotype columns.
    """
    try:
        phenotype_df = pd.read_csv(os.path.join(target_dir,file_rel_path), sep='\t', index_col=0, header=0)
    except Exception:
        # Best effort: any read/parse failure yields the placeholder, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return ['No Data or Invalid File']

    phenotypes = list(phenotype_df.columns)
    # Callers take element [0] of the result, so never hand back an empty list.
    return phenotypes if phenotypes else ['No Data or Invalid File']


def nanunique(x):
    """Return the distinct values of x with all NaN entries collapsed into one.

    The non-NaN values are de-duplicated with numpy.unique (sorted); a single
    NaN is appended at the end when x contained any. NaN entries are set aside
    before numpy.unique runs because it sorts its input, and sorting an object
    array that mixes strings with float NaN raises TypeError.

    Args:
        x: array-like of values (e.g. one column of a dataframe).

    Returns:
        A numpy array of the distinct values. It has object dtype when that is
        needed to keep NaN (or differently typed values) from being converted
        to strings.
    """
    values = numpy.asarray(x).ravel()
    nan_mask = numpy.array(
        [isinstance(v, float) and bool(numpy.isnan(v)) for v in values],
        dtype=bool,
    )
    present = values[~nan_mask]

    mixed_types = False
    try:
        uniques = list(numpy.unique(present))
    except TypeError:
        # Values that cannot be ordered against each other (e.g. str and int):
        # de-duplicate in order of first appearance instead of sorting.
        uniques = list(dict.fromkeys(present.tolist()))
        mixed_types = True

    has_nan = bool(nan_mask.any())
    if has_nan:
        uniques.append(numpy.nan)

    result = numpy.array(uniques, dtype=object) if mixed_types else numpy.array(uniques)
    if has_nan and result.dtype.kind in 'US':
        # numpy would otherwise turn the NaN into the text 'nan'.
        result = numpy.array(uniques, dtype=object)
    return result

def all_categories(file_rel_path,phenotype_id):
    """Return the categories, i.e. the distinct values, of one phenotype column.

    Args:
        file_rel_path: file name relative to target_dir. The file is read as a
            tab-separated table whose first column is the index and whose
            first row is the header.
        phenotype_id: label of the column whose distinct values are wanted.

    Returns:
        A list of the distinct values of that column, or
        ['No Data or Invalid File'] when the file cannot be read, the
        phenotype is not one of its columns, or no categories are found.
    """
    try:
        phenotype_df = pd.read_csv(os.path.join(target_dir,file_rel_path), sep='\t', index_col=0, header=0)
        categories = list(nanunique(phenotype_df[phenotype_id]))
    except Exception:
        # Same best-effort fallback as all_phenotypes, so that building or
        # refreshing the dropdowns does not abort on an unusable file.
        return ['No Data or Invalid File']

    # Callers take element [0] of the result, so never hand back an empty list.
    return categories if categories else ['No Data or Invalid File']
        

#                                         Create and display the widget controls:
flistbx_1 = widgets.Dropdown(
    options=my_file_list,
    value=my_file_list[0],
    description='Select File 1:'
)

flistbx_2 = widgets.Dropdown(
    options=my_file_list,
    value=my_file_list[0],
    description='Select File 2:'
)

#                                         The phenotype and category dropdowns start out describing the first
#                                         listed file, which is also the initial value of 'Select File 2'.
#                                         Parse that file once per list instead of once per widget argument.
initial_phenotypes = all_phenotypes(my_file_list[0])
initial_categories = all_categories(my_file_list[0], initial_phenotypes[0])

flistbx_3 = widgets.Dropdown(
    options=initial_phenotypes,
    value=initial_phenotypes[0],
    description='Select Phenotype Id:'
)

flistbx_4 = widgets.Dropdown(
    options=initial_categories,
    value=initial_categories[0],
    description='Select Select Category:'
)

def handle_file_change(change):
    """Refresh the phenotype and category dropdowns for a newly selected file.

    Registered as the observer of the 'value' attribute of widget flistbx_2.

    Args:
        change: the change notification; change['new'] is the newly selected
            file name.
    """
    new_file = change['new']

    phenotypes = all_phenotypes(new_file)
    flistbx_3.options = phenotypes
    flistbx_3.value = phenotypes[0]

    # Look the categories up only after the phenotype dropdown has been
    # updated, so that they belong to the phenotype it now shows.
    categories = all_categories(new_file, flistbx_3.value)
    flistbx_4.options = categories
    flistbx_4.value = categories[0]

flistbx_2.observe(handle_file_change, names='value')


def handle_phenotype_change(change):
    """Refresh the category dropdown for a newly selected phenotype.

    Registered as the observer of the 'value' attribute of widget flistbx_3.

    Args:
        change: the change notification; change['new'] is the newly selected
            phenotype id.
    """
    # The categories are read from the file currently chosen in flistbx_2.
    categories = all_categories(flistbx_2.value, change['new'])
    flistbx_4.options = categories
    flistbx_4.value = categories[0]

flistbx_3.observe(handle_phenotype_change, names='value')

#                                         Show the four dropdowns in the order they are meant to be filled in:
display(flistbx_1, flistbx_2, flistbx_3, flistbx_4)

#                                         Button that runs select_categorical on the current selections:
button_settings = {
    'description': 'Select Categorical',
    'disabled': False,
    'button_style': '',
    'tooltip': 'select categorical button',
    'data_file_key': 'output_file_name',
}
output_file_button = widgets.Button(**button_settings)
output_file_button.on_click(select_categorical)
display(output_file_button)


Outputs written to
 ./transform_data/transform_5_spreadsheet_Slct_Ctg.tsv 
and
 ./transform_data/spreadsheet_Two_Slct_Ctg.tsv
