# KnowEnG-Research Clean and Run Samples Clustering

#### Setup Required:
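* Clone the Data_Cleanup_Pipeline and Samples_Clustering_Pipeline repositories into the directory two levels above this notebook, so that they are reachable as `../../Data_Cleanup_Pipeline` and `../../Samples_Clustering_Pipeline`.
* Manually create a `results` directory in that same directory (`../../results`).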

In [15]:
#         load the library code
import os
import sys
import time

from IPython.display import display

import numpy as np
import pandas as pd

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src')
import data_cleanup_toolbox as dc

sys.path.insert(1, '../../Samples_Clustering_Pipeline/src')
import sample_clustering_toolbox as sc

sys.path.insert(1, '../src')
import KnowEnG_graphics as gu

import knpackage
import knpackage.toolbox as kn

def view_dictionary(run_parameters):
    """Print every entry of run_parameters, visiting the keys in sorted order.

    Args:
        run_parameters: dictionary to show; its keys must be sortable
            against each other.
    """
    for key in sorted(run_parameters.keys()):
        value = run_parameters[key]
        print('run_parameters[', key, '] = \n\t', value, '\n')

## Upload genomic  and/or  phenotypic spreadsheets.

In [16]:
# Results directory for this run, made by kn.create_dir under ../../results.
# NOTE(review): presumably create_dir returns the path of a new 'result...'
# sub-directory -- confirm against knpackage.
local_results = '../../results'
local_results = kn.create_dir(os.path.abspath(local_results), 'result')

# Load the data cleanup run parameters from the template run file.
data_cleanup_yaml_dir = os.path.join(os.getcwd(), '../../Data_Cleanup_Pipeline/data/run_files')
data_cleanup_pars = kn.get_run_parameters(data_cleanup_yaml_dir, 'TEMPLATE_data_cleanup.yml')
data_cleanup_pars['pipeline_type'] = 'sample_clustering_pipeline'

# Genomic spreadsheet to clean.
spreadsheet_path = '../../Samples_Clustering_Pipeline/data/spreadsheets'
spreadsheet_file_name = 'tcga_ucec_somatic_mutation_data.df'
data_cleanup_pars['spreadsheet_name_full_path'] = os.path.join(spreadsheet_path, spreadsheet_file_name)

# Phenotype spreadsheet, read from the same directory.
pheno_file_name = 'UCEC_phenotype.txt'
data_cleanup_pars['phenotype_name_full_path'] = os.path.join(spreadsheet_path, pheno_file_name)

# Where the pipeline source lives and where its output is written.
data_cleanup_pars['run_directory'] = '../../Data_Cleanup_Pipeline/src'
data_cleanup_pars['results_directory'] = local_results

In [None]:
#                           uncomment to view the parameters if necessary
# view_dictionary(data_cleanup_pars)

In [18]:
# Run the data cleanup pipeline and measure the wall-clock time it takes.
tc0 = time.time()
STATUS, message_string = dc.run_samples_clustering_pipeline(data_cleanup_pars)
cleanup_run_time = time.time() - tc0

print('Data Cleanup run time ', cleanup_run_time)

# Echo each message returned by the pipeline on its own line.
for message in message_string:
    print(message)

# List the contents of the results directory only when STATUS is truthy.
if STATUS:
    print('\nOutput Files:')
    output_files_list = os.listdir(data_cleanup_pars['results_directory'])
    for file_name in output_files_list:
        print(file_name)

Data Cleanup run time  933.1685390472412
INFO: Successfully loaded input data: ../../Samples_Clustering_Pipeline/data/spreadsheets/tcga_ucec_somatic_mutation_data.df.
INFO: Start processing phenotype data.
INFO: Successfully loaded input data: ../../Samples_Clustering_Pipeline/data/spreadsheets/UCEC_phenotype.txt.
INFO: Start to run sanity check for phenotype data.
INFO: No NA detected in row index.
INFO: No duplicate column name detected in this data set.
INFO: No duplicate row name detected in this data set.
INFO: Found 248 intersections between phenotype and spreadsheet data.
INFO: Finished running sanity check for phenotype data.
INFO: Start processing user spreadsheet data.
INFO: Start to run sanity checks for user spreadsheet data.
INFO: No NA detected in row index.
INFO: No duplicate column name detected in this data set.
INFO: No duplicate row name detected in this data set.
INFO: Mapped 17490 genes to ensemble name.
INFO: Unable to map 57 genes to ensemble name.
INFO: Finished

## Cluster the data using a samples_clustering_parameters dictionary

In [19]:
# Load the samples clustering run parameters from the benchmark run file.
samples_clustering_yaml_dir = os.path.join(os.getcwd(), '../../Samples_Clustering_Pipeline/data/run_files')
samples_clustering_pars = kn.get_run_parameters(samples_clustering_yaml_dir, 'BENCHMARK_7_SC_cc_net_nmf_parallel_shared.yml')

# The cleaned inputs are read from the data cleanup results directory and are
# named <original base name>_ETL.tsv.
spreadsheet_path = data_cleanup_pars['results_directory']

spreadsheet_file_name_base, ext = os.path.splitext(spreadsheet_file_name)
spreadsheet_cleaned_file_name = spreadsheet_file_name_base + '_ETL.tsv'
samples_clustering_pars['spreadsheet_name_full_path'] = os.path.join(spreadsheet_path, spreadsheet_cleaned_file_name)

phenotype_file_name_base, ext = os.path.splitext(pheno_file_name)
phenotype_cleaned_file_name = phenotype_file_name_base + '_ETL.tsv'
samples_clustering_pars['phenotype_name_full_path'] = os.path.join(spreadsheet_path, phenotype_cleaned_file_name)

# Gene-gene network edge file.
network_path = '../../Samples_Clustering_Pipeline/data/networks'
network_file_name = 'keg_ST90_4col.edge'
samples_clustering_pars['gg_network_name_full_path'] = os.path.join(network_path, network_file_name)

# Clustering output goes into the same results directory as the cleaned data.
samples_clustering_pars['run_directory'] = '../../Samples_Clustering_Pipeline/src'
samples_clustering_pars['results_directory'] = data_cleanup_pars['results_directory']

In [6]:
# view_dictionary(samples_clustering_pars)

## Evaluate and view the clustering:

In [None]:
# Run the clustering pipeline and report the wall-clock time it took.
start_clock = time.time()

sc.run_cc_net_nmf(samples_clustering_pars)

run_time = time.time() - start_clock

print('samples clustering run time:\t', run_time, '\n')

# Find the result files by file-name prefix. If several files share a prefix,
# the one that os.listdir returns last is the one kept.
results_dir_list          = os.listdir(samples_clustering_pars['results_directory'])
cc_5mat                   = None
cc_prefix                 = 'consensus_matrix'
cluster_evaluation_prefix = 'clustering_evaluation_result'
cluster_eval_df           = None
for result_file_name in results_dir_list:

    if result_file_name.startswith(cc_prefix):
        consensus_matrix_file = os.path.join(samples_clustering_pars['results_directory'], result_file_name)
        consensus_df          = pd.read_csv(consensus_matrix_file, sep='\t', header=0, index_col=0)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
        # same ndarray and is available in both old and new pandas versions.
        cc_5mat               = consensus_df.values

    if result_file_name.startswith(cluster_evaluation_prefix):
        cluster_eval_filename = os.path.join(samples_clustering_pars['results_directory'], result_file_name)
        cluster_eval_df       = pd.read_csv(cluster_eval_filename, sep='\t', header=0, index_col=0)

# Only draw the consensus matrix when it was found and has fewer columns than
# the display-width limit.
Maximum_Consensus_Matrix_Display_Width = 1200
if cc_5mat is not None and cc_5mat.shape[1] < Maximum_Consensus_Matrix_Display_Width:
    I0 = sc.form_consensus_matrix_graphic(cc_5mat, samples_clustering_pars['number_of_clusters'])
    display(gu.mat_to_blue(I0))

# Show the clustering evaluation table when one was written.
if cluster_eval_df is not None:
    display(cluster_eval_df)

In [8]:
# Show the results directory path, then every file name found in it.
print('Other data files available in\n', samples_clustering_pars['results_directory'], '\n')
results_dir_list = os.listdir(samples_clustering_pars['results_directory'])
for result_file_name in results_dir_list:
    print(result_file_name)

Other data files available in
 /Users/lanier4/dlanier_KnowEnG/results/result1494010219371786 

clustering_evaluation_result_Fri_05_May_2017_14_06_19.049815893.tsv
consensus_matrix_cc_net_nmf_Fri_05_May_2017_14_06_18.042548894_viz.tsv
genes_averages_by_cluster_cc_net_nmf_Fri_05_May_2017_14_06_28.620660066_viz.tsv
genes_by_samples_heatmap_cc_net_nmf_Fri_05_May_2017_14_06_21.656224012_viz.tsv
genes_variance_cc_net_nmf_Fri_05_May_2017_14_06_28.736527919_viz.tsv
samples_label_by_cluster_cc_net_nmf_Fri_05_May_2017_14_06_18.172559976_viz.tsv
silhouette_average_cc_net_nmf_Fri_05_May_2017_14_06_18.171514987_viz.tsv
tcga_ucec_somatic_mutation_data_ETL.tsv
tcga_ucec_somatic_mutation_data_MAP.tsv
tcga_ucec_somatic_mutation_data_UNMAPPED.tsv
top_genes_by_cluster_cc_net_nmf_Fri_05_May_2017_14_06_28.770649909_download.tsv
UCEC_phenotype_ETL.tsv
