# Demo - mini pipeline - KnowEnG-Research

#### Setup Required:
* Create or select a suitable directory.
* In that directory create the run and test directories: test/run_dir/results
    * (these are referred to below and allow an easy transition to docker operation)
* Clone all the repositories into that directory.
    *  git clone https://github.com/dlanier/keg_test_tools.git
    *  git clone https://github.com/KnowEnG-Research/Data_Cleanup_Pipeline.git
    *  git clone https://github.com/KnowEnG-Research/Samples_Clustering_Pipeline.git
* In the test directory of both KnowEnG pipelines, run "make env_setup".
* Double-check (slowly and carefully) all the relative path names below.

In [1]:
# Note: the  KnowEnG-Research/KnowEnG_Pipelines_Library version is required here
# with the KnowEnG-Research/Data_Cleanup_Pipeline.
#
# All paths below are relative, so they are resolved against the current
# working directory of the notebook kernel. Each directory is inserted at
# index 1 of sys.path, i.e. directly after the first entry, so it is searched
# before entries that come later in sys.path.
import os
import sys
import time

# Make the local clone of KnowEnG_Pipelines_Library importable as `knpackage`.
sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
# NOTE(review): this entry ends in '.knpackage' rather than naming a
# sub-directory; it may have no effect -- confirm whether it is needed.
sys.path.insert(1, '../../KnowEnG_Pipelines_Library.knpackage')
from knpackage import redis_utilities

# Data cleanup pipeline sources (provides data_cleanup_toolbox).
dcp_src = '../../Data_Cleanup_Pipeline/src/'
sys.path.insert(1, dcp_src)
import data_cleanup_toolbox as dc

# Test tools from the keg_test_tools clone (provides dcp_test).
sys.path.insert(1, '../../keg_test_tools/src')
import dcp_test

# Samples clustering pipeline sources (provides sample_clustering_toolbox).
sys.path.insert(1, '../../Samples_Clustering_Pipeline/src')
import sample_clustering_toolbox as sc

# Shared toolbox; used below to read the yaml run-parameter files.
import knpackage.toolbox as kn

In [3]:
# Directory used as 'results_directory' for both the cleanup and the
# clustering parameter dictionaries built in later cells.
local_results = '../../test/run_dir'
# Location and name of the run file passed to kn.get_run_parameters for the
# data cleanup step.
yaml_dir = '../../Data_Cleanup_Pipeline/data/run_files'
yaml_file = 'TEMPLATE_data_cleanup.yml'

In [4]:
#   local convenience function
def show_dictionary(a_dict):
    """Print each entry of a_dict on its own line as ``key :<TAB> value``.

    Args:
        a_dict: dictionary whose keys and values are printed in the
            dictionary's own iteration order.
    """
    for key, value in a_dict.items():
        print(key, ':\t', value)

## Locate and clean the data using a cleanup_parameters dictionary.
* Use the spreadsheet_dir and spreadsheet_name variables in the next cell to locate your file.
* The output will be written to the directory named by the local_results variable defined two cells above.

In [6]:
# Get the template yaml file
data_cleanup_pars = kn.get_run_parameters(yaml_dir, yaml_file)

# Input spreadsheet to be cleaned.
spreadsheet_dir = '../../Samples_Clustering_Pipeline/data/spreadsheets'
spreadsheet_name = 'tcga_ucec_somatic_mutation_data.df'
# Name the cleaned file is expected to have: the input name without its
# extension, plus '_ETL.tsv'. splitext is used instead of slicing off a fixed
# number of characters so this stays correct for extensions of any length.
cleaned_spreadsheet_expected_name = os.path.splitext(spreadsheet_name)[0] + '_ETL.tsv'

data_cleanup_pars['spreadsheet_name_full_path'] = os.path.join(spreadsheet_dir, spreadsheet_name)

# Override the template's directories with the local layout.
data_cleanup_pars['run_directory'] = '../../Data_Cleanup_Pipeline/src'
data_cleanup_pars['results_directory'] = local_results

# gene_priorization_pipeline, sample_clustering_pipeline, geneset_characterization_pipeline
data_cleanup_pars['pipeline_type'] = 'sample_clustering_pipeline'

# Echo the final parameters so the run configuration is visible in the notebook.
show_dictionary(data_cleanup_pars)

tcga_ucec_somatic_mutation_data_ETL.tsv
taxonid :	 9606
results_directory :	 ../../test/run_dir
phenotype_full_path :	 ../data/spreadsheets/TEST_1_phenotype.tsv
spreadsheet_name_full_path :	 ../../Samples_Clustering_Pipeline/data/spreadsheets/tcga_ucec_somatic_mutation_data.df
run_file :	 TEMPLATE_data_cleanup.yml
source_hint :	 
run_directory :	 ../../Data_Cleanup_Pipeline/src
pipeline_type :	 sample_clustering_pipeline
redis_credential :	 {'port': 6379, 'host': 'knowredis.knowhub.org', 'password': 'KnowEnG'}


In [None]:
# Run the data cleanup step for the samples clustering pipeline to get the
# cleaned output files.
# SLOW unless running on the server with redis.
t_zero = time.time()
# The call returns a status flag and a collection of messages.
# NOTE(review): message_string is iterated item by item, so it is presumably a
# list of message lines rather than a single string -- confirm.
STATUS, message_string = dc.run_samples_clustering_pipeline(data_cleanup_pars)
for message_line in message_string:
    print(message_line)
cleanup_run_time = time.time() - t_zero
if STATUS:
    print('data_cleanup_toolbox.run_samples_clustering_pipeline SUCCEEDED IN %0.3f sec'%(
        cleanup_run_time))
    # Show what the cleanup step left in the results directory.
    output_files_list = os.listdir(data_cleanup_pars['results_directory'])
    print('\n')
    for file_name in output_files_list:
        print(file_name)
else:
    # Report the failure as a failure (this branch previously said SUCCEEDED).
    print('data_cleanup_toolbox.run_samples_clustering_pipeline FAILED IN %0.3f sec'%(
        cleanup_run_time))

## Cluster the data using a samples_clustering_parameters dictionary

### Note that you may need to run "make env_setup" from the .../Samples_Clustering_Pipeline/test directory.

In [30]:
# Load the benchmark run file from the Samples_Clustering_Pipeline clone.
samples_clustering_yaml_dir = os.path.join(
    os.getcwd(), '../../Samples_Clustering_Pipeline/data/run_files')
samples_clustering_pars = kn.get_run_parameters(
    samples_clustering_yaml_dir, 'BENCHMARK_7_SC_cc_net_nmf_parallel_shared.yml')

# Cluster the spreadsheet that the cleanup step is expected to have written
# into its results directory.
spreadsheet_dir = data_cleanup_pars['results_directory']
spreadsheet_name = cleaned_spreadsheet_expected_name
samples_clustering_pars['spreadsheet_name_full_path'] = os.path.join(spreadsheet_dir, spreadsheet_name)

# Gene-gene network file used by the network-based method.
network_path = '../../Samples_Clustering_Pipeline/data/networks'
network_file_name = 'keg_ST90_4col.edge'
samples_clustering_pars['gg_network_name_full_path'] = os.path.join(network_path, network_file_name)

# Drop the phenotype entry if the run file defined one; no error if absent.
samples_clustering_pars.pop('phenotype_name_full_path', None)

# Local overrides, applied in the same order as individual assignments would be.
# NOTE(review): 'tmp_directory' is an absolute path on the original author's
# machine; change it before running elsewhere.
samples_clustering_pars.update({
    'run_directory': '../../Samples_Clustering_Pipeline/src',
    'results_directory': local_results,
    'tmp_directory': '/Users/mojo/BigDataTank/trifecta_tank',
    'number_of_bootstraps': 20,
    'number_of_clusters': 3,
    'rows_sampling_fraction': 0.8,
    'cols_sampling_fraction': 0.8,
    'rwr_restart_probability': 0.5,
})

show_dictionary(samples_clustering_pars)

cols_sampling_fraction :	 0.8
rwr_restart_probability :	 0.5
run_directory :	 ../../Samples_Clustering_Pipeline/src
parallelism :	 4
rwr_max_iterations :	 100
results_directory :	 /Users/mojo/BigDataTank/trifecta_tank
run_file :	 BENCHMARK_7_SC_cc_net_nmf_parallel_shared.yml
tmp_directory :	 /Users/mojo/BigDataTank/trifecta_tank
top_number_of_genes :	 100
nmf_penalty_parameter :	 1400
nmf_max_iterations :	 10000
processing_method :	 parallel
rwr_convergence_tolerence :	 0.0001
gg_network_name_full_path :	 ../../Samples_Clustering_Pipeline/data/networks/keg_ST90_4col.edge
nmf_conv_check_freq :	 50
nmf_max_invariance :	 200
spreadsheet_name_full_path :	 /Users/mojo/BigDataTank/trifecta_tank/tcga_ucec_somatic_mutation_data_ETL.tsv
number_of_bootstraps :	 20
rows_sampling_fraction :	 0.8
method :	 cc_net_nmf
number_of_clusters :	 3


In [31]:
# Note that the printed output below is from a development version
# of samples clustering dependencies.
#
# Run the clustering with the parameters assembled above and report the
# elapsed time as the difference of two time.time() readings (seconds).
start_clock = time.time()
# Any return value of run_cc_net_nmf is discarded here.
sc.run_cc_net_nmf(samples_clustering_pars)
run_time = time.time() - start_clock
print('samples clustering run time:\t', run_time)


		 run_cc_net_nmf
spreadsheet_mat (11128, 248): 112
network_mat (11128, 11128): 56
lap_diag (11128, 11128)
lap_pos (11128, 11128)
saved spreadsheet_mat to /Users/mojo/keg_tmp/keg_notebooks/A_Top_Secret/test_file_4_npdump in 0.0869 seconds
In find_and_save_cc_net_nmf_clusters_parallel: file name is  /Users/mojo/keg_tmp/keg_notebooks/A_Top_Secret/test_file_4_npdump
spreadsheet_mat = 
1.0
Using number_of_cores = 20
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_mat shape =  (11128, 248)
spreadsheet_m

In [32]:
# Print the name of every entry now present in the clustering results
# directory, one per line, in the order os.listdir returns them.
results_dir_list = os.listdir(samples_clustering_pars['results_directory'])
for result_file_name in results_dir_list:
    print(result_file_name)

consensus_matrix_cc_net_nmf_Sun_12_Mar_2017_17_15_10.206341028_viz.tsv
genes_averages_by_cluster_cc_net_nmf_Sun_12_Mar_2017_17_15_21.663466930_viz.tsv
genes_by_samples_heatmap_cc_net_nmf_Sun_12_Mar_2017_17_15_12.265441894_viz.tsv
genes_variance_cc_net_nmf_Sun_12_Mar_2017_17_15_21.768404960_viz.tsv
samples_label_by_cluster_cc_net_nmf_Sun_12_Mar_2017_17_15_10.395998954_viz.tsv
silhouette_average_cc_net_nmf_Sun_12_Mar_2017_17_15_10.393580913_viz.tsv
tcga_ucec_somatic_mutation_data_ETL.tsv
tcga_ucec_somatic_mutation_data_MAP.tsv
tcga_ucec_somatic_mutation_data_UNMAPPED.tsv
top_genes_by_cluster_cc_net_nmf_Sun_12_Mar_2017_17_15_21.816582918_download.tsv
