In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd

import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

# data_cleanup_toolbox test: Gene Prioritization

In [3]:
KnowEnG_GP_dir = '../../Gene_Prioritization_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'STRING_experimental_gene_gene.edge')
adj_mat, unique_gene_names = kn.get_sparse_network_matrix(network_full_file)

In [4]:
KnowEnG_DC_dir = '../../Data_Cleanup_Pipeline'
yml_file_name = 'data_cleanup.yml'
yml_dir = os.path.join(KnowEnG_DC_dir, 'data/run_files')
run_parameters = kn.get_run_parameters(yml_dir, yml_file_name)    

data_dir = './synth_data'
run_parameters['spreadsheet_name_full_path'] = os.path.join(data_dir, 'synth_5x5_spreadsheet.df')
run_parameters['phenotype_full_path'] = os.path.join(data_dir, 'synth_5x5_phenotype.df')

run_parameters['results_directory'] = './results'
run_parameters['run_directory'] = './'

dw.show_dictionary(run_parameters)

phenotype_full_path :	 ./synth_data/synth_5x5_phenotype.df
pipeline_type :	 gene_priorization_pipeline
redis_credential :	 {'password': 'KnowEnG', 'port': 6379, 'host': 'knowredis.knowhub.org'}
results_directory :	 ./results
run_directory :	 ./
run_file :	 data_cleanup.yml
source_hint :	 
spreadsheet_name_full_path :	 ./synth_data/synth_5x5_spreadsheet.df
taxonid :	 9606


## 1) create a spreadsheet and phenotype file that pass

In [10]:
random_df = data_synth.get_rand_dataframe(n_rows=5, n_cols=5, row_name_chars=15, col_name_chars=8)
random_df.to_csv(run_parameters['phenotype_full_path'], sep='\t')
random_df

Unnamed: 0,TDRYMUBC,AEOYLFIB,QWVAGEWV,EKONGXDX,WNZHJLRF
SOIIAZEUCSYQQ,0.264318,0.689766,0.305338,0.778831,0.446985
ELXLGMFDRLDLO,0.825476,0.330332,0.742672,0.949911,0.490361
EDYHISYRMWMTS,0.321523,0.549223,0.68887,0.974571,0.722941
XZHDTSVYXUPFS,0.322256,0.160509,0.358588,0.023793,0.418627
PBTYFMUZERSEI,0.395,0.906352,0.514123,0.127407,0.327006


In [11]:
ensembl_names_dict = {}
start_name = 0
for rand_name in list(random_df.index.values):
    ensembl_names_dict[rand_name] = unique_gene_names[start_name]
    start_name += 1

random_df.rename(index=ensembl_names_dict, inplace=True)
random_df.to_csv(run_parameters['spreadsheet_name_full_path'], sep='\t')
random_df

Unnamed: 0,TDRYMUBC,AEOYLFIB,QWVAGEWV,EKONGXDX,WNZHJLRF
ENSG00000000003,0.264318,0.689766,0.305338,0.778831,0.446985
ENSG00000000005,0.825476,0.330332,0.742672,0.949911,0.490361
ENSG00000000419,0.321523,0.549223,0.68887,0.974571,0.722941
ENSG00000000457,0.322256,0.160509,0.358588,0.023793,0.418627
ENSG00000000460,0.395,0.906352,0.514123,0.127407,0.327006


In [12]:
validation_flag, message = dc.run_gene_priorization_pipeline(run_parameters)
print(message)

This is a valid user spreadsheet. Proceed to next step analysis.


## negative values in spreadsheet - should pass, negative phenotype values - should fail

## create a spreadsheet and phenotype file with no intersection - should fail

In [13]:
random_df = data_synth.get_rand_dataframe(n_rows=5, n_cols=5, row_name_chars=15, col_name_chars=8)
random_df.to_csv(run_parameters['phenotype_full_path'], sep='\t')
random_df

Unnamed: 0,WIFSSYBZ,FPLQXFPI,OXVNSFAD,LISGICKC,ADTVIPDC
DBBKXZSZXBDZS,0.607454,0.724054,0.552735,0.126308,0.994306
YQMBZVYXJFPGQ,0.972533,0.797067,0.857623,0.106495,0.054251
NUGQXXYLJACHS,0.561656,0.467114,0.99271,0.643279,0.085763
KUVKEQQJDOIXK,0.174448,0.920479,0.603322,0.4564,0.701734
BFZRCIZYZIYMI,0.483582,0.209566,0.196854,0.600636,0.065224


In [14]:
validation_flag, message = dc.run_gene_priorization_pipeline(run_parameters)
print(message)

Cannot find intersection between user spreadsheet column and phenotype data.
