# Spreadsheets .G.  genes X samples special case test data files generator
* phenotype file with one row having so many "NA" values that bootstrap sampling produces empty or nearly empty rows.
* spreadsheet files to aggravate the same

In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
#                                              Set output directory
# Both phenotype flavours are written next to the spreadsheet, so the two
# phenotype directory names simply alias the common output directory.
out_data_dir = '../data/GP_special_case/NA_80'
pheno_pearson_data_dir = pheno_t_test_data_dir = out_data_dir

In [3]:
#                                              Define spreadsheet data
# Matrix dimensions; the phenotype matrix uses the same number of columns as
# the spreadsheet.
n_spreadsheet_rows, n_phenotype_rows, n_spreadsheet_cols = 12, 7, 9
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)
phenotype_shape = (n_phenotype_rows, n_spreadsheet_cols)

# Binarize uniform random draws: anything below 0.5 becomes 0.0, the rest 1.0.
uniform_draws = np.random.random(spreadsheet_shape)
spreadsheet_data = np.where(uniform_draws < 0.5, 0.0, 1.0)
spreadsheet_data

array([[ 0.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.],
       [ 1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.],
       [ 1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  0.],
       [ 0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.],
       [ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.]])

In [4]:
#                                              Define phenotype data
# Continuous (pearson) phenotype: shift the uniform draws down by 0.24, then
# zero every value that ends up below 0.26.
pheno_pearson_data = np.random.random(phenotype_shape) - 0.24
pheno_pearson_data = np.where(pheno_pearson_data < 0.26, 0.0, pheno_pearson_data)
print(' pearson data:\n', pheno_pearson_data)

# Binary (t_test) phenotype: draws below 0.5 become 0.0, everything else 1.0.
pheno_t_test_data = np.where(np.random.random(phenotype_shape) < 0.5, 0.0, 1.0)
print('\n t_test data:\n', pheno_t_test_data)

# Request n_phenotype_rows names of length 7 to label the phenotype rows.
drug_names = data_synth.get_rand_unique_name_list(n_names=n_phenotype_rows, name_length=7)
drug_names

 pearson data:
 [[ 0.          0.          0.34452088  0.47227906  0.28130707  0.36308518
   0.          0.26665427  0.        ]
 [ 0.          0.35713199  0.          0.          0.          0.56337751
   0.30125909  0.61891223  0.        ]
 [ 0.69733128  0.          0.72615545  0.          0.322662    0.43242509
   0.          0.          0.        ]
 [ 0.          0.51617701  0.61376067  0.47053311  0.          0.
   0.34009851  0.40214373  0.68922356]
 [ 0.          0.29285254  0.51516262  0.          0.60383118  0.          0.
   0.58694326  0.        ]
 [ 0.          0.          0.63170117  0.          0.66296053  0.
   0.42195521  0.          0.75718137]
 [ 0.5022138   0.          0.45299663  0.          0.28225057  0.
   0.49233856  0.          0.        ]]

 t_test data:
 [[ 0.  0.  0.  1.  0.  1.  1.  0.  1.]
 [ 1.  1.  0.  1.  1.  1.  1.  1.  0.]
 [ 1.  1.  1.  1.  1.  0.  1.  1.  1.]
 [ 0.  0.  1.  1.  1.  0.  0.  0.  0.]
 [ 1.  1.  0.  0.  1.  0.  0.  1.  0.]
 [ 0.  1.  0.

['SVEIPFK', 'SCKALDE', 'VDRUXUI', 'JPASLFW', 'FMSPWML', 'SMDMQRO', 'GBTTTOI']

In [5]:
#                                              Define spreadsheet NaN data
# Work on a copy so spreadsheet_data itself is not modified; only the
# top-left cell is replaced with NaN.
some_nan_data = np.copy(spreadsheet_data)
some_nan_data[0, 0] = np.nan
some_nan_data

array([[ nan,   1.,   1.,   1.,   0.,   1.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   0.,   1.,   1.,   0.,   1.,   1.],
       [  1.,   0.,   1.,   1.,   1.,   1.,   1.,   0.,   1.],
       [  1.,   0.,   1.,   0.,   1.,   1.,   1.,   1.,   0.],
       [  0.,   1.,   0.,   1.,   0.,   0.,   1.,   0.,   1.],
       [  0.,   1.,   1.,   0.,   1.,   0.,   1.,   0.,   0.],
       [  1.,   1.,   1.,   1.,   0.,   1.,   1.,   0.,   1.],
       [  0.,   1.,   0.,   0.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   0.,   1.,   1.,   0.,   1.,   1.,   1.,   0.],
       [  0.,   1.,   0.,   1.,   0.,   0.,   0.,   0.,   1.],
       [  1.,   0.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   0.,   1.,   1.,   1.]])

In [6]:
#                                              Get gene - ensembl names list for simulation
# Path of the network edge file that is handed to the knpackage loader.
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
# Only the second returned value (ensembl_names) is kept; the first is
# deleted immediately.
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat
# NOTE(review): raw_data_dir is reassigned by the next cell (In [7]) before
# any visible use of this value - confirm whether this line is still needed.
raw_data_dir = '../../'

In [7]:
#                                              Get assorted gene names list for simulation
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_full_path = os.path.join(raw_data_dir, sp_file)
# Only the row labels are needed, so the loaded frame is never bound to a name.
asorted_gene_names = list(
    pd.read_csv(sp_full_path, sep='\t', index_col=0, header=0).index)

## Create bad data: phenotype data with 80% "NA" in one row
* and some bad data elsewhere

In [8]:
#                                              Set first row to 80% NA values the rest to up to 20% NA (pearson)
pheno_pearson_NA_data = np.copy(pheno_pearson_data)
percent_Bad = 0.8

# Row 0: blank out a fixed fraction of randomly chosen columns.
n_bad_first_row = int(np.round(percent_Bad * n_spreadsheet_cols))
select_NA = np.random.permutation(n_spreadsheet_cols)[:n_bad_first_row]
pheno_pearson_NA_data[0, select_NA] = np.nan

# Remaining rows: blank out a random number of columns, scaled by percent_Bad_II.
percent_Bad_II = 0.2
for r in range(1, n_phenotype_rows):
    # Shuffle first, then draw the count, to keep the random-number call order.
    shuffled_cols = np.random.permutation(n_spreadsheet_cols)
    n_bad = int(np.round(np.random.random() * percent_Bad_II * n_spreadsheet_cols))
    select_NA = shuffled_cols[:n_bad]
    pheno_pearson_NA_data[r, select_NA] = np.nan

pheno_pearson_NA_data

array([[        nan,         nan,         nan,         nan,  0.28130707,
         0.36308518,         nan,         nan,         nan],
       [ 0.        ,         nan,  0.        ,  0.        ,  0.        ,
         0.56337751,         nan,  0.61891223,  0.        ],
       [        nan,  0.        ,  0.72615545,  0.        ,  0.322662  ,
         0.43242509,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.51617701,  0.61376067,  0.47053311,  0.        ,
         0.        ,  0.34009851,         nan,  0.68922356],
       [        nan,  0.29285254,  0.51516262,  0.        ,  0.60383118,
         0.        ,  0.        ,  0.58694326,  0.        ],
       [ 0.        ,  0.        ,  0.63170117,  0.        ,  0.66296053,
                nan,  0.42195521,  0.        ,  0.75718137],
       [        nan,  0.        ,  0.45299663,  0.        ,  0.28225057,
         0.        ,  0.49233856,  0.        ,  0.        ]])

In [9]:
#                                              Set first row to 80% NA values the rest to up to 20% NA (t_test)
pheno_t_test_NA_data = np.copy(pheno_t_test_data)
percent_Bad = 0.8

# First row: a fixed share of randomly picked columns is set to NaN.
first_row_na_count = int(np.round(percent_Bad * n_spreadsheet_cols))
select_NA = np.random.permutation(n_spreadsheet_cols)[:first_row_na_count]
pheno_t_test_NA_data[0, select_NA] = np.nan

# Every other row: a randomly sized, randomly placed set of NaN columns.
percent_Bad_II = 0.2
for r in range(1, n_phenotype_rows):
    # The permutation is drawn before the count so the random stream is
    # consumed in the same order as before.
    column_order = np.random.permutation(n_spreadsheet_cols)
    row_na_count = int(np.round(np.random.random() * percent_Bad_II * n_spreadsheet_cols))
    select_NA = column_order[:row_na_count]
    pheno_t_test_NA_data[r, select_NA] = np.nan

pheno_t_test_NA_data

array([[ nan,  nan,   0.,  nan,  nan,  nan,   1.,  nan,  nan],
       [  1.,   1.,   0.,   1.,   1.,   1.,   1.,   1.,  nan],
       [  1.,   1.,   1.,   1.,   1.,   0.,   1.,  nan,   1.],
       [  0.,   0.,   1.,   1.,   1.,   0.,  nan,   0.,   0.],
       [  1.,   1.,   0.,   0.,   1.,   0.,   0.,   1.,   0.],
       [  0.,   1.,   0.,   0.,   1.,   1.,   0.,   0.,   1.],
       [  1.,   0.,   0.,   0.,   1.,   1.,   0.,   1.,   0.]])

In [18]:
#                                              Output a spreadsheet and phenotype with paired names
# Set DO_WRITE_OUT to False to only print the frames without touching disk.
DO_WRITE_OUT = True
BASE_NAME = 'spreadsheet_C_'

if DO_WRITE_OUT:
    # DataFrame.to_csv does not create missing directories, so make sure
    # every destination exists before the first write.
    for needed_dir in (out_data_dir, pheno_pearson_data_dir, pheno_t_test_data_dir):
        os.makedirs(needed_dir, exist_ok=True)

# Spreadsheet: rows are labelled with the first n_spreadsheet_rows entries of
# ensembl_names, columns with freshly generated names of length 5.
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
rand_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_cols, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
if DO_WRITE_OUT:
    spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)



# Pearson phenotype: shares the spreadsheet's column names (rand_names) so
# the two files pair up; written as-is and then transposed.
pheno_pearson_df = pd.DataFrame(pheno_pearson_NA_data, index=drug_names, columns=rand_names)
if DO_WRITE_OUT:
    pheno_pearson_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t',
                            index=True, header=True)
print('\n\n\n', BASE_NAME + '.P.pearson.tsv', '\n', pheno_pearson_df)

# From here on pheno_pearson_df holds the transposed frame.
pheno_pearson_df = pheno_pearson_df.transpose()
if DO_WRITE_OUT:
    pheno_pearson_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.T.tsv'), sep='\t',
                            index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.T.tsv', '\n', pheno_pearson_df)



# t_test phenotype: same layout and the same two orientations as above.
pheno_t_test_df = pd.DataFrame(pheno_t_test_NA_data, index=drug_names, columns=rand_names)
if DO_WRITE_OUT:
    pheno_t_test_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t',
                           index=True, header=True)
print('\n\n\n', BASE_NAME + '.P.t_test.tsv', '\n', pheno_t_test_df)

# From here on pheno_t_test_df holds the transposed frame.
pheno_t_test_df = pheno_t_test_df.transpose()
if DO_WRITE_OUT:
    pheno_t_test_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.T.tsv'), sep='\t',
                           index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.T.tsv', '\n', pheno_t_test_df)

spreadsheet_C_.G.tsv 
                  SNHGW  DSVZL  LMIBW  QGLJS  VDJAT  XLGSY  BHZNK  SDCGW  FNAJX
ENSG00000000005    0.0    1.0    1.0    1.0    0.0    1.0    0.0    1.0    0.0
ENSG00000000419    0.0    0.0    0.0    0.0    1.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    1.0    1.0    1.0    1.0    1.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    0.0    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    1.0    0.0    1.0    0.0    0.0    1.0    0.0    1.0
ENSG00000000971    0.0    1.0    1.0    0.0    1.0    0.0    1.0    0.0    0.0
ENSG00000001084    1.0    1.0    1.0    1.0    0.0    1.0    1.0    0.0    1.0
ENSG00000001167    0.0    1.0    0.0    0.0    1.0    1.0    1.0    1.0    1.0
ENSG00000001497    1.0    0.0    1.0    1.0    0.0    1.0    1.0    1.0    0.0
ENSG00000001617    0.0    1.0    0.0    1.0    0.0    0.0    0.0    0.0    1.0
ENSG00000001626    1.0    0.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0
ENSG00000001630    1.0    1.0

## Get the files returned by Data Cleanup, transpose and re-write for use in GP

In [19]:
# Directory the following cells read Data Cleanup results from; list it to
# see which files are currently present.
post_clean_dir = '../../Data_Cleanup_Pipeline/test/run_dir/results'
os.listdir(post_clean_dir)

['.DS_Store',
 'log_gene_prioritization_pipeline.yml',
 'spreadsheet_C_.G_ETL.tsv',
 'spreadsheet_C_.G_MAP.tsv',
 'spreadsheet_C_.G_UNMAPPED.tsv',
 'spreadsheet_C_.P.pearson.T_ETL.tsv']

In [26]:
# Load the post-cleanup pearson phenotype file, show it, then transpose it
# and save the result alongside the other generated phenotype files.
po_pheno_pearson_df = pd.read_csv(
    os.path.join(post_clean_dir, 'spreadsheet_C_.P.pearson.T_ETL.tsv'), sep='\t', index_col=0, header=0)
print('post cleanup:\n', po_pheno_pearson_df)
po_pheno_pearson_df = po_pheno_pearson_df.transpose()
# The target is a .tsv file, so write it tab-separated like every other file
# this notebook produces (to_csv defaults to commas).
po_pheno_pearson_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson_PC_from_T.tsv'),
                           sep='\t', index=True, header=True)
po_pheno_pearson_df

post cleanup:
          SNHGW     DSVZL     LMIBW     QGLJS     VDJAT     XLGSY     BHZNK  \
SVEIPFK    NaN       NaN       NaN       NaN  0.281307  0.363085       NaN   
SCKALDE    0.0       NaN  0.000000  0.000000  0.000000  0.563378       NaN   
VDRUXUI    NaN  0.000000  0.726155  0.000000  0.322662  0.432425  0.000000   
JPASLFW    0.0  0.516177  0.613761  0.470533  0.000000  0.000000  0.340099   
FMSPWML    NaN  0.292853  0.515163  0.000000  0.603831  0.000000  0.000000   
SMDMQRO    0.0  0.000000  0.631701  0.000000  0.662961       NaN  0.421955   
GBTTTOI    NaN  0.000000  0.452997  0.000000  0.282251  0.000000  0.492339   

            SDCGW     FNAJX  
SVEIPFK       NaN       NaN  
SCKALDE  0.618912  0.000000  
VDRUXUI  0.000000  0.000000  
JPASLFW       NaN  0.689224  
FMSPWML  0.586943  0.000000  
SMDMQRO  0.000000  0.757181  
GBTTTOI  0.000000  0.000000  


Unnamed: 0,SVEIPFK,SCKALDE,VDRUXUI,JPASLFW,FMSPWML,SMDMQRO,GBTTTOI
SNHGW,,0.0,,0.0,,0.0,
DSVZL,,,0.0,0.516177,0.292853,0.0,0.0
LMIBW,,0.0,0.726155,0.613761,0.515163,0.631701,0.452997
QGLJS,,0.0,0.0,0.470533,0.0,0.0,0.0
VDJAT,0.281307,0.0,0.322662,0.0,0.603831,0.662961,0.282251
XLGSY,0.363085,0.563378,0.432425,0.0,0.0,,0.0
BHZNK,,,0.0,0.340099,0.0,0.421955,0.492339
SDCGW,,0.618912,0.0,,0.586943,0.0,0.0
FNAJX,,0.0,0.0,0.689224,0.0,0.757181,0.0


In [23]:
# Re-read the post-cleanup pearson phenotype file (tab-separated, first
# column as row labels, first line as column labels) and display it.
etl_pearson_path = os.path.join(post_clean_dir, 'spreadsheet_C_.P.pearson.T_ETL.tsv')
po_pheno_pearson_df = pd.read_csv(etl_pearson_path, sep='\t', index_col=0, header=0)
po_pheno_pearson_df

Unnamed: 0,SNHGW,DSVZL,LMIBW,QGLJS,VDJAT,XLGSY,BHZNK,SDCGW,FNAJX
SVEIPFK,,,,,0.281307,0.363085,,,
SCKALDE,0.0,,0.0,0.0,0.0,0.563378,,0.618912,0.0
VDRUXUI,,0.0,0.726155,0.0,0.322662,0.432425,0.0,0.0,0.0
JPASLFW,0.0,0.516177,0.613761,0.470533,0.0,0.0,0.340099,,0.689224
FMSPWML,,0.292853,0.515163,0.0,0.603831,0.0,0.0,0.586943,0.0
SMDMQRO,0.0,0.0,0.631701,0.0,0.662961,,0.421955,0.0,0.757181
GBTTTOI,,0.0,0.452997,0.0,0.282251,0.0,0.492339,0.0,0.0


In [None]:
# Transpose the frame and save it next to the other generated phenotype files.
po_pheno_pearson_df = po_pheno_pearson_df.transpose()
# The target is a .tsv file, so write it tab-separated like the notebook's
# other outputs (to_csv defaults to commas).
po_pheno_pearson_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson_PC_from_T.tsv'),
                           sep='\t', index=True, header=True)

In [27]:
# List the Data Cleanup results directory again to see what it holds now.
post_clean_dir = '../../Data_Cleanup_Pipeline/test/run_dir/results'
os.listdir(post_clean_dir)

['.DS_Store', 'log_gene_prioritization_pipeline.yml']