# Spreadsheets .G.  genes X samples special case test data files generator
* phenotype file with one row having so many "NA" values that bootstrap sampling produces empty or nearly empty rows.
* spreadsheet files to aggravate the same

In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
#                                              Set output directory
# The spreadsheet and both phenotype variants are all written to one shared folder.
out_data_dir = pheno_pearson_data_dir = pheno_t_test_data_dir = '../data/GP_special_case/NA_80'

In [3]:
#                                              Define spreadsheet data
# Seed numpy's global generator before the first random draw so that a fresh
# "Restart Kernel -> Run All" regenerates the same np.random-based data each time.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

n_spreadsheet_rows = 10
n_spreadsheet_cols = 10
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

# Binary matrix: uniform draws below 0.5 become 0.0, every remaining (non-zero) value becomes 1.0.
spreadsheet_data = np.random.random(spreadsheet_shape)
spreadsheet_data[spreadsheet_data < 0.5] = 0.0
spreadsheet_data[spreadsheet_data != 0.0] = 1.0
spreadsheet_data

array([[ 0.,  0.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.],
       [ 1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.],
       [ 1.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.],
       [ 0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.]])

In [4]:
#                                              Define phenotype data
# Continuous (pearson) phenotype: uniform draws shifted down by 0.24, then every
# value that ends up below 0.26 is replaced by 0.0.
pearson_shifted = np.random.random(spreadsheet_shape) - 0.24
pheno_pearson_data = np.where(pearson_shifted < 0.26, 0.0, pearson_shifted)
print(' pearson data:\n', pheno_pearson_data)

# Binary (t_test) phenotype: draws below 0.5 map to 0.0, everything else to 1.0.
t_test_draws = np.random.random(spreadsheet_shape)
pheno_t_test_data = np.where(t_test_draws < 0.5, 0.0, 1.0)
print('\n t_test data:\n', pheno_t_test_data)

# Request one name of length 7 per spreadsheet row; these later label the phenotype rows.
drug_names = data_synth.get_rand_unique_name_list(name_length=7, n_names=n_spreadsheet_rows)
drug_names

 pearson data:
 [[ 0.34224992  0.67804299  0.          0.          0.          0.
   0.45465323  0.33003557  0.63063268  0.40359334]
 [ 0.          0.          0.          0.57356497  0.          0.
   0.32752679  0.49000592  0.52517691  0.60188106]
 [ 0.          0.          0.42774474  0.55369497  0.          0.28941135
   0.          0.          0.41051662  0.43556742]
 [ 0.54804132  0.          0.          0.42934368  0.          0.
   0.64139926  0.26346743  0.39275985  0.        ]
 [ 0.          0.29391454  0.28232303  0.53683982  0.          0.59504041
   0.          0.65704163  0.7339407   0.44149888]
 [ 0.          0.          0.6372794   0.34379558  0.          0.
   0.28105551  0.          0.          0.52075051]
 [ 0.262115    0.          0.          0.3155276   0.          0.          0.
   0.          0.          0.47020995]
 [ 0.67981726  0.          0.50088571  0.          0.3270369   0.69045738
   0.45666225  0.          0.          0.57471769]
 [ 0.28201243  0.       

['MTGFFJC',
 'OQAKXPB',
 'JVVYTZI',
 'EQAOCFK',
 'EPUMLUW',
 'QYLOQXZ',
 'IIRLLZM',
 'WWLIJVJ',
 'TYXWRHJ',
 'RIEXTAW']

In [5]:
#                                              Define spreadsheet Nan data
# Independent copy of the binary spreadsheet whose top-left entry is replaced by NaN.
some_nan_data = np.copy(spreadsheet_data)
some_nan_data[0, 0] = float('nan')
some_nan_data

array([[ nan,   0.,   1.,   1.,   1.,   0.,   1.,   0.,   0.,   1.],
       [  0.,   1.,   0.,   1.,   1.,   1.,   1.,   0.,   1.,   0.],
       [  0.,   0.,   1.,   0.,   1.,   1.,   0.,   1.,   0.,   0.],
       [  1.,   0.,   0.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   0.,   1.,   0.,   1.,   0.],
       [  1.,   1.,   0.,   1.,   1.,   0.,   1.,   1.,   0.,   0.],
       [  1.,   1.,   0.,   0.,   1.,   1.,   1.,   1.,   1.,   1.],
       [  1.,   1.,   1.,   1.,   1.,   0.,   1.,   0.,   1.,   1.],
       [  0.,   1.,   1.,   1.,   0.,   0.,   0.,   1.,   0.,   1.],
       [  0.,   1.,   1.,   0.,   1.,   0.,   1.,   0.,   0.,   0.]])

In [6]:
#                                              Get gene - ensembl names list for simulation
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
# Only the names list is used afterwards; the adjacency matrix is dropped right away.
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat

In [7]:
#                                              Get assorted gene names list for simulation
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
# The table is loaded only for its row labels (first column used as the index), then discarded.
sp_full_path = os.path.join(raw_data_dir, sp_file)
sp_4_gene_names_df = pd.read_csv(sp_full_path, sep='\t', header=0, index_col=0)
asorted_gene_names = [gene_name for gene_name in sp_4_gene_names_df.index]
del sp_4_gene_names_df

## Create bad data: phenotype data with 80% "NA" in one row
* and some bad data elsewhere

In [8]:
#                                              Set first row to 80% NA values the rest to up to 20% NA (pearson)
pheno_pearson_NA_data = pheno_pearson_data.copy()

# Row 0: blank out a randomly chosen 80% of the columns.
percent_Bad = 0.8
n_bad_first_row = int(np.round(percent_Bad * n_spreadsheet_cols))
select_NA = np.random.permutation(n_spreadsheet_cols)[:n_bad_first_row]
pheno_pearson_NA_data[0, select_NA] = np.nan

# Remaining rows: blank out a random number of columns, at most 20% of them.
percent_Bad_II = 0.2
for r in range(1, n_spreadsheet_rows):
    shuffled_cols = np.random.permutation(n_spreadsheet_cols)
    n_bad = int(np.round(np.random.random() * percent_Bad_II * n_spreadsheet_cols))
    select_NA = shuffled_cols[:n_bad]
    pheno_pearson_NA_data[r, select_NA] = np.nan

pheno_pearson_NA_data

array([[        nan,         nan,         nan,         nan,  0.        ,
                nan,         nan,         nan,         nan,  0.40359334],
       [ 0.        ,  0.        ,  0.        ,  0.57356497,         nan,
         0.        ,  0.32752679,  0.49000592,  0.52517691,  0.60188106],
       [ 0.        ,  0.        ,  0.42774474,  0.55369497,  0.        ,
         0.28941135,  0.        ,  0.        ,         nan,  0.43556742],
       [ 0.54804132,  0.        ,  0.        ,  0.42934368,  0.        ,
         0.        ,  0.64139926,         nan,  0.39275985,  0.        ],
       [ 0.        ,  0.29391454,  0.28232303,  0.53683982,  0.        ,
         0.59504041,  0.        ,         nan,  0.7339407 ,  0.44149888],
       [ 0.        ,  0.        ,  0.6372794 ,  0.34379558,         nan,
         0.        ,  0.28105551,  0.        ,  0.        ,         nan],
       [ 0.262115  ,  0.        ,  0.        ,  0.3155276 ,  0.        ,
         0.        ,  0.        ,  0.        

In [9]:
#                                              Set first row to 80% NA values the rest to up to 20% NA (t_test)
pheno_t_test_NA_data = pheno_t_test_data.copy()

# Row 0: blank out a randomly chosen 80% of the columns.
percent_Bad = 0.8
n_bad_first_row = int(np.round(percent_Bad * n_spreadsheet_cols))
select_NA = np.random.permutation(n_spreadsheet_cols)[:n_bad_first_row]
pheno_t_test_NA_data[0, select_NA] = np.nan

# Remaining rows: blank out a random number of columns, at most 20% of them.
percent_Bad_II = 0.2
for r in range(1, n_spreadsheet_rows):
    shuffled_cols = np.random.permutation(n_spreadsheet_cols)
    n_bad = int(np.round(np.random.random() * percent_Bad_II * n_spreadsheet_cols))
    select_NA = shuffled_cols[:n_bad]
    pheno_t_test_NA_data[r, select_NA] = np.nan

pheno_t_test_NA_data

array([[  0.,  nan,  nan,  nan,  nan,  nan,  nan,  nan,   1.,  nan],
       [  0.,   0.,   1.,   0.,   0.,   0.,   1.,   0.,   0.,  nan],
       [  0.,   0.,  nan,   0.,   1.,   0.,   1.,   1.,   0.,   1.],
       [  0.,   1.,   0.,   1.,  nan,   0.,   1.,   1.,   1.,  nan],
       [ nan,   1.,   1.,   0.,   0.,   0.,   1.,   1.,   1.,   1.],
       [  0.,   0.,   0.,   1.,   1.,   1.,   0.,  nan,   0.,   1.],
       [  0.,   0.,   1.,   0.,   1.,  nan,   1.,   0.,   1.,   0.],
       [ nan,   1.,   0.,   0.,   1.,  nan,   1.,   1.,   0.,   0.],
       [  0.,   1.,   0.,   1.,   0.,   0.,   0.,   1.,  nan,   1.],
       [  1.,  nan,   1.,   1.,   1.,   1.,   0.,   0.,   0.,   0.]])

In [12]:
#                                              Output a spreadsheet and phenotype with paired names
DO_WRITE_OUT = True
BASE_NAME = 'spreadsheet_C_'
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
rand_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_cols, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]

if DO_WRITE_OUT:
    # DataFrame.to_csv does not create missing folders, so make sure every target exists first.
    for target_dir in (out_data_dir, pheno_pearson_data_dir, pheno_t_test_data_dir):
        os.makedirs(target_dir, exist_ok=True)

# Genes x samples spreadsheet: gene names label the rows, rand_names label the columns.
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
if DO_WRITE_OUT:
    spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

# Map each gene-name row label to the drug name at the same position.
drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

# The phenotype files reuse the same column names; rows are relabeled with the drug names.
spreadsheet_df = spreadsheet_df.rename(index=drug_names_dict)

# Assign through a single .loc call: the former chained form `df[:][:] = ...` writes to a
# temporary object and does not update the frame when pandas Copy-on-Write is active.
spreadsheet_df.loc[:, :] = pheno_pearson_NA_data
if DO_WRITE_OUT:
    spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t',
                          index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

spreadsheet_df.loc[:, :] = pheno_t_test_NA_data
if DO_WRITE_OUT:
    spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t',
                          index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_C_.G.tsv 
                  LYNTN  RACUF  GJICK  NRWCQ  AEDNY  IWWPW  RGGPY  EANLT  \
ENSG00000000005    0.0    0.0    1.0    1.0    1.0    0.0    1.0    0.0   
ENSG00000000419    0.0    1.0    0.0    1.0    1.0    1.0    1.0    0.0   
ENSG00000000457    0.0    0.0    1.0    0.0    1.0    1.0    0.0    1.0   
ENSG00000000460    1.0    0.0    0.0    1.0    1.0    1.0    1.0    1.0   
ENSG00000000938    1.0    1.0    1.0    1.0    1.0    0.0    1.0    0.0   
ENSG00000000971    1.0    1.0    0.0    1.0    1.0    0.0    1.0    1.0   
ENSG00000001084    1.0    1.0    0.0    0.0    1.0    1.0    1.0    1.0   
ENSG00000001167    1.0    1.0    1.0    1.0    1.0    0.0    1.0    0.0   
ENSG00000001497    0.0    1.0    1.0    1.0    0.0    0.0    0.0    1.0   
ENSG00000001617    0.0    1.0    1.0    0.0    1.0    0.0    1.0    0.0   

                 REOIA  FHIEM  
ENSG00000000005    0.0    1.0  
ENSG00000000419    1.0    0.0  
ENSG00000000457    0.0    0.0  
ENSG00000000460    1.0 

In [16]:
# Read the pearson phenotype file back in for inspection. The path is built from the
# pearson output directory and BASE_NAME, matching the path used when the file was written.
pearson_name = os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv')
pheno_pearson_df = pd.read_csv(pearson_name, sep='\t', index_col=0, header=0)
pheno_pearson_df

Unnamed: 0,LYNTN,RACUF,GJICK,NRWCQ,AEDNY,IWWPW,RGGPY,EANLT,REOIA,FHIEM
MTGFFJC,,,,,0.0,,,,,0.403593
OQAKXPB,0.0,0.0,0.0,0.573565,,0.0,0.327527,0.490006,0.525177,0.601881
JVVYTZI,0.0,0.0,0.427745,0.553695,0.0,0.289411,0.0,0.0,,0.435567
EQAOCFK,0.548041,0.0,0.0,0.429344,0.0,0.0,0.641399,,0.39276,0.0
EPUMLUW,0.0,0.293915,0.282323,0.53684,0.0,0.59504,0.0,,0.733941,0.441499
QYLOQXZ,0.0,0.0,0.637279,0.343796,,0.0,0.281056,0.0,0.0,
IIRLLZM,0.262115,0.0,0.0,0.315528,0.0,0.0,0.0,0.0,0.0,0.47021
WWLIJVJ,0.679817,0.0,0.500886,0.0,0.327037,0.690457,,0.0,0.0,0.574718
TYXWRHJ,0.282012,0.0,0.375354,0.0,,0.753516,0.0,0.0,0.654905,0.0
RIEXTAW,0.0,0.377335,,0.293141,0.0,0.517649,0.736621,0.518037,0.630972,0.380292


In [17]:
# Show the first phenotype row (the one that received 80% NaN values). Positional lookup is
# used because the row labels come from data_synth.get_rand_unique_name_list and a
# hard-coded label may not exist after the notebook is re-run.
pheno_pearson_df.iloc[0]

LYNTN         NaN
RACUF         NaN
GJICK         NaN
NRWCQ         NaN
AEDNY    0.000000
IWWPW         NaN
RGGPY         NaN
EANLT         NaN
REOIA         NaN
FHIEM    0.403593
Name: MTGFFJC, dtype: float64