# Spreadsheets .G.  genes X samples test data files generator
* write a small data frame file for all thinkable cases of input spreadsheet data for Gene Prioritization
* write small data frames for each of those to use as in Gene Prioratization t-test and pearson
* naming conventions used to allow directory processing with dcp_test.py module

In [2]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [3]:
out_data_dir = '../data/gene_prioritization/spreadsheets_GP'
pheno_pearson_data_dir = '../data/gene_prioritization/phenotypes_pearson'
pheno_t_test_data_dir = '../data/gene_prioritization/phenotypes_t_test'

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [4]:
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

spreadsheet_data = np.random.random(spreadsheet_shape)
spreadsheet_data[spreadsheet_data < 0.5] = 0.0
spreadsheet_data[spreadsheet_data != 0.0] = 1.0
spreadsheet_data

array([[ 0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 0.,  1.,  1.,  0.,  0.]])

In [5]:
pheno_pearson_data = np.random.random(spreadsheet_shape)
pheno_pearson_data = pheno_pearson_data - 0.24
pheno_pearson_data[pheno_pearson_data < 0.26] = 0.0
print(' pearson data:\n', pheno_pearson_data)

pheno_t_test_data = np.random.random(spreadsheet_shape)
pheno_t_test_data[pheno_t_test_data < 0.5] = 0.0
pheno_t_test_data[pheno_t_test_data != 0.0] = 1.0
print('\n t_test data:\n', pheno_t_test_data)

drug_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_rows, name_length=7)
drug_names

 pearson data:
 [[ 0.51160277  0.          0.          0.          0.43598909]
 [ 0.          0.37204852  0.50897706  0.33792711  0.        ]
 [ 0.38213223  0.          0.35999772  0.58188271  0.40611118]
 [ 0.          0.          0.53085313  0.          0.43448512]
 [ 0.62642916  0.3122002   0.62636389  0.40624609  0.        ]]

 t_test data:
 [[ 0.  1.  0.  1.  1.]
 [ 1.  1.  0.  0.  1.]
 [ 1.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 0.  1.  1.  1.  0.]]


['ZQROWJH', 'FPOGJTV', 'ZHYFPNT', 'GPNZVNF', 'EVEBSWJ']

In [6]:
BASE_NAME = 'spreadsheet_DNE_'
DNE_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
spreadsheet_df = data_synth.get_rand_dataframe(
                    n_spreadsheet_rows, n_spreadsheet_cols, row_name_chars=5, col_name_chars=8)

spreadsheet_df.to_csv(DNE_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_DNE_.G.tsv 
        DRBZQELQ  PCHMTIWX  BCLGQBLD  DDSYCBQW  BJMLKKOS
TNCPU  0.057155  0.960947  0.004021  0.852355  0.939237
TBFLR  0.745729  0.437480  0.704217  0.215821  0.021234
OBLBA  0.125570  0.005251  0.074374  0.680869  0.990242
GVXMV  0.716452  0.403491  0.751326  0.875176  0.422258
PKOCV  0.372661  0.945583  0.289780  0.805036  0.882049

 spreadsheet_DNE_.P.pearson.tsv 
          DRBZQELQ  PCHMTIWX  BCLGQBLD  DDSYCBQW  BJMLKKOS
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_DNE_.P.t_test.tsv 
          DRBZQELQ  PCHMTIWX  BCLGQBLD  DDSYCBQW  BJMLKKOS
ZQROWJH       0.0       1.0       0.0       1.0       1.0
FPOGJTV       1.0       1.0       0.0       0.0       1.0
ZHYFPNT       1.0       1.0       0.0       0.0 

In [7]:
some_neg_data = spreadsheet_data.copy()
some_neg_data[0,0] = -1
some_neg_data

array([[-1.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  1.],
       [ 0.,  1.,  1.,  0.,  0.]])

In [8]:
some_nan_data = spreadsheet_data.copy()
some_nan_data[0,0] = np.nan
some_nan_data

array([[ nan,   0.,   1.,   1.,   0.],
       [  1.,   0.,   0.,   0.,   1.],
       [  0.,   0.,   1.,   1.,   0.],
       [  0.,   0.,   1.,   1.,   1.],
       [  0.,   1.,   1.,   0.,   0.]])

In [9]:
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat
raw_data_dir = '../../'

In [10]:
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_4_gene_names_df = pd.read_csv(os.path.join(raw_data_dir,sp_file),sep='\t',index_col=0,header=0)
asorted_gene_names = list(sp_4_gene_names_df.index)
del sp_4_gene_names_df

In [11]:
BASE_NAME = 'spreadsheet_A_'
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
rand_names = data_synth.get_rand_unique_name_list(n_names=5, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_A_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0    1.0    1.0    0.0
ENSG00000000419    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_A_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_A_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEBSWJ    0.0    1.0    1.0    1.0    0.0


In [12]:
BASE_NAME = 'spreadsheet_B_'
OK_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
gene_names_B = gene_names.copy()
gene_names_B[0] = asorted_gene_names[0]
gene_names_B[1] = asorted_gene_names[1]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names_B, columns=rand_names)
spreadsheet_df.to_csv(OK_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_B_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
AAK1               0.0    0.0    1.0    1.0    0.0
AATK               1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_B_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_B_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEBSWJ    0.0    1.0    1.0    1.0    0.0


In [13]:
BASE_NAME = 'spreadsheet_duplicate_cols_'
dup_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_cols_.G.tsv 
                  DAZNX  DVZAD  DVZAD  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0    1.0    1.0    0.0
ENSG00000000419    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_duplicate_cols_.P.pearson.tsv 
             DAZNX     DVZAD     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_duplicate_cols_.P.t_test.tsv 
          DAZNX  DVZAD  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEB

In [14]:
BASE_NAME = 'spreadsheet_duplicate_rows_'
dup_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_names)
spreadsheet_df.to_csv(dup_row_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_rows_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0    1.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_duplicate_rows_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
ZHYFPNT  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_duplicate_rows_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEB

In [15]:
BASE_NAME = 'spreadsheet_duplicates_rows_AND_cols_'
dup_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicates_rows_AND_cols_.G.tsv 
                  DAZNX  DVZAD  DVZAD  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0    1.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_duplicates_rows_AND_cols_.P.pearson.tsv 
             DAZNX     DVZAD     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
ZHYFPNT  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_duplicates_rows_AND_cols_.P.t_test.tsv 
          DAZNX  DVZAD  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0   

In [16]:
BASE_NAME = 'spreadsheet_EMPTY_cols_'
empty_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=None, header=True)
print(BASE_NAME + '.G.tsv', '\n')
spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_cols_.G.tsv 



Unnamed: 0,DAZNX,RTFQJ,DVZAD,BOKOZ,XYREQ


In [17]:
BASE_NAME = 'spreadsheet_EMPTY_rows_'
empty_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, index=gene_names)
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=None)
print(BASE_NAME + '.G.tsv', '\n')

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_rows_.G.tsv 



ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460
ENSG00000000938


In [18]:
BASE_NAME = 'spreadsheet_NA_cols_'
NA_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

na_col_names = rand_names.copy()
na_col_names[2] = 'NA'
na_row_names = gene_names.copy()
na_row_names[2] = 'NA'
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_cols_.G.tsv 
                  DAZNX  RTFQJ   NA  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0  1.0    1.0    0.0
ENSG00000000419    1.0    0.0  0.0    0.0    1.0
ENSG00000000457    0.0    0.0  1.0    1.0    0.0
ENSG00000000460    0.0    0.0  1.0    1.0    1.0
ENSG00000000938    0.0    1.0  1.0    0.0    0.0

 spreadsheet_NA_cols_.P.pearson.tsv 
             DAZNX     RTFQJ        NA     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_NA_cols_.P.t_test.tsv 
          DAZNX  RTFQJ   NA  BOKOZ  XYREQ
ZQROWJH    0.0    1.0  0.0    1.0    1.0
FPOGJTV    1.0    1.0  0.0    0.0    1.0
ZHYFPNT    1.0    1.0  0.0    0.0    0.0
GPNZVNF    0.0    0.0  1.0    1.0    1.0
EVEBSWJ    0.0    1.0  1.0    1.0    0.0


In [19]:
BASE_NAME = 'spreadsheet_NA_rows_'
NA_rows_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=rand_names)
spreadsheet_df.to_csv(NA_rows_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_rows_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0    1.0    1.0    0.0
ENSG00000000419    1.0    0.0    0.0    0.0    1.0
NA                 0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_NA_rows_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
NA       0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_NA_rows_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH    0.0    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
NA         1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEBSWJ    0.0    1.0    

In [20]:
BASE_NAME = 'spreadsheet_NAs_rows_and_cols_'
NA_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NAs_rows_and_cols_.G.tsv 
                  DAZNX  RTFQJ   NA  BOKOZ  XYREQ
ENSG00000000005    0.0    0.0  1.0    1.0    0.0
ENSG00000000419    1.0    0.0  0.0    0.0    1.0
NA                 0.0    0.0  1.0    1.0    0.0
ENSG00000000460    0.0    0.0  1.0    1.0    1.0
ENSG00000000938    0.0    1.0  1.0    0.0    0.0

 spreadsheet_NAs_rows_and_cols_.P.pearson.tsv 
             DAZNX     RTFQJ        NA     BOKOZ     XYREQ
ZQROWJH  0.511603  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
NA       0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_NAs_rows_and_cols_.P.t_test.tsv 
          DAZNX  RTFQJ   NA  BOKOZ  XYREQ
ZQROWJH    0.0    1.0  0.0    1.0    1.0
FPOGJTV    1.0    1.0  0.0    0.0    1.0
NA         1.0    1.0  0.0    0.0    0.0
GPNZVNF    0.0    0.0  1.0    1.0    1.0
EVEBSWJ    0.0   

In [21]:
BASE_NAME = 'spreadsheet_NAN_data_'
NAN_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_nan_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NAN_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}
pheno_NAN_data = pheno_pearson_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_NAN_data = pheno_t_test_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)


spreadsheet_NAN_data_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005    NaN    0.0    1.0    1.0    0.0
ENSG00000000419    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_NAN_data_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH       NaN  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_NAN_data_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH    NaN    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEBSWJ    0.0    1.0 

In [22]:
BASE_NAME = 'spreadsheet_Negative_data_'
NEG_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_neg_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NEG_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

pheno_neg = pheno_pearson_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_neg = pheno_t_test_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)


NEG_2_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.2.G.tsv')
spreadsheet_df.to_csv(NEG_2_spreadsheet_name, sep='\t', index=True, header=True)

pheno_obtuse = pheno_t_test_data.copy()
pheno_obtuse[0,0] = 2
spreadsheet_df[:][:] = pheno_obtuse

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.2.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.2.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_Negative_data_.G.tsv 
                  DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005   -1.0    0.0    1.0    1.0    0.0
ENSG00000000419    1.0    0.0    0.0    0.0    1.0
ENSG00000000457    0.0    0.0    1.0    1.0    0.0
ENSG00000000460    0.0    0.0    1.0    1.0    1.0
ENSG00000000938    0.0    1.0    1.0    0.0    0.0

 spreadsheet_Negative_data_.P.pearson.tsv 
             DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH -1.000000  0.000000  0.000000  0.000000  0.435989
FPOGJTV  0.000000  0.372049  0.508977  0.337927  0.000000
ZHYFPNT  0.382132  0.000000  0.359998  0.581883  0.406111
GPNZVNF  0.000000  0.000000  0.530853  0.000000  0.434485
EVEBSWJ  0.626429  0.312200  0.626364  0.406246  0.000000

 spreadsheet_Negative_data_.P.t_test.tsv 
          DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH   -1.0    1.0    0.0    1.0    1.0
FPOGJTV    1.0    1.0    0.0    0.0    1.0
ZHYFPNT    1.0    1.0    0.0    0.0    0.0
GPNZVNF    0.0    0.0    1.0    1.0    1.0
EVEBSWJ

In [23]:
BASE_NAME = 'spreadsheet_ALPHA_data_'
alpha_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.loc[gene_names[0], rand_names[0]] = 'abc'

spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_ALPHA_data_.G.tsv 
                 DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ENSG00000000005   abc    0.0    1.0    1.0    0.0
ENSG00000000419     1    0.0    0.0    0.0    1.0
ENSG00000000457     0    0.0    1.0    1.0    0.0
ENSG00000000460     0    0.0    1.0    1.0    1.0
ENSG00000000938     0    1.0    1.0    0.0    0.0

 spreadsheet_ALPHA_data_.P.pearson.tsv 
         DAZNX     RTFQJ     DVZAD     BOKOZ     XYREQ
ZQROWJH   abc  0.000000  0.000000  0.000000  0.435989
FPOGJTV     1  0.372049  0.508977  0.337927  0.000000
ZHYFPNT     0  0.000000  0.359998  0.581883  0.406111
GPNZVNF     0  0.000000  0.530853  0.000000  0.434485
EVEBSWJ     0  0.312200  0.626364  0.406246  0.000000

 spreadsheet_ALPHA_data_.P.t_test.tsv 
         DAZNX  RTFQJ  DVZAD  BOKOZ  XYREQ
ZQROWJH   abc    1.0    0.0    1.0    1.0
FPOGJTV     1    1.0    0.0    0.0    1.0
ZHYFPNT     0    1.0    0.0    0.0    0.0
GPNZVNF     0    0.0    1.0    1.0    1.0
EVEBSWJ     0    1.0    1.0    1.0    0.0
