# Spreadsheets .G.  genes X samples test data files generator
* write a small data frame file for all thinkable cases of input spreadsheet data for Samples Clustering
* write small data frames for each of those to use as in Gene Prioratization t-test and pearson
* naming conventions used to allow directory processing with dcp_test.py module


In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
out_data_dir = './spreadsheets.G.GP.etc'
pheno_pearson_data_dir = './phenotypes_pearson.P.GP.etc'
pheno_t_test_data_dir = './phenotypes_t_test.P.GP.etc'

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [3]:
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

spreadsheet_data = np.random.random(spreadsheet_shape)
spreadsheet_data[spreadsheet_data < 0.5] = 0.0
spreadsheet_data[spreadsheet_data != 0.0] = 1.0
spreadsheet_data

array([[ 0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.]])

In [4]:
pheno_pearson_data = np.random.random(spreadsheet_shape)
pheno_pearson_data = pheno_pearson_data - 0.24
pheno_pearson_data[pheno_pearson_data < 0.26] = 0.0
print(' pearson data:\n', pheno_pearson_data)

pheno_t_test_data = np.random.random(spreadsheet_shape)
pheno_t_test_data[pheno_t_test_data < 0.5] = 0.0
pheno_t_test_data[pheno_t_test_data != 0.0] = 1.0
print('\n t_test data:\n', pheno_t_test_data)

drug_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_rows, name_length=7)
drug_names

 pearson data:
 [[ 0.30635999  0.39294443  0.59708679  0.          0.        ]
 [ 0.          0.          0.42259654  0.57593148  0.42586068]
 [ 0.68140819  0.46674142  0.49987416  0.          0.        ]
 [ 0.36085201  0.3334128   0.          0.          0.65430327]
 [ 0.          0.          0.51722099  0.35458625  0.71300165]]

 t_test data:
 [[ 0.  1.  1.  1.  0.]
 [ 1.  1.  1.  1.  0.]
 [ 0.  1.  1.  0.  1.]
 [ 0.  1.  1.  1.  1.]
 [ 1.  1.  0.  0.  1.]]


['SIQZCRH', 'NKJPYEX', 'IKLIYFV', 'HYQYDWG', 'SRZBDFD']

In [5]:
BASE_NAME = 'spreadsheet_DNE_'
DNE_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
spreadsheet_df = data_synth.get_rand_dataframe(
                    n_spreadsheet_rows, n_spreadsheet_cols, row_name_chars=5, col_name_chars=8)

spreadsheet_df.to_csv(DNE_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_DNE_.G.tsv 
        SCAOHZUQ  MYVHVWNH  UZZXRUEX  ZCPBAKFS  HGVZYJUR
AVGOD  0.704764  0.854563  0.150846  0.339013  0.839310
NICTB  0.782305  0.016950  0.841749  0.977263  0.227105
TKGZR  0.507466  0.005960  0.162253  0.506260  0.379391
NEDNI  0.655496  0.435101  0.886430  0.783672  0.603121
QFYTM  0.519511  0.970069  0.420482  0.360855  0.343659

 spreadsheet_DNE_.P.pearson.tsv 
          SCAOHZUQ  MYVHVWNH  UZZXRUEX  ZCPBAKFS  HGVZYJUR
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_DNE_.P.t_test.tsv 
          SCAOHZUQ  MYVHVWNH  UZZXRUEX  ZCPBAKFS  HGVZYJUR
SIQZCRH       0.0       1.0       1.0       1.0       0.0
NKJPYEX       1.0       1.0       1.0       1.0       0.0
IKLIYFV       0.0       1.0       1.0       0.0 

In [6]:
some_neg_data = spreadsheet_data.copy()
some_neg_data[0,0] = -1
some_neg_data

array([[-1.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  1.,  0.],
       [ 1.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.]])

In [7]:
some_nan_data = spreadsheet_data.copy()
some_nan_data[0,0] = np.nan
some_nan_data

array([[ nan,   1.,   0.,   0.,   0.],
       [  1.,   0.,   0.,   1.,   0.],
       [  1.,   0.,   0.,   1.,   0.],
       [  1.,   1.,   1.,   1.,   0.],
       [  0.,   0.,   1.,   0.,   1.]])

In [8]:
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat
raw_data_dir = '../../'

In [9]:
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_4_gene_names_df = pd.read_csv(os.path.join(raw_data_dir,sp_file),sep='\t',index_col=0,header=0)
asorted_gene_names = list(sp_4_gene_names_df.index)
del sp_4_gene_names_df

In [10]:
BASE_NAME = 'spreadsheet_A_'
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
rand_names = data_synth.get_rand_unique_name_list(n_names=5, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_A_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005    0.0    1.0    0.0    0.0    0.0
ENSG00000000419    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_A_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_A_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZBDFD    1.0    1.0    0.0    0.0    1.0


In [11]:
BASE_NAME = 'spreadsheet_B_'
OK_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
gene_names_B = gene_names.copy()
gene_names_B[0] = asorted_gene_names[0]
gene_names_B[1] = asorted_gene_names[1]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names_B, columns=rand_names)
spreadsheet_df.to_csv(OK_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_B_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
AAK1               0.0    1.0    0.0    0.0    0.0
AATK               1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_B_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_B_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZBDFD    1.0    1.0    0.0    0.0    1.0


In [12]:
BASE_NAME = 'spreadsheet_duplicate_cols_'
dup_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_cols_.G.tsv 
                  CFRZG  WRNQP  WRNQP  QRXCG  UGYRV
ENSG00000000005    0.0    1.0    0.0    0.0    0.0
ENSG00000000419    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_duplicate_cols_.P.pearson.tsv 
             CFRZG     WRNQP     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_duplicate_cols_.P.t_test.tsv 
          CFRZG  WRNQP  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZB

In [13]:
BASE_NAME = 'spreadsheet_duplicate_rows_'
dup_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_names)
spreadsheet_df.to_csv(dup_row_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_rows_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005    0.0    1.0    0.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_duplicate_rows_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
IKLIYFV  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_duplicate_rows_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
IKLIYFV    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZB

In [14]:
BASE_NAME = 'spreadsheet_duplicate_rows_AND_cols_'
dup_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_rows_AND_cols_.G.tsv 
                  CFRZG  WRNQP  WRNQP  QRXCG  UGYRV
ENSG00000000005    0.0    1.0    0.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_duplicate_rows_AND_cols_.P.pearson.tsv 
             CFRZG     WRNQP     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
IKLIYFV  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_duplicate_rows_AND_cols_.P.t_test.tsv 
          CFRZG  WRNQP  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
IKLIYFV    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.

In [15]:
BASE_NAME = 'spreadsheet_EMPTY_cols_'
empty_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=None, header=True)
print(BASE_NAME + '.G.tsv', '\n')
spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_cols_.G.tsv 



Unnamed: 0,CFRZG,VLNHU,WRNQP,QRXCG,UGYRV


In [16]:
BASE_NAME = 'spreadsheet_EMPTY_rows_'
empty_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, index=gene_names)
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=None)
print(BASE_NAME + '.G.tsv', '\n')

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_rows_.G.tsv 



ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460
ENSG00000000938


In [17]:
BASE_NAME = 'spreadsheet_NA_cols_'
NA_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

na_col_names = rand_names.copy()
na_col_names[2] = 'NA'
na_row_names = gene_names.copy()
na_row_names[2] = 'NA'
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_cols_.G.tsv 
                  CFRZG  VLNHU   NA  QRXCG  UGYRV
ENSG00000000005    0.0    1.0  0.0    0.0    0.0
ENSG00000000419    1.0    0.0  0.0    1.0    0.0
ENSG00000000457    1.0    0.0  0.0    1.0    0.0
ENSG00000000460    1.0    1.0  1.0    1.0    0.0
ENSG00000000938    0.0    0.0  1.0    0.0    1.0

 spreadsheet_NA_cols_.P.pearson.tsv 
             CFRZG     VLNHU        NA     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_NA_cols_.P.t_test.tsv 
          CFRZG  VLNHU   NA  QRXCG  UGYRV
SIQZCRH    0.0    1.0  1.0    1.0    0.0
NKJPYEX    1.0    1.0  1.0    1.0    0.0
IKLIYFV    0.0    1.0  1.0    0.0    1.0
HYQYDWG    0.0    1.0  1.0    1.0    1.0
SRZBDFD    1.0    1.0  0.0    0.0    1.0


In [18]:
BASE_NAME = 'spreadsheet_NA_rows_'
NA_rows_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=rand_names)
spreadsheet_df.to_csv(NA_rows_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_rows_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005    0.0    1.0    0.0    0.0    0.0
ENSG00000000419    1.0    0.0    0.0    1.0    0.0
NA                 1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_NA_rows_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
NA       0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_NA_rows_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH    0.0    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
NA         0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZBDFD    1.0    1.0    

In [19]:
BASE_NAME = 'spreadsheet_NA_rows_and_cols_'
NA_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_rows_and_cols_.G.tsv 
                  CFRZG  VLNHU   NA  QRXCG  UGYRV
ENSG00000000005    0.0    1.0  0.0    0.0    0.0
ENSG00000000419    1.0    0.0  0.0    1.0    0.0
NA                 1.0    0.0  0.0    1.0    0.0
ENSG00000000460    1.0    1.0  1.0    1.0    0.0
ENSG00000000938    0.0    0.0  1.0    0.0    1.0

 spreadsheet_NA_rows_and_cols_.P.pearson.tsv 
             CFRZG     VLNHU        NA     QRXCG     UGYRV
SIQZCRH  0.306360  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
NA       0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_NA_rows_and_cols_.P.t_test.tsv 
          CFRZG  VLNHU   NA  QRXCG  UGYRV
SIQZCRH    0.0    1.0  1.0    1.0    0.0
NKJPYEX    1.0    1.0  1.0    1.0    0.0
NA         0.0    1.0  1.0    0.0    1.0
HYQYDWG    0.0    1.0  1.0    1.0    1.0
SRZBDFD    1.0    1.

In [20]:
BASE_NAME = 'spreadsheet_NAN_data_'
NAN_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_nan_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NAN_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}
pheno_NAN_data = pheno_pearson_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_NAN_data = pheno_t_test_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)


spreadsheet_NAN_data_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005    NaN    1.0    0.0    0.0    0.0
ENSG00000000419    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_NAN_data_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH       NaN  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_NAN_data_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH    NaN    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZBDFD    1.0    1.0 

In [21]:
BASE_NAME = 'spreadsheet_Negative_data_'
NEG_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_neg_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NEG_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

pheno_neg = pheno_pearson_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_neg = pheno_t_test_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

pheno_obtuse = pheno_t_test_data.copy()
pheno_obtuse[0,0] = 2
spreadsheet_df[:][:] = pheno_obtuse

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.2.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.2.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_Negative_data_.G.tsv 
                  CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005   -1.0    1.0    0.0    0.0    0.0
ENSG00000000419    1.0    0.0    0.0    1.0    0.0
ENSG00000000457    1.0    0.0    0.0    1.0    0.0
ENSG00000000460    1.0    1.0    1.0    1.0    0.0
ENSG00000000938    0.0    0.0    1.0    0.0    1.0

 spreadsheet_Negative_data_.P.pearson.tsv 
             CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH -1.000000  0.392944  0.597087  0.000000  0.000000
NKJPYEX  0.000000  0.000000  0.422597  0.575931  0.425861
IKLIYFV  0.681408  0.466741  0.499874  0.000000  0.000000
HYQYDWG  0.360852  0.333413  0.000000  0.000000  0.654303
SRZBDFD  0.000000  0.000000  0.517221  0.354586  0.713002

 spreadsheet_Negative_data_.P.t_test.tsv 
          CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH   -1.0    1.0    1.0    1.0    0.0
NKJPYEX    1.0    1.0    1.0    1.0    0.0
IKLIYFV    0.0    1.0    1.0    0.0    1.0
HYQYDWG    0.0    1.0    1.0    1.0    1.0
SRZBDFD

In [22]:
BASE_NAME = 'spreadsheet_ALPHA_data_'
alpha_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.loc[gene_names[0], rand_names[0]] = 'abc'

spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_ALPHA_data_.G.tsv 
                 CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
ENSG00000000005   abc    1.0    0.0    0.0    0.0
ENSG00000000419     1    0.0    0.0    1.0    0.0
ENSG00000000457     1    0.0    0.0    1.0    0.0
ENSG00000000460     1    1.0    1.0    1.0    0.0
ENSG00000000938     0    0.0    1.0    0.0    1.0

 spreadsheet_ALPHA_data_.P.pearson.tsv 
         CFRZG     VLNHU     WRNQP     QRXCG     UGYRV
SIQZCRH   abc  0.392944  0.597087  0.000000  0.000000
NKJPYEX     1  0.000000  0.422597  0.575931  0.425861
IKLIYFV     1  0.466741  0.499874  0.000000  0.000000
HYQYDWG     1  0.333413  0.000000  0.000000  0.654303
SRZBDFD     0  0.000000  0.517221  0.354586  0.713002

 spreadsheet_ALPHA_data_.P.t_test.tsv 
         CFRZG  VLNHU  WRNQP  QRXCG  UGYRV
SIQZCRH   abc    1.0    1.0    1.0    0.0
NKJPYEX     1    1.0    1.0    1.0    0.0
IKLIYFV     1    1.0    1.0    0.0    1.0
HYQYDWG     1    1.0    1.0    1.0    1.0
SRZBDFD     0    1.0    0.0    0.0    1.0
