# Spreadsheets .G.  genes X samples test data files generator
* write a small data frame file for all thinkable cases of input spreadsheet data for Samples Clustering
* write small data frames for each of those to use as in Gene Prioratization t-test and pearson
* naming conventions used to allow directory processing with dcp_test.py module


In [11]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [12]:
out_data_dir = './spreadsheets.G.etc'
pheno_pearson_data_dir = './phenotypes_pearson.P.etc'
pheno_t_test_data_dir = './phenotypes_t_test.P.etc'

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [13]:
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

spreadsheet_data = np.random.random(spreadsheet_shape)
spreadsheet_data[spreadsheet_data < 0.5] = 0.0
spreadsheet_data[spreadsheet_data != 0.0] = 1.0
spreadsheet_data

array([[ 1.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.,  1.],
       [ 1.,  1.,  0.,  0.,  0.]])

In [14]:
pheno_pearson_data = np.random.random(spreadsheet_shape)
pheno_pearson_data = pheno_pearson_data - 0.24
pheno_pearson_data[pheno_pearson_data < 0.26] = 0.0
print(' pearson data:\n', pheno_pearson_data)

pheno_t_test_data = np.random.random(spreadsheet_shape)
pheno_t_test_data[pheno_t_test_data < 0.5] = 0.0
pheno_t_test_data[pheno_t_test_data != 0.0] = 1.0
print('\n t_test data:\n', pheno_t_test_data)

drug_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_rows, name_length=7)
drug_names

 pearson data:
 [[ 0.49980498  0.          0.          0.          0.31744934]
 [ 0.51935288  0.          0.46879014  0.          0.        ]
 [ 0.49608264  0.67194818  0.          0.          0.4767176 ]
 [ 0.52969083  0.56521641  0.75311023  0.51331935  0.        ]
 [ 0.          0.56587829  0.74051295  0.56616055  0.66029975]]

 t_test data:
 [[ 1.  0.  1.  1.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  1.  1.  1.  1.]
 [ 0.  0.  1.  1.  1.]]


['SWLXSSS', 'TJOJYEU', 'XBMQOMR', 'ZQQYKOO', 'LLHLQYY']

In [15]:
BASE_NAME = 'spreadsheet_DNE_'
DNE_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
spreadsheet_df = data_synth.get_rand_dataframe(
                    n_spreadsheet_rows, n_spreadsheet_cols, row_name_chars=5, col_name_chars=8)

spreadsheet_df.to_csv(DNE_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_DNE_.G.tsv 
        TPSUSVNT  LVTHVKWJ  OTUQSCHA  SUPBKEEW  SLKMNWVB
UGKLP  0.279451  0.150513  0.990819  0.449800  0.054732
ZVOUL  0.033592  0.198001  0.656441  0.563804  0.789581
RBAKY  0.936109  0.642663  0.281792  0.427561  0.055155
MXKVB  0.184696  0.316675  0.674886  0.548009  0.505123
PISZQ  0.269313  0.690633  0.425534  0.710203  0.427368

 spreadsheet_DNE_.P.pearson.tsv 
          TPSUSVNT  LVTHVKWJ  OTUQSCHA  SUPBKEEW  SLKMNWVB
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_DNE_.P.t_test.tsv 
          TPSUSVNT  LVTHVKWJ  OTUQSCHA  SUPBKEEW  SLKMNWVB
SWLXSSS       1.0       0.0       1.0       1.0       0.0
TJOJYEU       0.0       0.0       0.0       0.0       0.0
XBMQOMR       0.0       0.0       1.0       0.0 

In [16]:
some_neg_data = spreadsheet_data.copy()
some_neg_data[0,0] = -1
some_neg_data

array([[-1.,  0.,  1.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.,  1.],
       [ 1.,  1.,  0.,  0.,  0.]])

In [17]:
some_nan_data = spreadsheet_data.copy()
some_nan_data[0,0] = np.nan
some_nan_data

array([[ nan,   0.,   1.,   0.,   1.],
       [  0.,   1.,   0.,   1.,   1.],
       [  0.,   0.,   0.,   0.,   1.],
       [  1.,   0.,   0.,   1.,   1.],
       [  1.,   1.,   0.,   0.,   0.]])

In [18]:
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat
raw_data_dir = '../../'

In [10]:
raw_data_dir = '../../../BigDataTank/pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_4_gene_names_df = pd.read_csv(os.path.join(raw_data_dir,sp_file),sep='\t',index_col=0,header=0)
asorted_gene_names = list(sp_4_gene_names_df.index)
del sp_4_gene_names_df

In [19]:
BASE_NAME = 'spreadsheet_A_'
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
rand_names = data_synth.get_rand_unique_name_list(n_names=5, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_A_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0    1.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_A_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_A_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHLQYY    0.0    0.0    1.0    1.0    1.0


In [20]:
BASE_NAME = 'spreadsheet_B_'
OK_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')
gene_names_B = gene_names.copy()
gene_names_B[0] = asorted_gene_names[0]
gene_names_B[1] = asorted_gene_names[1]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names_B, columns=rand_names)
spreadsheet_df.to_csv(OK_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_B_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
AAK1               1.0    0.0    1.0    0.0    1.0
AATK               0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_B_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_B_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHLQYY    0.0    0.0    1.0    1.0    1.0


In [21]:
BASE_NAME = 'spreadsheet_duplicate_cols_'
dup_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_cols_.G.tsv 
                  LIWLX  OZDEA  OZDEA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0    1.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_duplicate_cols_.P.pearson.tsv 
             LIWLX     OZDEA     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_duplicate_cols_.P.t_test.tsv 
          LIWLX  OZDEA  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHL

In [22]:
BASE_NAME = 'spreadsheet_duplicate_rows_'
dup_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_names)
spreadsheet_df.to_csv(dup_row_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_rows_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0    1.0    0.0    1.0
ENSG00000000457    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_duplicate_rows_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
XBMQOMR  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_duplicate_rows_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
XBMQOMR    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHL

In [23]:
BASE_NAME = 'spreadsheet_duplicate_rows_AND_cols_'
dup_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_duplicate_rows_AND_cols_.G.tsv 
                  LIWLX  OZDEA  OZDEA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0    1.0    0.0    1.0
ENSG00000000457    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_duplicate_rows_AND_cols_.P.pearson.tsv 
             LIWLX     OZDEA     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
XBMQOMR  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_duplicate_rows_AND_cols_.P.t_test.tsv 
          LIWLX  OZDEA  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
XBMQOMR    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.

In [24]:
BASE_NAME = 'spreadsheet_EMPTY_cols_'
empty_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=None, header=True)
print(BASE_NAME + '.G.tsv', '\n')
spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_cols_.G.tsv 



Unnamed: 0,LIWLX,WNPQW,OZDEA,PTQJC,GYJTZ


In [25]:
BASE_NAME = 'spreadsheet_EMPTY_rows_'
empty_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(data=None, index=gene_names)
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=None)
print(BASE_NAME + '.G.tsv', '\n')

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_rows_.G.tsv 



ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460
ENSG00000000938


In [26]:
BASE_NAME = 'spreadsheet_NA_cols_'
NA_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

na_col_names = rand_names.copy()
na_col_names[2] = 'NA'
na_row_names = gene_names.copy()
na_row_names[2] = 'NA'
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_cols_.G.tsv 
                  LIWLX  WNPQW   NA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0  1.0    0.0    1.0
ENSG00000000419    0.0    1.0  0.0    1.0    1.0
ENSG00000000457    0.0    0.0  0.0    0.0    1.0
ENSG00000000460    1.0    0.0  0.0    1.0    1.0
ENSG00000000938    1.0    1.0  0.0    0.0    0.0

 spreadsheet_NA_cols_.P.pearson.tsv 
             LIWLX     WNPQW        NA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_NA_cols_.P.t_test.tsv 
          LIWLX  WNPQW   NA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0  1.0    1.0    0.0
TJOJYEU    0.0    0.0  0.0    0.0    0.0
XBMQOMR    0.0    0.0  1.0    0.0    0.0
ZQQYKOO    0.0    1.0  1.0    1.0    1.0
LLHLQYY    0.0    0.0  1.0    1.0    1.0


In [27]:
BASE_NAME = 'spreadsheet_NA_rows_'
NA_rows_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=rand_names)
spreadsheet_df.to_csv(NA_rows_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_rows_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0    1.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
NA                 0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_NA_rows_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
NA       0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_NA_rows_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
NA         0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHLQYY    0.0    0.0    

In [28]:
BASE_NAME = 'spreadsheet_NA_rows_and_cols_'
NA_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_row_and_col_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
row_names_here = list(spreadsheet_df.index.values)
row_na_dict = {row_names_here[2]: 'NA'}
spreadsheet_df.rename(index=row_na_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_NA_rows_and_cols_.G.tsv 
                  LIWLX  WNPQW   NA  PTQJC  GYJTZ
ENSG00000000005    1.0    0.0  1.0    0.0    1.0
ENSG00000000419    0.0    1.0  0.0    1.0    1.0
NA                 0.0    0.0  0.0    0.0    1.0
ENSG00000000460    1.0    0.0  0.0    1.0    1.0
ENSG00000000938    1.0    1.0  0.0    0.0    0.0

 spreadsheet_NA_rows_and_cols_.P.pearson.tsv 
             LIWLX     WNPQW        NA     PTQJC     GYJTZ
SWLXSSS  0.499805  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
NA       0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_NA_rows_and_cols_.P.t_test.tsv 
          LIWLX  WNPQW   NA  PTQJC  GYJTZ
SWLXSSS    1.0    0.0  1.0    1.0    0.0
TJOJYEU    0.0    0.0  0.0    0.0    0.0
NA         0.0    0.0  1.0    0.0    0.0
ZQQYKOO    0.0    1.0  1.0    1.0    1.0
LLHLQYY    0.0    0.

In [29]:
BASE_NAME = 'spreadsheet_NAN_data_'
NAN_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_nan_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NAN_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}
pheno_NAN_data = pheno_pearson_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_NAN_data = pheno_t_test_data.copy()
pheno_NAN_data[0,0] = np.nan
spreadsheet_df[:][:] = pheno_NAN_data

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)


spreadsheet_NAN_data_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005    NaN    0.0    1.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_NAN_data_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS       NaN  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_NAN_data_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS    NaN    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHLQYY    0.0    0.0 

In [30]:
BASE_NAME = 'spreadsheet_Negative_data_'
NEG_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(some_neg_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NEG_spreadsheet_name, sep='\t', index=True, header=True)

print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

pheno_neg = pheno_pearson_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)

pheno_neg = pheno_t_test_data.copy()
pheno_neg[0,0] = -1
spreadsheet_df[:][:] = pheno_neg

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

pheno_obtuse = pheno_t_test_data.copy()
pheno_obtuse[0,0] = 2
spreadsheet_df[:][:] = pheno_obtuse

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.2.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.2.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_Negative_data_.G.tsv 
                  LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005   -1.0    0.0    1.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    0.0    1.0    1.0
ENSG00000000938    1.0    1.0    0.0    0.0    0.0

 spreadsheet_Negative_data_.P.pearson.tsv 
             LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS -1.000000  0.000000  0.000000  0.000000  0.317449
TJOJYEU  0.519353  0.000000  0.468790  0.000000  0.000000
XBMQOMR  0.496083  0.671948  0.000000  0.000000  0.476718
ZQQYKOO  0.529691  0.565216  0.753110  0.513319  0.000000
LLHLQYY  0.000000  0.565878  0.740513  0.566161  0.660300

 spreadsheet_Negative_data_.P.t_test.tsv 
          LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS   -1.0    0.0    1.0    1.0    0.0
TJOJYEU    0.0    0.0    0.0    0.0    0.0
XBMQOMR    0.0    0.0    1.0    0.0    0.0
ZQQYKOO    0.0    1.0    1.0    1.0    1.0
LLHLQYY

In [31]:
BASE_NAME = 'spreadsheet_ALPHA_data_'
alpha_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.loc[gene_names[0], rand_names[0]] = 'abc'

spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.tsv', '\n', spreadsheet_df)

drug_names_dict = {spreadsheet_df.index.values[k]: drug_names[k] for k in range(len(drug_names))}

spreadsheet_df.rename(index=drug_names_dict, inplace=True)
spreadsheet_df[:][:] = pheno_pearson_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_pearson_data_dir, BASE_NAME + '.P.pearson.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.pearson.tsv', '\n', spreadsheet_df)
spreadsheet_df[:][:] = pheno_t_test_data
spreadsheet_df.loc[drug_names[0],rand_names[0]] = 'abc'

spreadsheet_df.to_csv(os.path.join(pheno_t_test_data_dir, BASE_NAME + '.P.t_test.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.t_test.tsv', '\n', spreadsheet_df)

spreadsheet_ALPHA_data_.G.tsv 
                 LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
ENSG00000000005   abc    0.0    1.0    0.0    1.0
ENSG00000000419     0    1.0    0.0    1.0    1.0
ENSG00000000457     0    0.0    0.0    0.0    1.0
ENSG00000000460     1    0.0    0.0    1.0    1.0
ENSG00000000938     1    1.0    0.0    0.0    0.0

 spreadsheet_ALPHA_data_.P.pearson.tsv 
         LIWLX     WNPQW     OZDEA     PTQJC     GYJTZ
SWLXSSS   abc  0.000000  0.000000  0.000000  0.317449
TJOJYEU     0  0.000000  0.468790  0.000000  0.000000
XBMQOMR     0  0.671948  0.000000  0.000000  0.476718
ZQQYKOO     1  0.565216  0.753110  0.513319  0.000000
LLHLQYY     1  0.565878  0.740513  0.566161  0.660300

 spreadsheet_ALPHA_data_.P.t_test.tsv 
         LIWLX  WNPQW  OZDEA  PTQJC  GYJTZ
SWLXSSS   abc    0.0    1.0    1.0    0.0
TJOJYEU     0    0.0    0.0    0.0    0.0
XBMQOMR     0    0.0    1.0    0.0    0.0
ZQQYKOO     1    1.0    1.0    1.0    1.0
LLHLQYY     1    0.0    1.0    1.0    1.0
