# Spreadsheets .G.  genes X samples test data files generator
* write a small data frame file for every conceivable case of input spreadsheet data for Samples Clustering
* write a small data frame for each of those cases for use in the Gene Prioritization t-test and Pearson tests
* naming conventions used to allow directory processing with dcp_test.py module


In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
# Destination directories for the generated test files:
#   out_data_dir                      - the '.G.SC.tsv' spreadsheets
#   pheno_samples_clustering_data_dir - the matching '.P.SC_pheno.tsv' phenotype files
out_data_dir = './spreadsheets.G.SC.etc'
pheno_samples_clustering_data_dir = './phenotypes.P.SC.etc'

# DataFrame.to_csv does not create missing parent directories, so make sure
# both targets exist before any cell below tries to write into them.
for output_dir in (out_data_dir, pheno_samples_clustering_data_dir):
    os.makedirs(output_dir, exist_ok=True)

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [3]:
# Dimensions shared by every generated spreadsheet; the phenotype files get
# as many phenotype columns as the spreadsheet has rows.
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
n_phenotype_cols = n_spreadsheet_rows
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

# Random binary matrix: uniform draws below 0.5 become 0.0, all others 1.0.
spreadsheet_data = (np.random.random(spreadsheet_shape) >= 0.5).astype(float)
spreadsheet_data

array([[ 0.,  1.,  0.,  1.,  1.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.]])

In [4]:
# Case "DNE": a spreadsheet whose row and column names are both random
# strings produced by data_synth.get_rand_dataframe.
# NOTE(review): "DNE" presumably means the names do not exist in the network - confirm.
BASE_NAME = 'spreadsheet_DNE_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

DNE_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)
spreadsheet_df = data_synth.get_rand_dataframe(
    n_spreadsheet_rows, n_spreadsheet_cols, row_name_chars=5, col_name_chars=8)
spreadsheet_df.to_csv(DNE_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_DNE_.G.SC.tsv 
        CSDZKRPO  FJYMFZEI  QFYGEKLQ  AYTBCPJE  WRDYYOJA
VNDXE  0.873001  0.575658  0.060676  0.177694  0.502538
IOXFT  0.599019  0.474041  0.080747  0.772109  0.427749
AMOWE  0.405131  0.214972  0.380497  0.362865  0.856615
KKCYS  0.920417  0.204564  0.416507  0.805883  0.059719
KPQDH  0.594327  0.301471  0.580700  0.319793  0.760920

 spreadsheet_DNE_.P.SC_pheno.tsv 
             GWJNGNC      IZENEXN TMOCGSX   BVXMRST   IMRGABV
CSDZKRPO  NMIIMGELD  2302.784922   False  1.872784  7.964585
FJYMFZEI  ITGJQLJUH  4843.461924    True  7.230624  2.272162
QFYGEKLQ  HHINGJRMT  7378.892587    True  2.273242  2.884077
AYTBCPJE  BAZUCDHEM  9810.416907    True  9.573233  8.674557
WRDYYOJA  KWDJJPKCJ  4412.341134   False  1.436853  6.539232


In [5]:
# Independent copy of the base matrix with a single negative entry in the
# top-left cell; used later for the "Negative data" spreadsheet.
some_neg_data = np.copy(spreadsheet_data)
some_neg_data[(0, 0)] = -1
some_neg_data

array([[-1.,  1.,  0.,  1.,  1.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  1.]])

In [6]:
# Independent copy of the base matrix with a single NaN in the top-left
# cell; used later for the "NAN data" spreadsheet.
some_nan_data = np.copy(spreadsheet_data)
some_nan_data[(0, 0)] = np.nan
some_nan_data

array([[ nan,   1.,   0.,   1.,   1.],
       [  0.,   0.,   1.,   0.,   0.],
       [  1.,   0.,   0.,   0.,   1.],
       [  1.,   0.,   1.,   1.,   0.],
       [  1.,   0.,   0.,   0.,   1.]])

In [7]:
# Read the network edge file only for its list of names (ensembl_names);
# the matrix returned alongside it is not needed and is dropped right away.
# NOTE(review): presumably the first return value is the adjacency matrix - confirm
# against knpackage.toolbox.get_sparse_network_matrix.
raw_data_dir = '../../'
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')

adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat

In [8]:
# Collect the row labels of a raw spreadsheet as a second source of gene
# names; the data frame itself is discarded once the labels are extracted.
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_full_path = os.path.join(raw_data_dir, sp_file)

sp_4_gene_names_df = pd.read_csv(sp_full_path, sep='\t', index_col=0, header=0)
asorted_gene_names = [gene_label for gene_label in sp_4_gene_names_df.index]
del sp_4_gene_names_df

In [9]:
# Case "A": the base binary matrix, rows labelled with the first
# n_spreadsheet_rows entries of ensembl_names and columns labelled with
# five random unique 5-character names.
BASE_NAME = 'spreadsheet_A_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

good_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)
rand_names = data_synth.get_rand_unique_name_list(n_names=5, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(good_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_A_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005    0.0    1.0    0.0    1.0    1.0
ENSG00000000419    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_A_.P.SC_pheno.tsv 
       EGGKEDP IMHTUWP SPJCOPC KJDHOFK CEWIGYM
VMEKT    True   UXGDX    True    True  LVPFUG
LSAHJ   False   HKRJT    True   False  BCFGLJ
KOXMO   False   ETIZE   False    True  EPCFGY
PYBPU   False   JBSRY   False    True  UZUGQS
WGBQC    True   DAUOX   False   False  LSZWNY


In [10]:
# Case "B": same data and sample names as case "A", but the first two row
# labels are replaced with the first two names from asorted_gene_names.
BASE_NAME = 'spreadsheet_B_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

OK_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)
gene_names_B = gene_names.copy()
for row_position in (0, 1):
    gene_names_B[row_position] = asorted_gene_names[row_position]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names_B, columns=rand_names)
spreadsheet_df.to_csv(OK_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_B_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
AAK1               0.0    1.0    0.0    1.0    1.0
AATK               0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_B_.P.SC_pheno.tsv 
           CNGYFOF  NCKZVSJ      VYMIVFM DXHZZBD       IRZOBDQ
VMEKT  228.364751        3 -3767.328789    True  -3617.099327
LSAHJ  398.031492        2 -9612.826353    True -19312.444630
KOXMO   62.211330        1 -4824.454352   False  13557.812629
PYBPU  424.345496        3  3456.110853   False   1769.520358
WGBQC  864.192685        3 -2187.852179    True  -4694.785335


In [11]:
# Case "duplicate cols": the column label at position 1 is overwritten with
# the label at position 2, so two sample columns share the same name.
BASE_NAME = 'spreadsheet_duplicate_cols_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

dup_col_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_col_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: built from the (duplicated) column labels.
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_duplicate_cols_.G.SC.tsv 
                  VMEKT  KOXMO  KOXMO  PYBPU  WGBQC
ENSG00000000005    0.0    1.0    0.0    1.0    1.0
ENSG00000000419    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_duplicate_cols_.P.SC_pheno.tsv 
       OBRZDUJ      WSGCCKK      CHGKPIO   TMCEZDW WYOGAAI
VMEKT    CFXW -1140.145461  EHMAMZRJZLB  EYAKDPOP   False
KOXMO    DMSZ   597.342633  UXGRYKFNJXM  HAXQMKXK   False
KOXMO    JYPT  -677.042868  VLUKSMFLAAV  JNGQUKAW   False
PYBPU    UHBC   425.316199  DEKFVWLTBMY  YCBYKYHL    True
WGBQC    XXAO  -360.763151  LIYDITIZBUK  MVLVWAKD    True


In [12]:
# Case "duplicate rows": the row label at position 1 is overwritten with the
# label at position 2, so two gene rows share the same name.
BASE_NAME = 'spreadsheet_duplicate_rows_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

dup_row_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_names)
spreadsheet_df.to_csv(dup_row_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_duplicate_rows_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_duplicate_rows_.P.SC_pheno.tsv 
           UDMODRU YVVHGKB GYSOGJI       UOZFRBX ANFVHSV
VMEKT  622.628372    True    True   2852.841727    True
LSAHJ  516.066611    True   False  25568.436485   False
KOXMO  109.048068   False    True   -702.238104   False
PYBPU   37.967536   False   False -13742.594996   False
WGBQC  478.257837    True    True   5421.263622    True


In [13]:
# Case "duplicate rows AND cols": combines the duplicated row labels
# (gene_dup_names) and duplicated column labels (rand_dup_names) that were
# built for the two preceding cases.
BASE_NAME = 'spreadsheet_duplicate_rows_AND_cols_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

dup_row_and_col_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_dup_names, columns=rand_dup_names)
spreadsheet_df.to_csv(dup_row_and_col_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: built from the (duplicated) column labels.
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_duplicate_rows_AND_cols_.G.SC.tsv 
                  VMEKT  KOXMO  KOXMO  PYBPU  WGBQC
ENSG00000000005    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_duplicate_rows_AND_cols_.P.SC_pheno.tsv 
        KMKEVWI       IFFYOQH  MYFNMEF       ZWFCGUD    KSRAWLY
VMEKT        1  44692.379342  YAHIMMI  -1620.174747   2.733373
KOXMO        2  55827.345259  XYHXRHK -12895.037266 -16.730719
KOXMO        1  39088.073427  AMSKRSD  11882.251123  11.557215
PYBPU        4   4686.742144  RQILPTH  -8724.778288 -18.642817
WGBQC        3  57924.448420  CCIKTLV   2816.333499 -19.171659


In [14]:
# Case "EMPTY cols": a frame that has the sample column names but no rows.
BASE_NAME = 'spreadsheet_EMPTY_cols_'
empty_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.SC.tsv')

spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)
# Spreadsheet file: header line only, written without the index column.
# `index` is a boolean option of to_csv, so pass False explicitly rather
# than relying on None being falsy.
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=False, header=True)
print(BASE_NAME + '.G.SC.tsv', '\n')
# The same empty frame doubles as the phenotype file for this case; here
# the (empty) index column is written as well.
spreadsheet_df.to_csv(os.path.join(pheno_samples_clustering_data_dir, BASE_NAME + '.P.SC_pheno.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_cols_.G.SC.tsv 



Unnamed: 0,VMEKT,LSAHJ,KOXMO,PYBPU,WGBQC


In [15]:
# Case "EMPTY rows": a frame that has the gene row names but no columns.
BASE_NAME = 'spreadsheet_EMPTY_rows_'
empty_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.SC.tsv')

spreadsheet_df = pd.DataFrame(data=None, index=gene_names)
# Spreadsheet file: row names only, written without a header line.
# `header` accepts a bool or a list of names, so pass False explicitly
# rather than relying on None being falsy.
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=False)
print(BASE_NAME + '.G.SC.tsv', '\n')

# The same empty frame doubles as the phenotype file for this case; here
# the header line is written as well.
spreadsheet_df.to_csv(os.path.join(pheno_samples_clustering_data_dir, BASE_NAME + '.P.SC_pheno.tsv'), sep='\t', 
                      index=True, header=True)
spreadsheet_df

spreadsheet_EMPTY_rows_.G.SC.tsv 



ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460
ENSG00000000938


In [16]:
# Case "NA cols": the column label at position 2 is the literal string 'NA'.
# na_row_names (row label at position 2 set to 'NA') is prepared here as
# well; this cell does not use it, the later NA-row cases do.
BASE_NAME = 'spreadsheet_NA_cols_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

NA_col_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

na_position = 2
na_col_names = rand_names.copy()
na_col_names[na_position] = 'NA'
na_row_names = gene_names.copy()
na_row_names[na_position] = 'NA'
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_col_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: built from the column labels, including 'NA'.
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_NA_cols_.G.SC.tsv 
                  VMEKT  LSAHJ   NA  PYBPU  WGBQC
ENSG00000000005    0.0    1.0  0.0    1.0    1.0
ENSG00000000419    0.0    0.0  1.0    0.0    0.0
ENSG00000000457    1.0    0.0  0.0    0.0    1.0
ENSG00000000460    1.0    0.0  1.0    1.0    0.0
ENSG00000000938    1.0    0.0  0.0    0.0    1.0

 spreadsheet_NA_cols_.P.SC_pheno.tsv 
            ZEDGOQN      GGYNGXT  ABBMGPH   FXIEPRS  JLAXJRR
VMEKT  6202.395171  -299.415512  MXPFYOL  0.800872  JWLLWMB
LSAHJ  7742.385528  1246.294925  ZUYCHMH  9.836857  NFMPPJQ
NA     1702.901848  -443.519099  TOULFUF  5.401718  SWNUFYG
PYBPU  6279.584531  1358.445275  NBQKOWU  5.911502  QEWFGGB
WGBQC  6651.339894 -1054.523983  FLIQUOQ  9.794322  AXTBYWC


In [17]:
# Case "NA rows": rows are labelled with na_row_names (one label is the
# literal string 'NA'); columns keep the regular random sample names.
BASE_NAME = 'spreadsheet_NA_rows_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

NA_rows_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=rand_names)
spreadsheet_df.to_csv(NA_rows_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_NA_rows_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005    0.0    1.0    0.0    1.0    1.0
ENSG00000000419    0.0    0.0    1.0    0.0    0.0
NA                 1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_NA_rows_.P.SC_pheno.tsv 
          NCNJSAE    LADSURJ     PDXHJXA BJNZXWH  FTWTPGG
VMEKT  -1.041284  63.517627 -166.470721     LYW        1
LSAHJ  -8.673546  19.274754  -42.802664     ZWZ        1
KOXMO  10.914872  67.307012   99.987125     HGS        4
PYBPU   4.212817  88.264103  -60.497019     XMX        0
WGBQC  13.523644  20.748832   39.139047     MZG        1


In [18]:
# Case "NA rows and cols": both the row labels (na_row_names) and the
# column labels (na_col_names) contain the literal string 'NA'.
BASE_NAME = 'spreadsheet_NA_rows_and_cols_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

NA_row_and_col_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=na_row_names, columns=na_col_names)
spreadsheet_df.to_csv(NA_row_and_col_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: built from the column labels, including 'NA'.
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_NA_rows_and_cols_.G.SC.tsv 
                  VMEKT  LSAHJ   NA  PYBPU  WGBQC
ENSG00000000005    0.0    1.0  0.0    1.0    1.0
ENSG00000000419    0.0    0.0  1.0    0.0    0.0
NA                 1.0    0.0  0.0    0.0    1.0
ENSG00000000460    1.0    0.0  1.0    1.0    0.0
ENSG00000000938    1.0    0.0  0.0    0.0    1.0

 spreadsheet_NA_rows_and_cols_.P.SC_pheno.tsv 
            ZUGLULI      FESGYQM OECYNCZ  NSRNSUW JWVZUOU
VMEKT  7090.640497  6111.716007   False        0  DMCPPF
LSAHJ  3867.733183  3147.728506    True        2  KFPRRP
NA     2281.540309  8672.841332   False        1  OVGZBT
PYBPU  8784.398093  4224.920407    True        0  NDMNHV
WGBQC  9185.983160  7969.171423    True        4  KGWGSO


In [19]:
# Case "NAN data": regular row and column names, but the data matrix
# (some_nan_data) carries a NaN in its top-left cell.
BASE_NAME = 'spreadsheet_NAN_data_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

NAN_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

spreadsheet_df = pd.DataFrame(some_nan_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NAN_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_NAN_data_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005    NaN    1.0    0.0    1.0    1.0
ENSG00000000419    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_NAN_data_.P.SC_pheno.tsv 
        APESAKV ZFTHCLK      ZMEARMH      MDPANDT YLRQHIF
VMEKT        2   False  4045.081748 -1076.886455    True
LSAHJ        0   False   207.698435 -1262.406281    True
KOXMO        1    True  5655.385980  1114.430032    True
PYBPU        0   False  2292.867215  -148.769900   False
WGBQC        2   False  2602.200194   572.090251   False


In [20]:
# Case "Negative data": regular row and column names, but the data matrix
# (some_neg_data) carries a -1 in its top-left cell.
BASE_NAME = 'spreadsheet_Negative_data_'
spreadsheet_file_name = BASE_NAME + '.G.SC.tsv'
pheno_file_name = BASE_NAME + '.P.SC_pheno.tsv'
tsv_options = dict(sep='\t', index=True, header=True)

NEG_spreadsheet_name = os.path.join(out_data_dir, spreadsheet_file_name)

spreadsheet_df = pd.DataFrame(some_neg_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(NEG_spreadsheet_name, **tsv_options)
print(spreadsheet_file_name, '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
sample_names = list(spreadsheet_df.columns)
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(sample_names, n_phenotype_cols)
pheno_file_path = os.path.join(pheno_samples_clustering_data_dir, pheno_file_name)
pheno_data_df.to_csv(pheno_file_path, **tsv_options)
print('\n', pheno_file_name, '\n', pheno_data_df)

spreadsheet_Negative_data_.G.SC.tsv 
                  VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005   -1.0    1.0    0.0    1.0    1.0
ENSG00000000419    0.0    0.0    1.0    0.0    0.0
ENSG00000000457    1.0    0.0    0.0    0.0    1.0
ENSG00000000460    1.0    0.0    1.0    1.0    0.0
ENSG00000000938    1.0    0.0    0.0    0.0    1.0

 spreadsheet_Negative_data_.P.SC_pheno.tsv 
        EZBBDBY YUPPQUR MVQGSWL      OEBGRDG TLYTGOX
VMEKT        0   False   False  ICZYXZDTNHD    True
LSAHJ        4    True   False  LEXULHRIEQW   False
KOXMO        2   False    True  BEOMFWZRFRM    True
PYBPU        0   False    True  JHRUWLVAWDT    True
WGBQC        1    True   False  WCUFLTUPMWJ    True


In [21]:
# Case "ALPHA data": regular row and column names, with the top-left cell
# replaced by the non-numeric string 'abc'.
BASE_NAME = 'spreadsheet_ALPHA_data_'
alpha_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.SC.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
# Writing a string into a float64 column relies on implicit upcasting, which
# newer pandas versions deprecate (PDEP-6). Convert the target column to
# object dtype explicitly first; the resulting column content is the same
# as the implicit upcast used to give.
alpha_column = rand_names[0]
spreadsheet_df[alpha_column] = spreadsheet_df[alpha_column].astype(object)
spreadsheet_df.loc[gene_names[0], alpha_column] = 'abc'

spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.SC.tsv', '\n', spreadsheet_df)

# Matching phenotype file: one row per spreadsheet column (sample).
pheno_data_df = data_synth.get_random_phenotype_data_for_samples(list(spreadsheet_df.columns), n_phenotype_cols)
pheno_data_df.to_csv(os.path.join(pheno_samples_clustering_data_dir, BASE_NAME + '.P.SC_pheno.tsv'), sep='\t', 
                      index=True, header=True)
print('\n', BASE_NAME + '.P.SC_pheno.tsv', '\n', pheno_data_df)

spreadsheet_ALPHA_data_.G.SC.tsv 
                 VMEKT  LSAHJ  KOXMO  PYBPU  WGBQC
ENSG00000000005   abc    1.0    0.0    1.0    1.0
ENSG00000000419     0    0.0    1.0    0.0    0.0
ENSG00000000457     1    0.0    0.0    0.0    1.0
ENSG00000000460     1    0.0    1.0    1.0    0.0
ENSG00000000938     1    0.0    0.0    0.0    1.0

 spreadsheet_ALPHA_data_.P.SC_pheno.tsv 
            VEUZRZG GEJQJAY    NOXXCVK JNZHPDG RSYGYYC
VMEKT   711.566974    True  43.428357    True   False
LSAHJ  3530.590045   False  90.004511   False   False
KOXMO  7569.096834   False  14.162039   False    True
PYBPU  4912.029206    True  49.792994    True   False
WGBQC  8099.468531   False  91.224561   False   False
