# Spreadsheets (.G.): genes x samples test data file generator
* writes a small data frame file for every conceivable case of input spreadsheet data for GeneSet Characterization
* file naming conventions are chosen to allow directory processing with the dcp_test.py module

In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
# Relative path of the directory into which the spreadsheet files of this notebook are written.
out_data_dir = '../../keg_test_tools/data/geneset_characterization/spreadsheets_GSC'

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [3]:
# Dimensions of the synthetic spreadsheet (rows x columns).
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
n_phenotype_cols = n_spreadsheet_rows
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

# Binary matrix: uniform draws below 0.5 become 0.0, all other draws become 1.0.
spreadsheet_data = np.where(np.random.random(spreadsheet_shape) < 0.5, 0.0, 1.0)
spreadsheet_data

array([[ 1.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  1.],
       [ 1.,  0.,  0.,  1.,  1.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  1.]])

In [4]:
# Fully synthetic spreadsheet: data_synth supplies the frame, with 5-character
# row names and 8-character column names requested through the keyword arguments.
BASE_NAME = 'spreadsheet_DNE_'
DNE_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')
spreadsheet_df = data_synth.get_rand_dataframe(
    n_spreadsheet_rows,
    n_spreadsheet_cols,
    row_name_chars=5,
    col_name_chars=8,
)
# Tab-separated output that keeps both the row labels and the header line.
spreadsheet_df.to_csv(
    path_or_buf=DNE_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_DNE_.G.GSC.tsv 
        LBKWPGZW  SYBTXWRN  QAQDSBLM  ZYHAFWYG  WCOAPUFV
QPEEV  0.699735  0.777384  0.562407  0.688294  0.406830
PAWNB  0.030958  0.861709  0.927754  0.537586  0.371735
OOBBI  0.239647  0.210067  0.982109  0.103054  0.638202
RWNGI  0.726740  0.608046  0.454255  0.043426  0.948203
SKLRF  0.930282  0.298488  0.698147  0.737180  0.955285


In [5]:
# Independent copy of the binary matrix with one negative value in the top-left cell.
some_neg_data = spreadsheet_data.copy()
some_neg_data[0][0] = -1
some_neg_data

array([[-1.,  1.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  1.,  1.],
       [ 1.,  0.,  0.,  1.,  1.],
       [ 0.,  1.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  1.,  1.]])

In [6]:
# Independent copy of the binary matrix with NaN in the top-left cell.
some_nan_data = spreadsheet_data.copy()
some_nan_data[0][0] = np.nan
some_nan_data

array([[ nan,   1.,   0.,   0.,   1.],
       [  0.,   1.,   0.,   1.,   1.],
       [  1.,   0.,   0.,   1.,   1.],
       [  0.,   1.,   0.,   0.,   1.],
       [  0.,   0.,   1.,   1.,   1.]])

In [7]:
# Read the network edge file only to obtain the names returned alongside the
# matrix; the matrix itself is deleted right after the call.
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat  # only ensembl_names is kept from this call
# NOTE(review): this value is reassigned at the top of the next cell before it
# is read anywhere visible in this notebook -- looks like a leftover; confirm.
raw_data_dir = '../../'

In [8]:
# Take the row labels of a raw spreadsheet; only its index is kept, the frame
# itself is never bound to a name.
raw_data_dir = '../../../pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
asorted_gene_names = list(
    pd.read_csv(
        os.path.join(raw_data_dir, sp_file),
        sep='\t',
        header=0,
        index_col=0,
    ).index
)

In [9]:
# Spreadsheet A: the binary matrix labelled with the first n_spreadsheet_rows
# entries of ensembl_names (rows) and random unique names (columns).
BASE_NAME = 'spreadsheet_A_'
good_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')
# The number of column names must equal the column count of spreadsheet_data,
# so it is tied to n_spreadsheet_cols instead of a literal 5.
rand_names = data_synth.get_rand_unique_name_list(n_names=n_spreadsheet_cols, name_length=5)
gene_names = ensembl_names[:n_spreadsheet_rows]
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
# Tab-separated output that keeps both the row labels and the header line.
spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.GSC.tsv', '\n', spreadsheet_df)

spreadsheet_A_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005    1.0    1.0    0.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [10]:
# Spreadsheet B: same data as before, but the first two row labels are replaced
# by the first two entries of asorted_gene_names; the other labels are unchanged.
BASE_NAME = 'spreadsheet_B_'
OK_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')
gene_names_B = gene_names.copy()
gene_names_B[0], gene_names_B[1] = asorted_gene_names[0], asorted_gene_names[1]
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=gene_names_B, columns=rand_names)
spreadsheet_df.to_csv(
    path_or_buf=OK_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_B_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
AAK1               1.0    1.0    0.0    0.0    1.0
AATK               0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [11]:
# Duplicate-column case: the column label at position 2 is reused at position 1.
BASE_NAME = 'spreadsheet_duplicate_cols_'
dup_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

# Work on a copy so rand_names keeps its unique labels.
rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=gene_names, columns=rand_dup_names)
spreadsheet_df.to_csv(
    path_or_buf=dup_col_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_duplicate_cols_.G.GSC.tsv 
                  NWOHK  XYWUO  XYWUO  FPTXU  DADOA
ENSG00000000005    1.0    1.0    0.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [12]:
# Duplicate-row case: the row label at position 2 is reused at position 1.
BASE_NAME = 'spreadsheet_duplicate_rows_'
dup_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

# Work on a copy so gene_names keeps its original labels.
gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=gene_dup_names, columns=rand_names)
spreadsheet_df.to_csv(
    path_or_buf=dup_row_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_duplicate_rows_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005    1.0    1.0    0.0    0.0    1.0
ENSG00000000457    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [13]:
# Combined case: uses both the duplicated row labels (gene_dup_names) and the
# duplicated column labels (rand_dup_names) for the same binary matrix.
BASE_NAME = 'spreadsheet_duplicates_rows_AND_cols_'
dup_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=gene_dup_names, columns=rand_dup_names)
spreadsheet_df.to_csv(
    path_or_buf=dup_row_and_col_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_duplicates_rows_AND_cols_.G.GSC.tsv 
                  NWOHK  XYWUO  XYWUO  FPTXU  DADOA
ENSG00000000005    1.0    1.0    0.0    0.0    1.0
ENSG00000000457    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [14]:
# Empty-columns case: a frame that has column labels but no rows.
BASE_NAME = 'spreadsheet_EMPTY_cols_'
empty_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)
# to_csv documents `index` as a bool; pass False explicitly instead of relying
# on None being falsy. The file still holds only the header line.
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=False, header=True)
print(BASE_NAME + '.G.GSC.tsv', '\n')

spreadsheet_df

spreadsheet_EMPTY_cols_.G.GSC.tsv 



Unnamed: 0,NWOHK,NGMGC,XYWUO,FPTXU,DADOA


In [15]:
# Empty-rows case: a frame that has row labels but no columns.
BASE_NAME = 'spreadsheet_EMPTY_rows_'
empty_row_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(data=None, index=gene_names)
# to_csv documents `header` as a bool or a list of str; pass False explicitly
# instead of relying on None being falsy. The file still holds only row labels.
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=False)
print(BASE_NAME + '.G.GSC.tsv', '\n')

spreadsheet_df

spreadsheet_EMPTY_rows_.G.GSC.tsv 



ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460
ENSG00000000938


In [16]:
# 'NA' column-label case: the column label at position 2 is the string 'NA'.
BASE_NAME = 'spreadsheet_NA_cols_'
NA_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

# Row labels with 'NA' at position 2; not used by the frame built in this cell.
na_row_names = gene_names.copy()
na_row_names[2] = 'NA'
# Column labels with 'NA' at position 2.
na_col_names = rand_names.copy()
na_col_names[2] = 'NA'

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=gene_names, columns=na_col_names)
spreadsheet_df.to_csv(
    path_or_buf=NA_col_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_NA_cols_.G.GSC.tsv 
                  NWOHK  NGMGC   NA  FPTXU  DADOA
ENSG00000000005    1.0    1.0  0.0    0.0    1.0
ENSG00000000419    0.0    1.0  0.0    1.0    1.0
ENSG00000000457    1.0    0.0  0.0    1.0    1.0
ENSG00000000460    0.0    1.0  0.0    0.0    1.0
ENSG00000000938    0.0    0.0  1.0    1.0    1.0


In [17]:
# 'NA' row-label case: the binary matrix indexed by na_row_names, with the
# regular column labels.
BASE_NAME = 'spreadsheet_NA_rows_'
NA_rows_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=na_row_names, columns=rand_names)
spreadsheet_df.to_csv(
    path_or_buf=NA_rows_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_NA_rows_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005    1.0    1.0    0.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
NA                 1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [18]:
# Combined 'NA' case: the binary matrix labelled with na_row_names (rows) and
# na_col_names (columns).
BASE_NAME = 'spreadsheet_NAs_rows_and_cols_'
NA_row_and_col_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, index=na_row_names, columns=na_col_names)
spreadsheet_df.to_csv(
    path_or_buf=NA_row_and_col_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_NAs_rows_and_cols_.G.GSC.tsv 
                  NWOHK  NGMGC   NA  FPTXU  DADOA
ENSG00000000005    1.0    1.0  0.0    0.0    1.0
ENSG00000000419    0.0    1.0  0.0    1.0    1.0
NA                 1.0    0.0  0.0    1.0    1.0
ENSG00000000460    0.0    1.0  0.0    0.0    1.0
ENSG00000000938    0.0    0.0  1.0    1.0    1.0


In [19]:
# NaN-data case: regular row and column labels around some_nan_data.
BASE_NAME = 'spreadsheet_NAN_data_'
NAN_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(
    data=some_nan_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(
    path_or_buf=NAN_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_NAN_data_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005    NaN    1.0    0.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [20]:
# Negative-data case: regular row and column labels around some_neg_data.
BASE_NAME = 'spreadsheet_Negative_data_'
NEG_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(
    data=some_neg_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(
    path_or_buf=NEG_spreadsheet_name, sep='\t', header=True, index=True)
print(BASE_NAME + '.G.GSC.tsv', spreadsheet_df, sep=' \n ')

spreadsheet_Negative_data_.G.GSC.tsv 
                  NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005   -1.0    1.0    0.0    0.0    1.0
ENSG00000000419    0.0    1.0    0.0    1.0    1.0
ENSG00000000457    1.0    0.0    0.0    1.0    1.0
ENSG00000000460    0.0    1.0    0.0    0.0    1.0
ENSG00000000938    0.0    0.0    1.0    1.0    1.0


In [21]:
# Alphabetic-data case: one cell of the otherwise numeric spreadsheet holds the
# string 'abc'.
BASE_NAME = 'spreadsheet_ALPHA_data_'
alpha_spreadsheet_name = os.path.join(out_data_dir, BASE_NAME + '.G.GSC.tsv')

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
# Cast the target column to object before writing the string: assigning a str
# into a float64 column relies on silent upcasting, which pandas has deprecated
# (it warns in 2.1+ and is slated to raise). The other columns stay float64.
spreadsheet_df[rand_names[0]] = spreadsheet_df[rand_names[0]].astype(object)
spreadsheet_df.loc[gene_names[0], rand_names[0]] = 'abc'

# Tab-separated output that keeps both the row labels and the header line.
spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(BASE_NAME + '.G.GSC.tsv', '\n', spreadsheet_df)

spreadsheet_ALPHA_data_.G.GSC.tsv 
                 NWOHK  NGMGC  XYWUO  FPTXU  DADOA
ENSG00000000005   abc    1.0    0.0    0.0    1.0
ENSG00000000419     0    1.0    0.0    1.0    1.0
ENSG00000000457     1    0.0    0.0    1.0    1.0
ENSG00000000460     0    1.0    0.0    0.0    1.0
ENSG00000000938     0    0.0    1.0    1.0    1.0
