# Spreadsheets (.G.): genes x samples test data file generator

In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

sys.path.insert(1, '../../KnowEnG_Pipelines_Library')
import knpackage.redis_utilities
sys.path.insert(1, '../../KnowEnG_Pipelines_Library/knpackage')
import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

# data_cleanup_toolbox test: Set of 14 Spreadsheets

In [2]:
# Dimensions shared by the synthetic spreadsheets generated in this notebook.
n_spreadsheet_rows = 5
n_spreadsheet_cols = 5
spreadsheet_shape = (n_spreadsheet_rows, n_spreadsheet_cols)

# Directory that receives the generated test files. Create it up front so the
# later to_csv() calls do not fail when the directory is missing.
out_data_dir = './spreadsheets.G.etc'
os.makedirs(out_data_dir, exist_ok=True)

# Random binary matrix: uniform draws in [0, 1) thresholded at 0.5
# (draws below 0.5 become 0.0, everything else becomes 1.0).
spreadsheet_data = (np.random.random(spreadsheet_shape) >= 0.5).astype(float)
spreadsheet_data

array([[ 0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 1.,  1.,  0.,  1.,  1.]])

In [3]:
# Spreadsheet whose row labels, column labels and values all come from the
# data_synth.get_rand_dataframe helper.
spreadsheet_df = data_synth.get_rand_dataframe(
    n_spreadsheet_rows,
    n_spreadsheet_cols,
    row_name_chars=5,
    col_name_chars=8,
)
DNE_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_DNE_.G.tsv')

# Tab-separated output that keeps both the row index and the header line.
spreadsheet_df.to_csv(DNE_spreadsheet_name, sep='\t', header=True, index=True)
print(DNE_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_DNE_.G.tsv 



Unnamed: 0,EOTKYGTZ,IGZDXYOF,VTYNKBYU,YMKODNRE,WMDMDBHY
MCDSF,0.689317,0.177434,0.075393,0.10173,0.335343
CTKTZ,0.004389,0.627149,0.886388,0.555164,0.324254
ZXPUT,0.234505,0.181385,0.753029,0.387288,0.531604
CJEBT,0.760613,0.869129,0.934562,0.306075,0.91774
YQCOO,0.566154,0.101964,0.460603,0.477487,0.501907


In [4]:
# Independent copy of the binary matrix with one negative entry at [0, 0].
some_neg_data = spreadsheet_data.copy()
some_neg_data[0, 0] = -1
some_neg_data

array([[-1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  1.],
       [ 1.,  1.,  0.,  1.,  1.]])

In [5]:
# Independent copy of the binary matrix with a NaN planted at [0, 0].
some_nan_data = spreadsheet_data.copy()
some_nan_data[0, 0] = np.nan
some_nan_data

array([[ nan,   1.,   1.,   1.,   1.],
       [  0.,   0.,   1.,   0.,   0.],
       [  0.,   1.,   1.,   1.,   0.],
       [  0.,   0.,   1.,   0.,   1.],
       [  1.,   1.,   0.,   1.,   1.]])

In [6]:
# Placeholder value only: raw_data_dir is reassigned in the following cell.
raw_data_dir = '../../'

# Load the network file just to obtain the names returned alongside the
# matrix; the matrix itself is not needed, so it is dropped right away.
KnowEnG_GP_dir = '../../Samples_Clustering_Pipeline/data/networks'
network_full_file = os.path.join(KnowEnG_GP_dir, 'keg_ST90_4col.edge')
adj_mat, ensembl_names = kn.get_sparse_network_matrix(network_full_file)
del adj_mat

In [7]:
# Read a raw spreadsheet only to harvest its row labels, then discard it.
raw_data_dir = '../../../BigDataTank/pipeline_spreadsheets/raw'
sp_file = 'Hsap.ccle.G.gene_mut.binary.df'
sp_4_gene_names_df = pd.read_csv(
    os.path.join(raw_data_dir, sp_file),
    sep='\t',
    header=0,
    index_col=0,
)
asorted_gene_names = list(sp_4_gene_names_df.index)
del sp_4_gene_names_df

In [8]:
# Spreadsheet A: binary data with the first n_spreadsheet_rows names obtained
# from the network file as row labels and random unique column labels.
good_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_A_.G.tsv')

# Request one label per data column. Using n_spreadsheet_cols instead of a
# literal 5 keeps the label count equal to the width of spreadsheet_data if
# the spreadsheet shape is ever changed.
rand_names = data_synth.get_rand_unique_name_list(
    n_names=n_spreadsheet_cols, name_length=5)
gene_names = ensembl_names[0:n_spreadsheet_rows]

spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)
spreadsheet_df.to_csv(good_spreadsheet_name, sep='\t', index=True, header=True)
print(good_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_A_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [9]:
# Spreadsheet B: same data and column labels, but the first two row labels
# are swapped for the first two row labels read from the raw spreadsheet.
gene_names_B = gene_names.copy()
gene_names_B[0], gene_names_B[1] = asorted_gene_names[0], asorted_gene_names[1]

OK_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_B_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=rand_names, index=gene_names_B)
spreadsheet_df.to_csv(OK_spreadsheet_name, sep='\t', header=True, index=True)
print(OK_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_B_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
AAK1,1.0,0.0,1.0,1.0,1.0
AATK,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [10]:
# Column labels with a deliberate duplicate: position 1 repeats position 2.
rand_dup_names = rand_names.copy()
rand_dup_names[1] = rand_dup_names[2]

dup_col_spreadsheet_name = os.path.join(
    out_data_dir, 'spreadsheet_duplicate_cols_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=rand_dup_names, index=gene_names)
spreadsheet_df.to_csv(dup_col_spreadsheet_name, sep='\t', header=True, index=True)
print(dup_col_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_duplicate_cols_.G.tsv 



Unnamed: 0,HUNDR,ZDAIV,ZDAIV.1,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [11]:
# Row labels with a deliberate duplicate: position 1 repeats position 2.
gene_dup_names = gene_names.copy()
gene_dup_names[1] = gene_dup_names[2]

dup_row_spreadsheet_name = os.path.join(
    out_data_dir, 'spreadsheet_duplicate_rows__.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=rand_names, index=gene_dup_names)
spreadsheet_df.to_csv(dup_row_spreadsheet_name, sep='\t', header=True, index=True)
print(dup_row_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_duplicate_rows__.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000419,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [12]:
# Combine both defects: duplicated row labels AND duplicated column labels.
dup_row_and_col_spreadsheet_name = os.path.join(
    out_data_dir, 'spreadsheet_duplicate_rows_AND_cols_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=rand_dup_names, index=gene_dup_names)
spreadsheet_df.to_csv(
    dup_row_and_col_spreadsheet_name, sep='\t', header=True, index=True)
print(dup_row_and_col_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_duplicate_rows_AND_cols_.G.tsv 



Unnamed: 0,HUNDR,ZDAIV,ZDAIV.1,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000419,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [13]:
# Header-only spreadsheet: column labels are present but there are no rows.
empty_col_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_EMPTY_cols_.G.tsv')
spreadsheet_df = pd.DataFrame(data=None, columns=rand_names)

# index=False is the documented way to omit the row-label column; the former
# index=None only worked because None happens to be falsy.
spreadsheet_df.to_csv(empty_col_spreadsheet_name, sep='\t', index=False, header=True)
print(empty_col_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_EMPTY_cols_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY


In [14]:
# Index-only spreadsheet: row labels are present but there are no columns.
empty_row_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_EMPTY_rows_.G.tsv')
spreadsheet_df = pd.DataFrame(data=None, index=gene_names)

# header=False is the documented way to omit the header line; the former
# header=None only worked because None happens to be falsy.
spreadsheet_df.to_csv(empty_row_spreadsheet_name, sep='\t', index=True, header=False)
print(empty_row_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_EMPTY_rows_.G.tsv 



ENSG00000000003
ENSG00000000005
ENSG00000000419
ENSG00000000457
ENSG00000000460


In [15]:
# Label lists in which position 2 is replaced by the literal string 'NA'.
# na_row_names is prepared here and not used in this cell.
na_col_names = rand_names.copy()
na_col_names[2] = 'NA'
na_row_names = gene_names.copy()
na_row_names[2] = 'NA'

# Spreadsheet with the 'NA' column label and unmodified row labels.
NA_col_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_NA_cols_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=na_col_names, index=gene_names)
spreadsheet_df.to_csv(NA_col_spreadsheet_name, sep='\t', header=True, index=True)
print(NA_col_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_NA_cols_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,NA,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [16]:
# Spreadsheet with the 'NA' row label and unmodified column labels.
NA_rows_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_NA_rows_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=rand_names, index=na_row_names)
spreadsheet_df.to_csv(NA_rows_spreadsheet_name, sep='\t', header=True, index=True)
print(NA_rows_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_NA_rows_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [17]:
# Spreadsheet with an 'NA' label on both axes.
NA_row_and_col_spreadsheet_name = os.path.join(
    out_data_dir, 'spreadsheet_NA_rows_and_cols_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=spreadsheet_data, columns=na_col_names, index=na_row_names)
spreadsheet_df.to_csv(
    NA_row_and_col_spreadsheet_name, sep='\t', header=True, index=True)
print(NA_row_and_col_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_NA_rows_and_cols_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,NA,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [18]:
# Spreadsheet built from the matrix that carries a NaN at [0, 0].
NAN_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_NAN_data_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=some_nan_data, columns=rand_names, index=gene_names)
spreadsheet_df.to_csv(NAN_spreadsheet_name, sep='\t', header=True, index=True)
print(NAN_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_NAN_data_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [19]:
# Spreadsheet built from the matrix that carries a -1 at [0, 0].
NEG_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_Negative_data_.G.tsv')
spreadsheet_df = pd.DataFrame(
    data=some_neg_data, columns=rand_names, index=gene_names)
spreadsheet_df.to_csv(NEG_spreadsheet_name, sep='\t', header=True, index=True)
print(NEG_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_Negative_data_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,-1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [20]:
# Spreadsheet with a non-numeric string planted in the first cell.
alpha_spreadsheet_name = os.path.join(out_data_dir, 'spreadsheet_ALPHA_data_.G.tsv')
spreadsheet_df = pd.DataFrame(spreadsheet_data, index=gene_names, columns=rand_names)

# Make the target column object-typed first so it can legitimately hold a
# string next to its float values.
spreadsheet_df[rand_names[0]] = spreadsheet_df[rand_names[0]].astype(object)

# Single .loc[row, col] assignment. The former chained form
# df[col].loc[row] = ... may write to a temporary Series and leave the
# DataFrame untouched (always the case under pandas copy-on-write).
spreadsheet_df.loc[gene_names[0], rand_names[0]] = 'abc'

spreadsheet_df.to_csv(alpha_spreadsheet_name, sep='\t', index=True, header=True)
print(alpha_spreadsheet_name, '\n')
spreadsheet_df

./spreadsheets.G.etc/spreadsheet_ALPHA_data_.G.tsv 



Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,abc,0.0,1.0,1.0,1.0
ENSG00000000005,0,1.0,1.0,1.0,1.0
ENSG00000000419,1,1.0,1.0,0.0,0.0
ENSG00000000457,1,1.0,0.0,0.0,0.0
ENSG00000000460,1,0.0,0.0,1.0,1.0
