# data_cleanup_toolbox test
##  Gene Prioritization Spreadsheets with Phenotypes

In [1]:
import os
import sys
import itertools
import time
import numpy as np
import pandas as pd

import knpackage.toolbox as kn

sys.path.insert(1, '../src')
import data_synth
import data_wrangler as dw

sys.path.insert(1, '../../Data_Cleanup_Pipeline/src/')
import data_cleanup_toolbox as dc

In [2]:
# Source directory holding the input spreadsheets, plus the two destination
# directories used for the generated phenotype files.
spreadsheet_out_dir = './spreadsheets'
phenotype_pearson_out_dir = './phenotypes_pearson'
phenotype_t_test_out_dir = './phenotypes_t_test'

# Keep only visible (not dot-prefixed) entries that end in '.tsv'.
file_list = os.listdir(spreadsheet_out_dir)
spreadsheet_file_list = [
    f_name for f_name in file_list
    if not f_name.startswith('.') and f_name.endswith('.tsv')
]

# Load one spreadsheet; the following cells reuse its shape and labels.
spreadsheet_path = os.path.join(spreadsheet_out_dir, 'spreadsheet_A_df.tsv')
spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_path)
spreadsheet_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
ENSG00000000003,1.0,0.0,1.0,1.0,1.0
ENSG00000000005,0.0,1.0,1.0,1.0,1.0
ENSG00000000419,1.0,1.0,1.0,0.0,0.0
ENSG00000000457,1.0,1.0,0.0,0.0,0.0
ENSG00000000460,1.0,0.0,0.0,1.0,1.0


In [3]:
# Random values with the spreadsheet's shape; anything below 0.4 or above 0.7
# is zeroed, so only values inside [0.4, 0.7] survive.
phenotype_pearson_data = np.random.random(spreadsheet_df.shape)
outside_band = (phenotype_pearson_data < 0.4) | (phenotype_pearson_data > 0.7)
phenotype_pearson_data[outside_band] = 0.0
phenotype_pearson_data

array([[ 0.51317322,  0.        ,  0.53575632,  0.59583807,  0.        ],
       [ 0.        ,  0.6149494 ,  0.        ,  0.49586241,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.619632  ,  0.        ],
       [ 0.        ,  0.        ,  0.62066601,  0.        ,  0.        ],
       [ 0.47882796,  0.        ,  0.42050266,  0.        ,  0.63806858]])

In [4]:
# Random 0.0 / 1.0 matrix with the spreadsheet's shape: a draw below 0.65
# becomes 0.0, every other draw becomes 1.0.
t_test_draws = np.random.random(spreadsheet_df.shape)
phenotype_t_test_data = np.where(t_test_draws < 0.65, 0.0, 1.0)
phenotype_t_test_data

array([[ 0.,  1.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  0.,  1.]])

In [11]:
# Build an all-zero phenotype template: same columns as the spreadsheet, with
# the row labels replaced by names from data_synth.get_rand_unique_name_list.
phenotype_TMP_df = spreadsheet_df.copy()
col_names = list(phenotype_TMP_df.columns.values)
drug_names = data_synth.get_rand_unique_name_list(phenotype_TMP_df.shape[0], name_length=6)

# Map each existing row label to the replacement name at the same position.
name_sub_dict = dict(zip(phenotype_TMP_df.index.values, drug_names))
phenotype_TMP_df.rename(index=name_sub_dict, inplace=True)

# Assign through .iloc: the chained form df[:][:] = value writes to a temporary
# object and leaves the frame unchanged when pandas Copy-on-Write is active.
phenotype_TMP_df.iloc[:, :] = 0.0
phenotype_TMP_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
OACEIM,0.0,0.0,0.0,0.0,0.0
XWKGVZ,0.0,0.0,0.0,0.0,0.0
YAPXMA,0.0,0.0,0.0,0.0,0.0
IABXWF,0.0,0.0,0.0,0.0,0.0
WPGYMN,0.0,0.0,0.0,0.0,0.0


In [17]:
# Fill a copy of the zero template with the pearson-style random data and write
# it to phenotype_pearson_out_dir ('./phenotypes_pearson') as a tab-separated file.
phenotype_pearson_df = phenotype_TMP_df.copy()

# .iloc assignment instead of the chained df[:][:] = ..., which does not update
# the frame when pandas Copy-on-Write is active.
phenotype_pearson_df.iloc[:, :] = phenotype_pearson_data
phenotype_pearson_df.to_csv(
            os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_A_df.tsv'), sep='\t', index=True, header=True)
phenotype_pearson_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
OACEIM,0.513173,0.0,0.535756,0.595838,0.0
XWKGVZ,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [18]:
# Variant with a duplicated column label: the second column takes the third
# column's name, then the frame is written out as a tab-separated file.
col_dict = {col_names[1]: col_names[2]}
dupe_cols_pearson_df = phenotype_pearson_df.rename(columns=col_dict)

dupe_cols_path = os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_dup_cols_df.tsv')
dupe_cols_pearson_df.to_csv(dupe_cols_path, sep='\t', index=True, header=True)

dupe_cols_pearson_df

Unnamed: 0,HUNDR,ZDAIV,ZDAIV.1,DCIQI,FRTWY
OACEIM,0.513173,0.0,0.535756,0.595838,0.0
XWKGVZ,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [19]:
# Variant with a duplicated row label: the second row takes the third row's
# name, then the frame is written out as a tab-separated file.
row_dict = {drug_names[1]: drug_names[2]}
dupe_rows_pearson_df = phenotype_pearson_df.rename(index=row_dict)

dupe_rows_path = os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_dup_rows_df.tsv')
dupe_rows_pearson_df.to_csv(dupe_rows_path, sep='\t', index=True, header=True)

dupe_rows_pearson_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
OACEIM,0.513173,0.0,0.535756,0.595838,0.0
YAPXMA,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [20]:
# Variant with both a duplicated column label and a duplicated row label,
# reusing the col_dict / row_dict mappings built in the earlier cells.
dupe_rows_and_cols_pearson_df = phenotype_pearson_df.rename(columns=col_dict, index=row_dict)

dupe_rows_and_cols_path = os.path.join(
    phenotype_pearson_out_dir, 'phenotype_pearson_dup_rows_and_cols_df.tsv')
dupe_rows_and_cols_pearson_df.to_csv(dupe_rows_and_cols_path, sep='\t', index=True, header=True)

dupe_rows_and_cols_pearson_df

Unnamed: 0,HUNDR,ZDAIV,ZDAIV.1,DCIQI,FRTWY
OACEIM,0.513173,0.0,0.535756,0.595838,0.0
YAPXMA,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [23]:
# Variant containing a single negative entry: the top-left value is set to -1
# and the frame is written out as a tab-separated file.
negative_values_pearson_df = phenotype_pearson_df.copy()
neg_data = phenotype_pearson_data.copy()
neg_data[0, 0] = -1

# .iloc assignment instead of the chained df[:][:] = ..., which does not update
# the frame when pandas Copy-on-Write is active.
negative_values_pearson_df.iloc[:, :] = neg_data

negative_values_pearson_df.to_csv(
            os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_negative_value_df.tsv'), 
            sep='\t', index=True, header=True)

negative_values_pearson_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
OACEIM,-1.0,0.0,0.535756,0.595838,0.0
XWKGVZ,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [25]:
# Variant containing a single NaN entry: the top-left value is set to NaN and
# the frame is written out as a tab-separated file.
nan_values_pearson_df = phenotype_pearson_df.copy()
nan_data = phenotype_pearson_data.copy()
nan_data[0, 0] = np.nan

# Fill and write nan_values_pearson_df itself. The cell previously overwrote
# negative_values_pearson_df, leaving this frame unused and clobbering the
# negative-value frame. .iloc is used because the chained df[:][:] = ... form
# does not update the frame when pandas Copy-on-Write is active.
nan_values_pearson_df.iloc[:, :] = nan_data

nan_values_pearson_df.to_csv(
            os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_nan_value_df.tsv'), 
            sep='\t', index=True, header=True)

nan_values_pearson_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
OACEIM,,0.0,0.535756,0.595838,0.0
XWKGVZ,0.0,0.614949,0.0,0.495862,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069


In [33]:
# Variant with one additional column ('EXTRA_COL', a copy of the first
# column's values) and randomly permuted column labels.
extra_cols_df = phenotype_pearson_df.copy()
extra_cols_df['EXTRA_COL'] = extra_cols_df[col_names[0]].values

col_vals = list(extra_cols_df.columns.values)
permu = np.random.permutation(len(col_vals))

# NOTE(review): rename() relabels the columns -- the data stay in their
# original positions while the labels are shuffled. Confirm that relabeling
# (rather than reordering the columns) is what this fixture should contain.
col_perm_dict = {old_label: col_vals[new_pos] for old_label, new_pos in zip(col_vals, permu)}
extra_cols_df = extra_cols_df.rename(columns=col_perm_dict)

extra_cols_path = os.path.join(phenotype_pearson_out_dir, 'phenotype_pearson_extra_cols_df.tsv')
extra_cols_df.to_csv(extra_cols_path, sep='\t', index=True, header=True)

extra_cols_df

Unnamed: 0,ZDAIV,HUNDR,DCIQI,EXTRA_COL,IAGIC,FRTWY
OACEIM,0.513173,0.0,0.535756,0.595838,0.0,0.513173
XWKGVZ,0.0,0.614949,0.0,0.495862,0.0,0.0
YAPXMA,0.0,0.0,0.0,0.619632,0.0,0.0
IABXWF,0.0,0.0,0.620666,0.0,0.0,0.0
WPGYMN,0.478828,0.0,0.420503,0.0,0.638069,0.478828


In [9]:
# Fill a copy of the zero template with the 0/1 t-test-style random data.
# Files for this family belong in phenotype_t_test_out_dir
# ('./phenotypes_t_test'); nothing is written in this cell.
phenotype_t_test_df = phenotype_TMP_df.copy()

# .iloc assignment instead of the chained df[:][:] = ..., which does not update
# the frame when pandas Copy-on-Write is active.
phenotype_t_test_df.iloc[:, :] = phenotype_t_test_data
phenotype_t_test_df

Unnamed: 0,HUNDR,IAGIC,ZDAIV,DCIQI,FRTWY
QOMKCK,0.0,1.0,1.0,0.0,0.0
RYBZHQ,0.0,0.0,0.0,0.0,0.0
WLSSHH,0.0,0.0,0.0,1.0,1.0
IZTVBO,0.0,0.0,0.0,0.0,1.0
BCWUQS,0.0,0.0,1.0,0.0,1.0


In [29]:
# Scratch check: draw a random permutation of the integers 0..4 and display it.
# This rebinds the module-level name `permu`.
permu = np.random.permutation(5)
permu

array([2, 4, 0, 1, 3])

In [12]:
# Display the .tsv file names collected from spreadsheet_out_dir.
spreadsheet_file_list

['spreadsheet_A_df.tsv',
 'spreadsheet_ALPHA_data_df.tsv',
 'spreadsheet_B_df.tsv',
 'spreadsheet_DNE_df.tsv',
 'spreadsheet_duplicate_cols_df.tsv',
 'spreadsheet_duplicate_rows__df.tsv',
 'spreadsheet_duplicate_rows_AND_cols_df.tsv',
 'spreadsheet_EMPTY_cols_df.tsv',
 'spreadsheet_EMPTY_rows_df.tsv',
 'spreadsheet_NA_cols_df.tsv',
 'spreadsheet_NA_rows_and_cols_df.tsv',
 'spreadsheet_NA_rows_df.tsv',
 'spreadsheet_NAN_data_df.tsv',
 'spreadsheet_Negative_data_df.tsv']