In [1]:
import pandas as pd
import numpy as np

In [2]:
def read_data(file_path, id_column_name='id', suffix=None):
    df = pd.read_csv(file_path, delimiter='\t')
    df = df.T
    if suffix:
        df = df.add_suffix(suffix)
    df = df.rename_axis(id_column_name).reset_index()
    return df

In [3]:
output_dir = 'data/preprocessed'

In [4]:
# UCEC_discovery_clinical is in tsv format, and with cp1252 encoding
df_clinical = pd.read_csv('./data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_clinical.txt', delimiter='\t', encoding='cp1252')
df_clinical.to_csv(f'{output_dir}/UCEC_discovery_clinical.csv', index=False, encoding='utf-8')

In [5]:
df_proteo = read_data('./data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_Proteomics_PNNL_ratio_median_polishing_log2.txt', suffix='_proteo')
df_proteo.to_csv(f'{output_dir}/UCEC_discovery_Proteomics_PNNL_ratio_median_polishing_log2.csv', index=False)
df_proteo

Unnamed: 0,id,A1BG_proteo,A2M_proteo,A2ML1_proteo,A4GALT_proteo,AAAS_proteo,AACS_proteo,AADAT_proteo,AAED1_proteo,AAGAB_proteo,...,ZSWIM8_proteo,ZSWIM9_proteo,ZW10_proteo,ZWILCH_proteo,ZWINT_proteo,ZXDC_proteo,ZYG11B_proteo,ZYX_proteo,ZZEF1_proteo,ZZZ3_proteo
0,S001,-1.180,-0.8630,-0.8020,0.2220,0.2560,0.6650,1.2800,-0.3390,0.4120,...,-0.08770,,0.02290,0.1090,,-0.332,-0.43300,-1.020,-0.12300,-0.0859
1,S002,-0.685,-1.0700,-0.6840,0.9840,0.1350,0.3340,1.3000,0.1390,1.3300,...,-0.03560,,0.36300,1.0700,0.7370,-0.564,-0.00461,-1.130,-0.07570,-0.4730
2,S003,-0.528,-1.3200,0.4350,,-0.2400,1.0400,-0.0213,-0.0479,0.4190,...,0.00112,-0.1450,0.01050,-0.1160,,0.151,-0.07400,-0.540,0.32000,-0.4190
3,S005,-1.670,-1.1900,-0.4430,0.2430,-0.0993,0.7570,0.7400,-0.9290,0.2290,...,0.07250,-0.0552,-0.07140,0.0933,0.1560,-0.398,-0.07520,-0.797,-0.03010,-0.4670
4,S006,-0.374,-0.0206,-0.5370,0.3110,0.3750,0.0131,-1.1000,,0.5650,...,-0.17600,,-1.22000,-0.5620,0.9370,-0.646,0.20700,-1.850,-0.17600,0.0513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S099,-1.070,-0.7120,0.0462,-0.0471,0.2270,1.3500,1.2100,0.0048,0.6820,...,-0.06990,-0.4010,0.55700,0.8270,0.3480,-0.187,-0.10700,-0.830,0.06200,-0.5280
91,S100,-1.280,-0.7360,-0.5520,0.1140,0.2730,1.0700,0.8500,,-0.5360,...,0.13100,,-0.00364,0.7630,0.0784,-0.203,0.01320,-1.200,-0.35700,0.0299
92,S101,-0.290,-0.3200,3.1700,-0.9070,0.0317,-0.0425,,-0.2730,-0.0747,...,-0.17900,-0.5210,0.05230,0.5150,0.5850,0.138,-0.19000,-0.966,-0.00627,-0.2490
93,S102,0.266,1.3900,-0.0655,0.4700,0.3980,-0.1340,0.4610,1.0400,0.3630,...,-0.14000,,-0.01220,0.2500,0.5530,0.387,0.06420,-0.437,0.10400,-0.4980


In [6]:
df_cna = read_data('./data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_WGS_CNA_gene_level_THRESHOLD.txt')
df_cna.to_csv(f'{output_dir}/UCEC_discovery_WGS_CNA_gene_level_THRESHOLD.csv', index=False)
df_cna

Unnamed: 0,id,DDX11L1,FAM138A|chr1,FAM87B,LOC100133331|chr1,LOC100288069,LOC729737,MIR6723,MIR6859-1|chr1,OR4F16|chr1,...,DAZ3,TTTY3,CDY1,CSPG4P1Y,GOLGA2P2Y,TTTY3B,DDX11L16|chrY,IL9R|chrY,SPRY3|chrY,VAMP7|chrY
0,S001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,S002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,S003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,S005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,S006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S099,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91,S100,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0
92,S101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93,S102,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df_sommut = read_data('./data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_somatic_mutation_gene_level.txt', suffix='_sommut')
df_sommut.to_csv(f'{output_dir}/UCEC_discovery_somatic_mutation_gene_level.csv', index=False)
df_sommut

Unnamed: 0,id,A1BG_sommut,A1CF_sommut,A2M_sommut,A2ML1_sommut,A3GALT2_sommut,A4GALT_sommut,A4GNT_sommut,AAAS_sommut,AACS_sommut,...,ZUFSP_sommut,ZWILCH_sommut,ZXDA_sommut,ZXDB_sommut,ZXDC_sommut,ZYG11A_sommut,ZYG11B_sommut,ZYX_sommut,ZZEF1_sommut,ZZZ3_sommut
0,S001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,S002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,S003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,S005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,S006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S099,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
91,S100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92,S101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93,S102,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_cna = read_data('./data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_WGS_CNA_gene_level_THRESHOLD.txt', suffix='_cna')
df_cna.to_csv(f'{output_dir}/UCEC_discovery_WGS_CNA_gene_level_THRESHOLD.csv', index=False)
df_cna

Unnamed: 0,id,DDX11L1_cna,FAM138A|chr1_cna,FAM87B_cna,LOC100133331|chr1_cna,LOC100288069_cna,LOC729737_cna,MIR6723_cna,MIR6859-1|chr1_cna,OR4F16|chr1_cna,...,DAZ3_cna,TTTY3_cna,CDY1_cna,CSPG4P1Y_cna,GOLGA2P2Y_cna,TTTY3B_cna,DDX11L16|chrY_cna,IL9R|chrY_cna,SPRY3|chrY_cna,VAMP7|chrY_cna
0,S001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,S002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,S003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,S005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,S006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S099,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91,S100,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0
92,S101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93,S102,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
