In [33]:
import numpy as np
import pandas as pd
import xenaPython as xena

In [34]:
# Load the TCGA-DLBC phenotype table (tab-separated, ISO-8859-1 encoded)
df = pd.read_csv(
    '../data/TCGA-DLBC.GDC_phenotype.tsv',
    sep='\t',
    encoding="ISO-8859-1",
)

In [35]:
# Preview the first rows of the phenotype table
df.head()

Unnamed: 0,submitter_id.samples,age_at_initial_pathologic_diagnosis,b_lymphocyte_genotyping_method,b_symptoms,batch_number,bcr,bcr_followup_barcode,bcr_followup_uuid,submitter_id,bone_marrow_biopsy_done,...,intermediate_dimension.samples,is_ffpe.samples,longest_dimension.samples,oct_embedded.samples,preservation_method.samples,sample_type.samples,sample_type_id.samples,shortest_dimension.samples,state.samples,tissue_type.samples
0,TCGA-FA-A6HN-01A,73,,NO,326.45.0,Nationwide Children's Hospital,TCGA-FA-A6HN-F63567,2FF9624C-C1FF-4A3B-9FFB-0A2E1136A1CD,TCGA-FA-A6HN,NO,...,,False,,True,,Primary Tumor,1,,released,Not Reported
1,TCGA-GR-A4D4-01A,57,,YES,326.45.0,Nationwide Children's Hospital,TCGA-GR-A4D4-F55574,C7009E8E-9B88-4AC1-BCBE-5F909CCB7159,TCGA-GR-A4D4,YES,...,,False,,False,,Primary Tumor,1,,released,Not Reported
2,TCGA-GS-A9TT-01A,70,,NO,397.39.0,Nationwide Children's Hospital,TCGA-GS-A9TT-F66756,01292404-F955-478D-8493-A93DA649AF6C,TCGA-GS-A9TT,YES,...,,False,,True,,Primary Tumor,1,,released,Not Reported
3,TCGA-FF-A7CQ-01A,74,,YES,397.39.0,Nationwide Children's Hospital,TCGA-FF-A7CQ-F50787,4B8ED15F-FE05-4943-AA0E-F97D130A9D26,TCGA-FF-A7CQ,YES,...,,False,,False,,Primary Tumor,1,,released,Not Reported
4,TCGA-FF-8046-01A,51,,NO,212.48.0,Nationwide Children's Hospital,TCGA-FF-8046-F30762,85B1822D-B3B9-4102-9E8C-DCF8052EA79A,TCGA-FF-8046,,...,,False,,,,Primary Tumor,1,,released,Not Reported


In [36]:
# Index the phenotype table by the sample submitter id.
# Guarded so the cell can be re-run: once the column has become the index,
# a second set_index on the same name would raise KeyError.
if df.index.name != "submitter_id.samples":
    df = df.set_index("submitter_id.samples")

In [37]:
# Split the phenotype table into the two groups being compared:
# df_1 holds rows whose clinical_stage is "Stage I",
# df_2 holds rows whose clinical_stage is "Stage II" or "Stage III".
df_1 = df.query('clinical_stage=="Stage I"')
df_2 = df.query('clinical_stage=="Stage II" | clinical_stage=="Stage III"')

In [38]:
# Xena hub URL and dataset identifier passed to the xena.* calls in this notebook
hub = "https://gdc.xenahubs.net"
cohort = "GDC TCGA Large B-cell Lymphoma (DLBC)"  # not referenced by the visible cells
dataset = "TCGA-DLBC.htseq_counts.tsv"

In [39]:
# Ask the hub which sample names this dataset contains
samples = xena.dataset_samples(hub, dataset, None)
n_samples = len(samples)
print("len(samples): {}".format(n_samples))

len(samples): 48


In [40]:
# Keep only the dataset samples that appear in each stage group, preserving
# the order in which the dataset lists them.
stage1_ids = set(df_1.index)
stage23_ids = set(df_2.index)
stage1 = [sample for sample in samples if sample in stage1_ids]
stage23 = [sample for sample in samples if sample in stage23_ids]

In [41]:
# Report how many dataset samples fall into each stage group
print("len(stage1): {}, len(stage23): {}".format(len(stage1),len(stage23)))

len(stage1): 8, len(stage23): 22


In [42]:
# Create phenotype file (disabled: kept for reference only, not executed)
#columns = ['clinical_stage']
#pheno = pd.concat([df.loc[stage1, columns], df.loc[stage23, columns]])
#pheno.to_csv("../outputs/Stage1and4Phenotype.tsv", sep='\t')

In [43]:
# Fetch the dataset's field identifiers from the hub; they are used later as
# the row labels (probes) of the fetched values.
probes = xena.dataset_field(hub, dataset)

In [44]:
# Because of connection timeout, fetch the values of probes by batch
def divide_probes_to_batches(num_probes, limit=10000):
    """Split ``num_probes`` items into consecutive batch sizes of at most ``limit``.

    Parameters
    ----------
    num_probes : int
        Total number of probes to distribute over the batches.
    limit : int, optional
        Maximum size of a single batch. Must be positive.

    Returns
    -------
    list of int
        Batch sizes in order. Every entry equals ``limit`` except possibly the
        last one, which holds the remainder. The sizes sum to ``num_probes``.
        An empty list is returned when there is nothing to fetch.

    Raises
    ------
    ValueError
        If ``limit`` is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive integer, got {}".format(limit))
    # Nothing to split: return no batches instead of a single zero-sized one,
    # so callers do not issue an empty fetch.
    if num_probes <= 0:
        return []
    full_batches, remainder = divmod(num_probes, limit)
    batches = [limit] * full_batches
    if remainder:
        batches.append(remainder)
    return batches

In [45]:
# Fetch counts from xena db
def dataset_fetch(probes, samples, prefix=''):
    """Fetch the values of ``probes`` for ``samples`` from Xena as a DataFrame.

    Uses the notebook-level ``hub`` and ``dataset`` settings for the request.

    Parameters
    ----------
    probes : list
        Probe identifiers to fetch; they become the row index of the result.
    samples : list
        Sample identifiers to fetch; used as column labels unless ``prefix``
        is given.
    prefix : str, optional
        When non-empty, the columns are labelled ``prefix + "1"``,
        ``prefix + "2"``, ... following the order of ``samples`` instead of
        using the sample identifiers themselves.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``probes`` (index named ``'Probes'``) with one column
        per entry of ``samples``.
    """
    # assumes the service returns one record per probe holding one value per
    # sample — TODO confirm against the xenaPython documentation
    counts = xena.dataset_fetch(hub, dataset, samples, probes)
    frame = pd.DataFrame.from_records(counts)
    # Build the index together with its name: naming the default index first
    # and replacing it afterwards silently drops the name.
    frame.index = pd.Index(probes, name='Probes')
    if prefix != '':
        column_labels = [prefix + str(pos + 1) for pos in range(len(samples))]
    else:
        column_labels = list(samples)
    frame.columns = column_labels

    return frame

In [46]:
# Fetch the values of every probe for both stage groups, batch by batch.
# limit=2000 is a smaller batch size than the helper's default of 10000.
batches = divide_probes_to_batches(len(probes), limit=2000)
df_1_parts = []
df_2_parts = []
i = 0  # offset of the current batch within `probes`
for batch in batches:
    batch_probes = probes[i:i + batch]
    # fetch the stage1 values for this batch of probes
    df_1_parts.append(dataset_fetch(batch_probes, stage1, prefix='stage1_'))
    # fetch the stage23 values for this batch of probes
    df_2_parts.append(dataset_fetch(batch_probes, stage23, prefix='stage23_'))
    i += batch

# Concatenate once at the end instead of growing a frame inside the loop;
# fall back to an empty frame when there was nothing to fetch.
df_1_counts = pd.concat(df_1_parts) if df_1_parts else pd.DataFrame()
df_2_counts = pd.concat(df_2_parts) if df_2_parts else pd.DataFrame()

In [47]:
# Place the stage1 and stage23 columns side by side; join='inner' keeps only
# the row labels present in both frames. The combined table is then saved as CSV.
df_lg = pd.concat([df_1_counts, df_2_counts], axis=1, join='inner')
df_lg.to_csv("../outputs/s1s23/s1s23-logcounts.csv", sep=',')

In [48]:
def valuation_formula(x):
    """Map every value ``v`` in ``x`` to ``round(2**v) - 1``.

    This is the inverse of a ``log2(count + 1)`` transform, rounded to whole
    counts.

    Parameters
    ----------
    x : pandas.Series or mutable sequence of numbers
        Values to convert.

    Returns
    -------
    pandas.Series or the input sequence
        For a Series, a new Series with the same labels is returned and the
        input is left untouched. Any other mutable sequence (list, ndarray)
        is updated in place and returned, as before.
    """
    if isinstance(x, pd.Series):
        # Vectorised and label-safe: avoids integer-key assignment on a
        # labelled Series and does not mutate the row passed in by apply().
        return (2.0 ** x.astype(float)).round() - 1
    for pos, value in enumerate(x):
        x[pos] = round(2 ** value) - 1
    return x


In [49]:
def norm_counts_to_raw(df):
    """Convert a frame of log2-scale values back to whole counts.

    Each cell ``v`` becomes ``round(2**v) - 1``. The computation is applied to
    the whole frame at once rather than row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame of values to convert.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns; ``df`` is not modified.
    """
    return (2.0 ** df.astype(float)).round() - 1

In [50]:
# Before loading the file, ensure that the six lines at the bottom of the file have been deleted.
# NOTE(review): this is a manual step that no cell in this notebook performs — confirm it is still required.
df_a = pd.read_csv('../outputs/s1s23/s1s23-logcounts.csv', sep=',', encoding="ISO-8859-1", index_col=0)

In [51]:
# Preview the table as loaded from the logcounts CSV
df_a.head()

Unnamed: 0,stage1_1,stage1_2,stage1_3,stage1_4,stage1_5,stage1_6,stage1_7,stage1_8,stage23_1,stage23_2,...,stage23_13,stage23_14,stage23_15,stage23_16,stage23_17,stage23_18,stage23_19,stage23_20,stage23_21,stage23_22
ENSG00000000003.13,6.443,8.017,7.0,6.615,7.781,8.033,8.267,6.0,6.304,5.807,...,7.17,7.129,7.077,4.907,7.129,7.755,7.209,6.524,6.375,6.459
ENSG00000000005.5,0.0,0.0,0.0,0.0,1.0,0.0,3.585,1.585,1.585,1.0,...,1.585,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.585
ENSG00000000419.11,9.979,10.41,11.0,10.62,11.48,9.683,10.63,10.21,9.853,10.96,...,10.17,11.86,10.97,9.954,11.11,11.34,10.98,10.33,10.89,9.817
ENSG00000000457.12,7.508,9.418,10.19,7.994,9.003,7.109,10.74,8.994,7.748,9.692,...,9.053,10.13,9.09,7.219,7.644,10.67,9.369,9.681,8.119,8.418
ENSG00000000460.15,8.676,9.409,9.331,9.392,10.04,8.082,9.709,9.098,8.397,9.92,...,9.954,10.34,9.893,8.867,8.405,9.658,10.23,9.437,9.285,9.074


In [52]:
# Convert the loaded values with norm_counts_to_raw
df2 = norm_counts_to_raw(df_a)

In [53]:
# Preview the converted values
df2.head()

Unnamed: 0,stage1_1,stage1_2,stage1_3,stage1_4,stage1_5,stage1_6,stage1_7,stage1_8,stage23_1,stage23_2,...,stage23_13,stage23_14,stage23_15,stage23_16,stage23_17,stage23_18,stage23_19,stage23_20,stage23_21,stage23_22
ENSG00000000003.13,86.0,258.0,127.0,97.0,219.0,261.0,307.0,63.0,78.0,55.0,...,143.0,139.0,134.0,29.0,139.0,215.0,147.0,91.0,82.0,87.0
ENSG00000000005.5,0.0,0.0,0.0,0.0,1.0,0.0,11.0,2.0,2.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,2.0
ENSG00000000419.11,1008.0,1360.0,2047.0,1573.0,2855.0,821.0,1584.0,1183.0,924.0,1991.0,...,1151.0,3716.0,2005.0,991.0,2209.0,2591.0,2019.0,1286.0,1897.0,901.0
ENSG00000000457.12,181.0,683.0,1167.0,254.0,512.0,137.0,1709.0,509.0,214.0,826.0,...,530.0,1120.0,544.0,148.0,199.0,1628.0,660.0,820.0,277.0,341.0
ENSG00000000460.15,408.0,679.0,643.0,671.0,1052.0,270.0,836.0,547.0,336.0,968.0,...,991.0,1295.0,950.0,466.0,338.0,807.0,1200.0,692.0,623.0,538.0


In [54]:
# Save the converted table as CSV
df2.to_csv("../outputs/s1s23/s1s23-rawcounts.csv")

In [55]:
# Reload the saved table from disk, using its first column as the index
df2 = pd.read_csv('../outputs/s1s23/s1s23-rawcounts.csv', sep=',', encoding="ISO-8859-1", index_col=0)

In [56]:
# Preview the reloaded table
df2.head()

Unnamed: 0,stage1_1,stage1_2,stage1_3,stage1_4,stage1_5,stage1_6,stage1_7,stage1_8,stage23_1,stage23_2,...,stage23_13,stage23_14,stage23_15,stage23_16,stage23_17,stage23_18,stage23_19,stage23_20,stage23_21,stage23_22
ENSG00000000003.13,86.0,258.0,127.0,97.0,219.0,261.0,307.0,63.0,78.0,55.0,...,143.0,139.0,134.0,29.0,139.0,215.0,147.0,91.0,82.0,87.0
ENSG00000000005.5,0.0,0.0,0.0,0.0,1.0,0.0,11.0,2.0,2.0,1.0,...,2.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,2.0
ENSG00000000419.11,1008.0,1360.0,2047.0,1573.0,2855.0,821.0,1584.0,1183.0,924.0,1991.0,...,1151.0,3716.0,2005.0,991.0,2209.0,2591.0,2019.0,1286.0,1897.0,901.0
ENSG00000000457.12,181.0,683.0,1167.0,254.0,512.0,137.0,1709.0,509.0,214.0,826.0,...,530.0,1120.0,544.0,148.0,199.0,1628.0,660.0,820.0,277.0,341.0
ENSG00000000460.15,408.0,679.0,643.0,671.0,1052.0,270.0,836.0,547.0,336.0,968.0,...,991.0,1295.0,950.0,466.0,338.0,807.0,1200.0,692.0,623.0,538.0


In [57]:
# Start the sample annotation table with one row per column of df2
t_df = pd.DataFrame({'id': df2.columns})

In [58]:
# For every column id, record its condition label and the original sample
# name. Ids are matched to stage1 / stage23 by position, so this relies on the
# columns keeping the order in which the samples were fetched.
condition = []
samples = []
i = j = 0
for rep in t_df['id']:
    if not rep.startswith("stage1"):
        # every id that is not a stage1 column is treated as stage23
        condition.append('S23')
        samples.append(stage23[j])
        j += 1
        continue
    condition.append('S1')
    samples.append(stage1[i])
    i += 1

In [59]:
# Attach the condition labels and original sample names as new columns
t_df = t_df.assign(conditionName=condition, sampleId=samples)

In [60]:
# Display the sample annotation table
t_df

Unnamed: 0,id,conditionName,sampleId
0,stage1_1,S1,TCGA-GR-7351-01A
1,stage1_2,S1,TCGA-FF-8062-01A
2,stage1_3,S1,TCGA-GS-A9TX-01A
3,stage1_4,S1,TCGA-FF-8041-01A
4,stage1_5,S1,TCGA-FA-8693-01A
5,stage1_6,S1,TCGA-GS-A9TY-01A
6,stage1_7,S1,TCGA-FA-A7Q1-01A
7,stage1_8,S1,TCGA-GR-A4D6-01A
8,stage23_1,S23,TCGA-FF-8046-01A
9,stage23_2,S23,TCGA-FF-A7CW-01A


In [61]:
# Write the sample annotation table as a tab-separated file without the row index
t_df.to_csv('../outputs/s1s23/pd-s1s23.txt', sep='\t', encoding="ISO-8859-1", index=False)