## miRNA for datafreeze

This notebook combines the miRNA data from the WashU pipeline. The format of the output file is for the datafreeze according to Yize's instructions: rows are miRNA identifiers and columns are Patient_IDs. Normal samples have a ".A" appended to the Patient_ID and tumor samples have a ".T". Replicates have a ".1" appended to the Patient_ID. When the combined files are loaded below, the tables are transposed so that Patient_IDs become rows, the ".T" suffix is removed, and ".A" is renamed to ".N".

In [1]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re
import glob

In [2]:
def average_replicates(df, common = r'\.', to_drop = r'\.\d$'):
    """Returns a df with one row for each patient_ID (all replicates for a patient are averaged)

    A row counts as a replicate entry when its index label matches `common`.
    The patient_ID of a row is its index label with `to_drop` removed. For
    every patient_ID that has at least one replicate entry, all rows with
    exactly that patient_ID are averaged column-wise and stored under the
    patient_ID; the unaveraged replicate rows are then removed.

    Parameters:
    df (pandas.DataFrame): The df containing replicates (duplicate entries for the same tissue_type).
        Samples are rows and the index holds string labels. The df is not modified.
    common: regex string that is common between replicates (identifies duplicate entries)
    to_drop: regex string to drop to find each patient_ID that has replicates (used to slice out all replicates)

    Returns:
    pandas.DataFrame: new df with replicate rows averaged and one row for each patient_ID.
        An averaged row replaces an existing row with the same label in place;
        otherwise it is appended at the end.
    """
    is_replicate = df.index.str.contains(common)
    base_ids = df.index.str.replace(to_drop, '', regex=True)
    replicate_ids = base_ids[is_replicate].unique() # patient_IDs that have replicates (without #s)

    result = df.copy() # work on a copy so the caller's df does not gain rows
    for patient_ID in replicate_ids:
        # Match on the exact patient_ID: a substring/regex match would also pull in
        # other samples that merely share a prefix (e.g. 'X.N' when averaging 'X').
        same_patient = df[base_ids == patient_ID]
        result.loc[patient_ID] = list(same_patient.mean(axis=0))

    # Drop unaveraged replicate rows, but keep averaged rows even when their own
    # label matches `common` (e.g. 'X.N' with the default pattern).
    stale = result.index.str.contains(common) & ~result.index.isin(replicate_ids)
    return result[~stale]

In [3]:
def rename_duplicate_index(df):
    """Make the index labels of df unique by numbering repeated labels.

    The first occurrence of a repeated label is left unchanged; each later
    occurrence gets '.i' appended, where i counts up from 1 in row order.
    The index of df is replaced in place and the same df is returned.
    """
    labels = pd.Series(df.index[:])
    repeated = labels[labels.duplicated()].unique()

    for dup in repeated:
        hits = labels == dup
        positions = labels[hits].index.values.tolist()
        numbered = [dup if k == 0 else dup + '.' + str(k) for k in range(sum(hits))]
        labels[positions] = numbered

    # install the de-duplicated labels as the new index
    df.index = labels
    return df

In [4]:
# Cancer-type prefix used to build the input file names (e.g. 'PDA_mature_miRNA_combined.tsv').
ca = 'PDA'

In [5]:
# Directory that holds the combined miRNA tables (relative to this notebook).
file_path = '../../../miRNA/datafreeze/'

# One combined file per miRNA type, each prefixed with the cancer type in `ca`.
suffixes = ("_mature_miRNA_combined.tsv", "_precursor_miRNA_combined.tsv",
            "_total_miRNA_combined.tsv")
li = [ca + suffix for suffix in suffixes]

In [6]:
# Load each combined miRNA file into `d`, keyed by '<type>_miRNA'.
# Each stored table has Patient_IDs as rows (tumor samples first, then normal
# samples, each group sorted by Patient_ID) and miRNA annotations as columns.
d = {}
for file_name in li:
    if 'miRNA_combined' not in file_name:
        continue

    miRNA_type = file_name.split('_')[1] # get type of miRNA data (precursor, mature, or total)
    print(miRNA_type)

    # Mature tables carry one extra annotation column, 'Derives_from'.
    annotation_cols = ['Name', 'ID', 'Alias']
    if miRNA_type == 'mature':
        annotation_cols.append('Derives_from')

    df = pd.read_csv(file_path + file_name, delimiter = '\t', index_col = annotation_cols)
    df = df.transpose()
    # Replicates are not averaged here; to do so, call
    # average_replicates(df, common = r'\.\d$') at this point.

    # Tumor samples lose their '.T' suffix; normal samples are relabelled '.A' -> '.N'.
    df.index = df.index.str.replace(r'\.T$', '', regex = True)
    df.index = df.index.str.replace(r'\.A$', '.N', regex = True)
    df.index.name = 'Patient_ID'

    # Sort each group by Patient_ID and store tumor rows followed by normal rows
    # (previously the two sorted groups were computed but the unsorted df was stored).
    is_normal = df.index.str.contains(r'\.N$', regex = True)
    normal = df.loc[is_normal].sort_index()
    tumor = df.loc[~is_normal].sort_index()

    d[miRNA_type + '_miRNA'] = pd.concat([tumor, normal])

mature
precursor
total


In [10]:
# Pull the precursor table out of `d` and display it.
df = d['precursor_miRNA']
df
# Optional spot check of a single patient's rows:
#df[df.index.str.contains('C3N-00326')]

Name,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-9900,hsa-mir-9901,hsa-mir-9902-1,hsa-mir-9902-2,hsa-mir-9903,hsa-mir-9983,hsa-mir-9985,hsa-mir-9986,hsa-mir-99a,hsa-mir-99b
ID,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Alias,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Patient_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
C3L-00017,299.581512,330.466204,234.723659,271.785290,101.919484,30.884692,12.353877,287.227636,219.281313,151.334991,...,,,,,,,3.088469,,49.415507,33.973161
C3L-00102,324.193293,379.453513,187.884749,206.304823,114.204455,18.420073,33.156132,202.620808,158.412632,125.256500,...,,25.788103,,,,,,,58.944235,44.208176
C3L-00189,113.321904,98.699723,51.177634,124.288539,7.311091,7.311091,32.899908,91.388632,47.522089,7.311091,...,,3.655545,,,,,,,,7.311091
C3L-00277,315.800554,202.736158,140.355802,374.282139,35.088950,42.886495,19.493861,144.254574,152.052119,128.659485,...,,3.898772,,,,,3.898772,,27.291406,42.886495
C3L-00401,27.565675,58.577060,44.794222,99.925573,20.674256,10.337128,6.891419,75.805607,17.228547,34.457094,...,,10.337128,,,3.445709,,,,24.119966,20.674256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03069.N,74.722459,,32.023911,138.770282,42.698548,,,85.397096,21.349274,64.047822,...,,42.698548,,,,,,,10.674637,21.349274
C3N-03173.N,29.848391,25.584336,36.244475,23.452308,10.660140,2.132028,4.264056,29.848391,14.924196,21.320280,...,,2.132028,2.132028,,,,,,,10.660140
C3N-03440.N,44.677671,73.108916,44.677671,113.724981,8.123213,16.246426,8.123213,32.492852,24.369639,24.369639,...,,4.061606,,,,,,,4.061606,8.123213
C3N-03780.N,79.138185,59.353639,49.461366,79.138185,13.189698,,3.297424,75.840761,26.379395,23.081971,...,,19.784546,,,,,6.594849,,16.487122,9.892273


In [12]:
# Pull the total table out of `d` and display it.
df = d['total_miRNA']
df

Name,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-9900,hsa-mir-9901,hsa-mir-9902-1,hsa-mir-9902-2,hsa-mir-9903,hsa-mir-9983,hsa-mir-9985,hsa-mir-9986,hsa-mir-99a,hsa-mir-99b
ID,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Alias,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Patient_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
C3L-00017,29414.580663,29037.787421,29145.883843,11377.920534,5469.678954,8582.855907,7498.803218,25467.517025,25035.131337,12668.900659,...,,,,,,,43.238569,,5435.705792,200.750498
C3L-00102,27862.203114,27534.325807,27913.779320,11597.278250,5275.509039,7080.676238,8841.635260,18383.233312,18342.709151,9991.047844,...,,25.788103,,,,,33.156132,,4815.007202,232.092926
C3L-00189,41059.084578,40730.085503,40741.052139,15612.833888,5772.105996,8930.497118,14819.580563,25552.261503,25537.639322,8250.565696,...,,3.655545,,,,,18.277726,,5157.974389,149.877356
C3L-00277,25887.847917,25119.789778,25533.059640,13852.337899,5103.492910,7368.679603,7914.507722,17809.591760,17139.002928,10230.378454,...,,7.797545,,,,,58.481584,,4397.815128,916.211485
C3L-00401,25818.700554,26053.008793,26332.111255,11432.863798,3635.223420,7229.098327,7787.303250,19630.206467,20205.639937,12090.994294,...,,10.337128,,,3.445709,,27.565675,,1995.065744,182.622598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03069.N,61357.813834,61315.115286,61902.220325,53768.146883,10194.278395,10226.302306,15147.309991,30230.572161,30721.605465,10322.374039,...,,53.373185,,,,,74.722459,,2359.094791,85.397096
C3N-03173.N,23906.429557,24160.140884,23859.524942,8270.136471,2778.032437,8182.723325,6649.795219,21910.851383,21516.426209,11037.508768,...,,4.264056,2.132028,,,,46.904615,,1396.478316,91.677202
C3N-03440.N,40477.969847,40689.173382,40266.766311,18500.617364,5726.865090,6445.769431,6616.356901,25831.817000,26416.688329,11697.426566,...,,4.061606,,,,,77.170522,,1998.310372,89.355342
C3N-03780.N,47413.665186,47476.316249,48178.667643,22890.720059,7646.727141,8306.212018,10578.137417,26742.111737,27075.151599,10845.228792,...,,19.784546,,,,,56.056214,,2318.089340,181.358341


In [11]:
# Pull the mature table out of `d` and display it (this one has the extra 'Derives_from' column level).
df = d['mature_miRNA']
df

Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9899,hsa-miR-9901,hsa-miR-9903,hsa-miR-9983-3p,hsa-miR-9985,hsa-miR-9986,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0031827,MI0031829,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000101,MI0000746,MI0000746
Patient_ID,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
C3L-00017,,86.477138,83.388668,28707.321216,28824.683046,29031.610482,101.919484,11004.215760,12.353877,5355.405593,...,,,,,40.150100,,71.034792,5315.255494,15.442346,151.334991
C3L-00102,7.368029,51.576206,51.576206,27147.504264,27674.318365,27486.433616,88.416353,11302.557075,,5161.304583,...,,,,,33.156132,,99.468397,4656.594570,44.208176,143.676573
C3L-00189,,29.244362,51.177634,40631.385781,40660.630143,40894.585041,146.221811,15342.323538,10.966636,5753.828270,...,,,,,18.277726,,18.277726,5139.696663,40.210998,102.355268
C3L-00277,,74.076673,62.380356,24917.053620,25318.627164,25509.667006,74.076673,13403.979087,,5068.403960,...,,3.898772,,,54.582812,,27.291406,4343.232316,27.291406,846.033584
C3L-00401,,62.022769,55.131350,25994.431734,26225.294264,25736.003528,137.828376,11195.109849,,3614.549163,...,,,,,27.565675,,10.337128,1960.608650,55.131350,106.816991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03069.N,,128.095645,138.770282,61315.115286,61742.100769,61144.321093,234.842015,53394.534586,,10151.579846,...,,10.674637,,,74.722459,,32.023911,2316.396243,,64.047822
C3N-03173.N,,55.432727,40.508531,24134.556549,23767.847739,23836.072634,123.657622,8123.026542,2.132028,2765.240269,...,,2.132028,,,46.904615,,10.660140,1385.818176,23.452308,57.564755
C3N-03440.N,,142.156226,97.478555,40616.064466,40079.932415,40335.813621,223.388355,18163.504029,4.061606,5714.680270,...,,,,,77.170522,,8.123213,1986.125552,12.184819,69.047310
C3N-03780.N,,59.353639,72.543336,47416.962611,48069.852638,47261.983665,220.927434,22590.654440,,7633.537444,...,,,,,49.461366,,9.892273,2291.709945,65.948488,105.517580


In [33]:
# Normalise the sample suffixes on whatever frame `df` currently holds:
# tumor samples lose '.T', normal samples are relabelled '.A' -> '.N'.
# Note: this rewrites df.index in place.
df.index = df.index.str.replace(r'\.T$', '', regex = True)
df.index = df.index.str.replace(r'\.A$', '.N', regex = True)
df

Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-98-5p,hsa-miR-9851-5p,hsa-miR-9899,hsa-miR-9901,hsa-miR-9983-3p,hsa-miR-9985,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0000096,MIMAT0048639,MIMAT0039319,MIMAT0039321,MIMAT0041993,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0000096,MIMAT0048639,MIMAT0039319,MIMAT0039321,MIMAT0041993,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0000100,MI0039502,MI0031827,MI0031829,MI0033670,MI0032313,MI0000101,MI0000101,MI0000746,MI0000746
Patient_ID,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
C3L-00104,,125.272610,108.189982,38561.187129,38549.798710,38709.236577,216.379963,22133.392553,,8063.000735,...,609.280423,,,,,,182.214706,5540.465900,11.388419,222.074173
C3L-00365,,28.594917,39.318011,21299.638989,21760.732030,22161.060871,89.359116,12767.630554,7.148729,7420.381027,...,368.159560,,,,,3.574365,225.184973,7445.401580,60.764199,175.143868
C3L-00674,,15.936594,21.248791,30869.181815,29982.044771,30502.640162,69.058572,15442.559204,5.312198,8972.302200,...,605.590557,,,,,10.624396,84.995166,11357.479043,58.434177,2204.562116
C3L-00677,,6.501063,19.503189,21290.981075,20978.930055,21466.509774,117.019133,8373.369046,,7612.744684,...,721.617985,,,,,6.501063,97.515944,13073.637540,182.029762,143.023384
C3L-01040,,56.855655,45.484524,21667.690068,20883.082031,21633.576676,68.226786,7175.183644,11.371131,4804.302836,...,466.216370,,,5.685565,,11.371131,85.283482,4400.627686,79.597917,216.051488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03183,,61.862685,41.241790,29680.341757,29195.750721,28645.860183,134.035818,8468.314276,3.436816,7213.876488,...,780.157200,,,,,27.494527,158.093530,9399.691374,44.678606,175.277609
C3N-03184,,66.734428,77.001263,28339.031427,28362.131806,27856.490180,128.335438,11624.623977,5.133418,5510.723709,...,372.172770,,,,,2.566709,125.768729,6873.646061,84.701389,77.001263
C3N-03186,,37.422193,70.686365,27767.267224,28029.222575,27343.149036,108.108558,12174.686797,4.158021,3966.752461,...,573.806960,,4.158021,8.316043,,4.158021,74.844386,2956.353249,66.528343,174.636901
C3N-03188,,67.080035,37.914803,17878.287651,17910.369407,17621.633603,81.662652,6801.332268,,2726.949258,...,784.544760,,,,,2.916523,49.580896,2219.474209,134.160070,166.241826


In [None]:
# Reference snippet, not runnable in this notebook: it starts with `elif` and writes
# to `self._data`, so it is an interior slice of a larger if/elif chain inside a class.
# NOTE(review): presumably copied from a dataset loader that consumes these files - confirm origin.
# NOTE(review): `tumor.append(normal)` relies on DataFrame.append, which was removed in
# pandas 2.0; pd.concat([tumor, normal]) is the replacement.
elif 'miRNA_combined' in file_name:
                miRNA_type = file_name.split('_')[1] # get type of miRNA data (precursor, mature, or total)
                if miRNA_type == 'mature':
                    df = pd.read_csv(file_path, delimiter = '\t', index_col = ['Name', 'ID','Alias', 'Derives_from'])
                else:
                    df = pd.read_csv(file_path, delimiter = '\t', index_col = ['Name', 'ID','Alias'])
                df = df.transpose()
                df.index = df.index.str.replace('\.T$','', regex = True)
                df.index = df.index.str.replace('\.A$','.N', regex = True)
                df.index.name = 'Patient_ID'                
                # Sort
                normal = df.loc[df.index.str.contains('\.N$', regex =True)]
                normal = normal.sort_values(by=["Patient_ID"])
                tumor = df.loc[~ df.index.str.contains('\.N$', regex =True)]
                tumor = tumor.sort_values(by=["Patient_ID"])
                all_df = tumor.append(normal)
                self._data[miRNA_type+'_miRNA'] = all_df
                
            elif file_name == "GBM_xCell.txt":
                df = pd.read_csv(file_path, sep = '\t', index_col = 0) 
                df = df.transpose()
                df.columns.name = 'Name'
                df.index.name = 'Patient_ID'
                df.index = df.index.str.replace(r'-T$', '', regex=True) # remove label for tumor samples
                df.index = df.index.str.replace(r'-A$', '.N', regex=True) # change label for normal samples
                self._data["xcell"] = df
                
            elif file_name == "CIBERSORT.Output_Abs_GBM.txt":
                df = pd.read_csv(file_path, sep = '\t', index_col = 0) 
                df.index.name = 'Patient_ID'
                df.columns.name = 'Name'
                df.index = df.index.str.replace(r'-T$', '', regex=True) 
                df.index = df.index.str.replace(r'-A$', '.N', regex=True)
                self._data["cibersort"] = df

In [8]:
# Display the stored total miRNA table.
d['total_miRNA']

Name,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-9900,hsa-mir-9901,hsa-mir-9902-1,hsa-mir-9902-2,hsa-mir-9903,hsa-mir-9983,hsa-mir-9985,hsa-mir-9986,hsa-mir-99a,hsa-mir-99b
ID,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Alias,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Patient_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3


# Test created files

In [92]:
import cptac.pancan as pc

In [82]:
# Instantiate one pancan dataset object per cancer type.
# The statements run in order, so if one constructor raises, the names above it
# stay bound and the names below it are never assigned.
# NOTE(review): the short names `l` and `c` are easy to clobber; a later cell in this
# notebook uses `l` as a loop variable.
g = pc.PancanGbm()
l = pc.PancanLuad()
ls = pc.PancanLscc()
e = pc.PancanUcec()
r = pc.PancanCcrcc()
h = pc.PancanHnscc()
b = pc.PancanBrca()
o = pc.PancanOv()
c = pc.PancanCoad()

                                                 

MissingFileError: Missing data file 'BR_prospective.dnp.annotated.exonic.addrecovercases.maf.gz'. Call "cptac.download(dataset='washubrca', version='1.0')" to download it. Dataset loading aborted.

In [86]:
# Download the data files for the 'pancanbrca' dataset.
# NOTE(review): the saved output shows this opens a Box login page in the browser.
pc.pancan_download('pancanbrca')

Please login to Box on the webpage that was just opened and grant access for cptac to download files through your account. If you accidentally closed the browser window, press Ctrl+C and call the download function again.
                                                 

In [91]:
# Build the pancan BRCA dataset object on its own.
b = pc.PancanBrca()

                                                 

In [88]:
# Select the dataset object that the checks below run against.
# Caution: `ca` held the cancer-type string 'PDA' earlier in this notebook; after this
# cell it is a dataset object, so the cells that build file names from `ca` must not be
# re-run without resetting it.
ca = b

In [93]:
# Request the 'xcell' and 'cibersort' deconvolution tables from the 'washu' source
# of the dataset currently bound to `ca`.
# NOTE(review): the saved output shows DataSourceNotFoundError for xcell on pancanbrca,
# in which case neither `xc` nor `ciber` is assigned by this cell.
xc = ca.get_deconvolution('washu','xcell')
ciber = ca.get_deconvolution('washu','cibersort')

DataSourceNotFoundError: Data source xcell not found for the pancanbrca dataset.

In [None]:
# Sanity-check each deconvolution table: number of IDs, tumor/normal split,
# and whether any index label is duplicated.
decon = {'xcell':xc, 'cibersort':ciber}

for decon_type, df in decon.items():
    # `df` is the table for this decon_type (previously every iteration checked `xc`)
    print(decon_type)
    print('num IDs:', len(df.index))
    # check tumor/normal counts (normal samples end in '.N')
    is_normal = df.index.str.contains(r'\.N$', regex = True)
    tumor = df.loc[~is_normal]
    print('Tumor:', len(tumor))
    normal = df.loc[is_normal]
    print('Normal:', len(normal))

    # check for duplicate indices
    if df.index.duplicated().any():
        print('Duplicates present')
    else:
        print('NO dup index')

    print('\n')

In [63]:
# Request the three miRNA tables from the 'washu' source of the dataset bound to `ca`.
pre = ca.get_miRNA('washu', 'precursor')
mat = ca.get_miRNA('washu', 'mature')
tot = ca.get_miRNA('washu', 'total')

# Despite the name, `all_list` is a dict mapping miRNA type -> table.
all_list = {'precursor':pre, 'mature':mat, 'total':tot}

In [85]:
# Sanity-check each miRNA table: column level names, number of IDs,
# tumor/normal split, and whether any index label is duplicated.
for miRNA_type, df in all_list.items():
    print(miRNA_type)
    print(df.columns.names)

    print('num IDs:', len(df.index))
    # check tumor/normal counts (normal samples end in '.N')
    is_normal = df.index.str.contains(r'\.N$', regex = True)
    tumor = df.loc[~is_normal]
    print('Tumor:', len(tumor))
    normal = df.loc[is_normal]
    print('Normal:', len(normal))

    # check for duplicate indices
    if df.index.duplicated().any():
        print('Duplicates present')
    else:
        print('NO dup index')

    print('\n')

precursor
['Name', 'ID', 'Alias']
num IDs: 100
Tumor: 100
Normal: 0
NO dup index


mature
['Name', 'ID', 'Alias', 'Derives_from']
num IDs: 99
Tumor: 99
Normal: 0
NO dup index


total
['Name', 'ID', 'Alias']
num IDs: 99
Tumor: 99
Normal: 0
NO dup index




In [72]:
# Display whatever frame `df` currently holds.
# NOTE(review): this depends on which cell last assigned `df` in the running kernel.
df

Name,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-9900,hsa-mir-9901,hsa-mir-9902-1,hsa-mir-9902-2,hsa-mir-9903,hsa-mir-9983,hsa-mir-9985,hsa-mir-9986,hsa-mir-99a,hsa-mir-99b
ID,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Alias,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
C3L-00104,38885.757074,38595.352386,38880.062864,22406.714612,8193.967554,7191.786672,10141.387223,16422.100366,15778.654686,6815.968841,...,,,,,,,,,5745.457444,318.875735
C3L-00365,22253.994353,21374.700647,21892.983522,12985.666798,7502.591414,2859.491725,9579.297280,9218.286450,9139.650427,3942.524216,...,,10.723094,,,,,3.574365,,7734.925117,275.226079
C3L-00674,31033.859949,31230.411270,30199.844884,15856.910638,9041.360773,5933.725019,17370.887031,17429.321207,17264.643073,5354.695452,...,,5.312198,,,,,15.936594,,11506.220584,2401.113437
C3L-00677,21538.021467,21375.494893,20998.433244,8581.403059,7697.258502,8951.963646,11630.401571,17708.895404,17669.889027,8626.910500,...,,19.503189,,,,,13.002126,,13229.663050,364.059524
C3L-01040,21718.860158,21679.061199,20996.793341,7334.379477,4849.787360,5264.833640,14759.728003,13002.888267,12832.321303,4866.844056,...,,5.685565,,,,,11.371131,,4491.596734,329.762798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03183,28721.470132,29735.330811,29281.671117,8712.328202,7251.681462,6784.274505,9279.402819,22318.682187,22191.520001,9709.004801,...,,3.436816,3.436816,,,,27.494527,,9599.026694,240.577110
C3N-03184,27992.525744,28385.232184,28469.933574,11837.660804,5536.390797,3557.458342,7702.692991,12209.833575,12084.064845,4884.446772,...,,,,,,,2.566709,,7014.815043,177.102904
C3N-03186,27472.047701,27858.743695,28124.857068,12390.903912,3995.858611,6124.765592,9733.928208,16557.241402,16386.762523,6274.454364,...,,20.790107,,,,,4.158021,,3051.987742,266.113373
C3N-03188,17682.880592,17889.953744,18000.781628,6915.076675,2729.865782,4231.875266,11368.607710,15527.569894,15979.631001,5413.067191,...,,,,,,,2.916523,,2280.721198,335.400176


In [11]:
# check tumor/normal counts
# check tumor/normal counts
# Normal samples end in a literal '.A'; the dot is escaped so that IDs which merely
# end in the letter 'A' are not counted as normal.
is_normal = all_df.index.str.contains(r'\.A$')
tumor = all_df.loc[~is_normal]
print('Tumor:', len(tumor))
normal = all_df.loc[is_normal]
print('Normal:', len(normal))

# check for duplicate indices
if all_df.index.duplicated().any():
    print('Duplicates present')
else:
    print('NO dup index')

all_df

ccRCC mature
Tumor: 110
Normal: 75
NO dup index


Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9851-5p,hsa-miR-9898,hsa-miR-9899,hsa-miR-9901,hsa-miR-9903,hsa-miR-9985,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0048639,MIMAT0039318,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0048639,MIMAT0039318,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0039502,MI0031826,MI0031827,MI0031829,MI0031831,MI0032313,MI0000101,MI0000101,MI0000746,MI0000746
C3L-00004.T,,57.081886,48.300058,33300.694204,32962.593801,33630.012778,127.336515,12610.705927,,6862.999082,...,,,,,,30.736400,48.300058,9343.865673,61.472800,2415.002876
C3L-00010.T,,41.253197,75.005813,46274.836300,46473.601704,45971.062757,138.760754,17836.382320,,8441.904248,...,,,,,,33.752616,30.002325,8141.880996,112.508719,2430.188340
C3L-00011.T,,22.135035,38.736311,38481.757964,38005.854717,37568.687780,420.565661,58525.031957,,7155.149993,...,,,,11.067517,,,,1610.323780,55.337587,254.552900
C3L-00026.T,,43.049636,32.287227,30000.215248,30511.429678,29881.828749,91.480477,10783.933876,,5811.700891,...,,5.381205,,16.143614,,37.668432,69.955659,7130.096001,150.673727,3024.236945
C3L-00079.T,,57.970268,96.617113,42144.384613,43033.262051,44224.873110,161.028521,14924.123361,,5513.616572,...,,,,,,25.764563,57.970268,2228.634736,64.411409,193.234226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646.A,,90.432266,83.475938,32757.349361,33320.811943,33310.377450,139.126563,14114.389860,10.434492,5161.595503,...,,,,6.956328,,48.694297,69.563282,5373.763513,48.694297,163.473712
C3N-01648.A,,84.388186,111.238972,30690.448792,30644.418872,30767.165324,99.731492,12159.570387,11.507480,4188.722670,...,,,,,,38.358266,65.209053,5082.470272,46.029919,161.104718
C3N-01649.A,,59.224665,71.915664,33555.002792,34659.119752,33774.980117,287.662656,15051.525458,4.230333,5283.686143,...,,4.230333,,,,50.763998,67.685331,5419.056805,71.915664,249.589658
C3N-01651.A,,93.273638,76.617631,35097.537576,35377.358491,35450.644921,193.209679,13211.544611,3.331201,4563.745869,...,,3.331201,,,,36.643215,26.649611,4437.160217,76.617631,169.891270


In [61]:
# Show the rows whose index label contains a literal '.1' (the replicate marker).
# The pattern is not anchored, so it matches '.1' anywhere in the label.
test = r'\.1'
all_df.loc[all_df.index.str.contains(test)]

Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9899,hsa-miR-9901,hsa-miR-9903,hsa-miR-9983-3p,hsa-miR-9985,hsa-miR-9986,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0031827,MI0031829,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000101,MI0000746,MI0000746


In [171]:
# check na (first col in LSCC has lots of na)
# no na in file, but na appear when combining when a miRNA was not included in some files
# Look at the non-missing values of the first miRNA column (in LSCC it is mostly NA).
# The input files contain no NA; NA values appear in the combined table when a miRNA
# is absent from some of the files.
# Keep only the 'Name' level of the column labels. This rewrites all_df.columns, so
# the cell is meant to be run once per loaded table.
name_only_columns = all_df.columns.droplevel(['ID','Alias','Derives_from'])
all_df.columns = name_only_columns
first_column = all_df.iloc[:,0]
first_column.dropna()

Patient_ID
C3L-01663       4.727194
C3L-03678       4.113043
C3L-04014       8.284798
C3N-02283       7.457622
C3N-02288      19.479634
C3N-03441       5.603591
C3L-00923.N     3.308104
C3L-02130.N     2.990037
C3L-02358.N     2.943982
C3L-02552.N     4.337586
C3L-02625.N     4.738640
C3L-02968.N     2.691254
C3N-04124.N     8.639831
Name: hsa-let-7a-2-3p, dtype: float64

In [75]:
# Count tumor and normal sample columns in each cancer's combined file for one miRNA type.
ident = 'total'
# Leading annotation columns that are not samples: 4 for mature, 3 otherwise.
n_annotation_cols = 4 if ident == 'mature' else 3

for can in cancer_names:
    path = '~/Documents/miRNA/datafreeze/' + can + '_' + ident + '_miRNA_combined.tsv'
    df = pd.read_csv(path, delimiter = '\t')
    print(can)
    # check tumor/normal counts: normal columns contain '.N', everything else is
    # either a tumor sample or an annotation column
    normal = [col for col in df.columns if '.N' in col]
    tumor = [col for col in df.columns if '.N' not in col]
    print('Tumor:', len(tumor) - n_annotation_cols)
    print('Normal:', len(normal))

GBM
Tumor: 99
Normal: 0
HNSCC
Tumor: 111
Normal: 61
LSCC
Tumor: 109
Normal: 97
LUAD
Tumor: 110
Normal: 101
EC
Tumor: 95
Normal: 15
PDA
Tumor: 145
Normal: 38
ccRCC
Tumor: 110
Normal: 75


In [35]:
# total and precursor have same num of tumor and normal
# mature is off by a couple

In [43]:
# Check patient_IDs
# Collect the sample column names of every combined file, keyed by '<cancer>_<type>'.
new = {}
for can in cancer_names:
    for i in identifier_list:
        path = '~/Documents/miRNA/datafreeze/' + can + '_' + i + '_miRNA_combined.tsv'
        df = pd.read_csv(path, delimiter = '\t')
        # Skip the leading annotation columns: 'Name', 'ID', 'Alias'
        # (plus 'Derives_from' for mature).
        first_sample_col = 4 if i == 'mature' else 3
        new[can + '_' + i] = list(df.columns[first_sample_col:])
print('done')

done


In [71]:
#for can in cancer_names:
can = 'PDA'
t = new[can+'_total'] 
m = new[can+'_mature'] 
p = new[can+'_precursor']

l3 = {'total':t,'pre':p, 'mature':m}

for l in l3:
    print(l, len(l3[l]))

total 183
pre 183
mature 183


In [72]:
# IDs present in exactly one of each pair of lists (an empty set means the two agree).
total_ids, mature_ids, precursor_ids = set(t), set(m), set(p)

n = total_ids.symmetric_difference(precursor_ids)
print('total vs pre:', n, '\n')

n2 = total_ids.symmetric_difference(mature_ids)
print('total vs mature:', n2, '\n')

n3 = mature_ids.symmetric_difference(precursor_ids)
print('mature vs pre:', n3)

total vs pre: set() 

total vs mature: set() 

mature vs pre: set()


In [73]:
# Display the full list of precursor sample IDs.
p

['C3L-00017',
 'C3L-00102',
 'C3L-00189',
 'C3L-00277',
 'C3L-00401',
 'C3L-00589',
 'C3L-00598',
 'C3L-00599',
 'C3L-00622',
 'C3L-00625',
 'C3L-00640',
 'C3L-00819',
 'C3L-00881',
 'C3L-00928',
 'C3L-01031',
 'C3L-01036',
 'C3L-01037',
 'C3L-01051',
 'C3L-01052',
 'C3L-01053',
 'C3L-01054',
 'C3L-01124',
 'C3L-01328',
 'C3L-01453',
 'C3L-01598',
 'C3L-01637',
 'C3L-01662',
 'C3L-01687',
 'C3L-01689',
 'C3L-01703',
 'C3L-01971',
 'C3L-02109',
 'C3L-02112',
 'C3L-02115',
 'C3L-02116',
 'C3L-02118',
 'C3L-02463',
 'C3L-02604',
 'C3L-02606',
 'C3L-02610',
 'C3L-02613',
 'C3L-02701',
 'C3L-02809',
 'C3L-02890',
 'C3L-02897',
 'C3L-02899',
 'C3L-03123',
 'C3L-03129',
 'C3L-03356',
 'C3L-03371',
 'C3L-03388',
 'C3L-03394',
 'C3L-03395',
 'C3L-03628',
 'C3L-03630',
 'C3L-03632',
 'C3L-03635',
 'C3L-03639',
 'C3L-03743',
 'C3L-04027',
 'C3L-04072',
 'C3L-04080',
 'C3L-04473',
 'C3L-04475',
 'C3L-04479',
 'C3L-04495',
 'C3L-04848',
 'C3L-04853',
 'C3N-00198',
 'C3N-00249',
 'C3N-00302',
 'C3N-

In [29]:
# Take one ID from the total-vs-mature difference and report which lists contain it.
# `n2` is empty when the two ID sets agree, so guard against that instead of indexing
# into an empty list.
dif = next(iter(n2), None)

if dif is None:
    print('total and mature have the same IDs')
else:
    for message, ids in (('in total', t), ('in mature', m), ('in pre', p)):
        if dif in ids:
            print(message)

in total
in pre


In [28]:
# Display the differing ID picked above.
dif

'Derives_from'