## miRNA for datafreeze

This notebook combines the per-sample miRNA files from the WashU pipeline into one table per cancer type and file type (total, precursor, mature). The output format follows Yize's instructions for the datafreeze: rows are miRNA identifiers and columns are Patient_IDs. Normal samples have ".A" appended to the Patient_ID and tumor samples have ".T". Replicates of the same sample get an additional numeric suffix (".1", ".2", ...) after the tissue suffix, e.g. a second tumor run becomes "<Patient_ID>.T.1".

In [70]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re
import glob

In [2]:
def rename_duplicate_index(df):
    """Make the row labels of ``df`` unique by suffixing repeated labels.

    The first occurrence of a label is kept unchanged; every later occurrence
    gets '.<i>' appended, where i counts up from 1
    (e.g. 'a', 'a', 'a' -> 'a', 'a.1', 'a.2'). If a suffixed label is already
    used by another row, the next free integer is used instead, so the
    resulting index is always unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index may contain repeated labels. Its index is replaced
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a unique index and the original index
        name preserved.
    """
    taken = set(df.index)  # labels already in use; never hand one of these out again
    next_suffix = {}       # label -> next integer suffix to try for that label
    unique_labels = []

    for label in df.index:
        if label not in next_suffix:
            # First occurrence keeps its label unchanged.
            next_suffix[label] = 1
            unique_labels.append(label)
            continue

        i = next_suffix[label]
        candidate = str(label) + '.' + str(i)
        # Skip suffixes that would collide with an existing label.
        while candidate in taken:
            i += 1
            candidate = str(label) + '.' + str(i)
        next_suffix[label] = i + 1
        taken.add(candidate)
        unique_labels.append(candidate)

    # Assign in place (callers may rely on df itself being modified) and keep
    # the index name, which later cells use to refer to 'Patient_ID'.
    df.index = pd.Index(unique_labels, name=df.index.name)
    return df

In [3]:
# Cancer cohorts to process; each name is a directory under '../../../miRNA/'.
cancer_names = [
    'GBM', 'HNSCC', 'LSCC', 'LUAD',
    'EC', 'PDA', 'ccRCC',
]

# File types to combine; each is matched against '*<identifier>.txt.gz'.
identifier_list = [
    'total',
    'precursor',
    'mature',
]
# For a quick run on a single file type, use instead:
#identifier_list = ['total']

# Make all combined files

In [None]:
# Create all files for yize
#
# For every file type and cancer cohort: read each per-sample file, keep the
# non-raw value column(s), stack the samples into one frame (rows = samples),
# make replicate Patient_IDs unique, order tumor samples before normal ones
# and write the transposed table (rows = miRNAs, columns = Patient_IDs).

# Patient_ID patterns searched for in the file names. Raw strings are used so
# that \w, \d and \. reach the regex engine instead of being (invalid) Python
# string escapes. The second pattern is only tried for LUAD files.
patient_id_re = re.compile(r'\w3\w-\d\d\d\d\d\.[AT]')
luad_alt_id_re = re.compile(r'\d\dLU\d\d\d\.[AT]')

# Normal samples end in '.A', optionally followed by the replicate suffix
# added by rename_duplicate_index ('.A.1', '.A.2', ...). The dot is escaped so
# it only matches a literal '.'.
normal_id_pattern = r'\.A(?:\.\d+)?$'

for identifier in identifier_list:
    print('making '+identifier+' files')

    # Annotation columns (they become the row index) and the raw-count column
    # to drop depend only on the file type, so pick them once per identifier.
    if identifier == 'precursor':
        index_cols = ['Name', 'ID', 'Alias']
        raw_count_col = 'primary_miR_Count_Raw'
    elif identifier == 'mature':
        index_cols = ['Name', 'ID', 'Alias', 'Derives_from']
        raw_count_col = 'miR_Count_Raw'
    else:
        index_cols = ['Name', 'ID', 'Alias']
        raw_count_col = 'Count_Raw'

    for ca in cancer_names:
        print(ca)
        path = '../../../miRNA/'+ca +'/WashU_pipeline/'
        # glob returns files in arbitrary order; sorting makes the assignment
        # of replicate suffixes reproducible between runs.
        all_files = sorted(glob.glob(path + '*'+identifier+'.txt.gz'))
        if not all_files:
            raise FileNotFoundError('No '+identifier+' files found in '+path)

        sample_dfs = []
        for filename in all_files:
            match = patient_id_re.search(filename)
            if match is None and ca == 'LUAD':
                match = luad_alt_id_re.search(filename)
            if match is None:
                raise ValueError('Could not find a Patient_ID in file name: '+filename)
            patient = match.group()

            df = pd.read_csv(filename, sep = '\t', index_col=index_cols)
            df = df.drop(columns = [raw_count_col])

            # One row per sample, labelled with its Patient_ID.
            df = df.transpose()
            df['Patient_ID'] = patient
            df = df.set_index('Patient_ID')
            sample_dfs.append(df)

        # Concatenate once instead of appending inside the loop
        # (DataFrame.append is quadratic and was removed in pandas 2.0).
        all_df = pd.concat(sample_dfs)

        # Create unique identifiers for duplicate Patient_IDs
        all_df = rename_duplicate_index(all_df)

        # sort values: tumor samples first, then normal, each sorted by Patient_ID
        is_normal = all_df.index.str.contains(normal_id_pattern)
        tumor = all_df.loc[~is_normal].sort_index()   # '.T' and '.T.<i>' replicates
        normal = all_df.loc[is_normal].sort_index()   # '.A' and '.A.<i>' replicates
        all_df = pd.concat([tumor, normal])

        # Create files for datafreeze (identifier cols, miRNA rows)
        transposed_df = all_df.transpose()
        transposed_df.to_csv('~/Documents/miRNA/datafreeze/'+ca+'_'+identifier+'_miRNA_combined.tsv', sep = '\t', na_rep = 'NA') #yize

making total files
GBM
HNSCC
LSCC
LUAD
EC
PDA
ccRCC
making precursor files
GBM
HNSCC
LSCC
LUAD
EC
PDA
ccRCC
making mature files
GBM
HNSCC
LSCC


# Test individually

In [4]:
# Cohorts and file types available for the individual test below.
cancer_names = [
    'GBM',
    'HNSCC',
    'LSCC',
    'LUAD',
    'EC',
    'PDA',
    'ccRCC',
]
identifier_list = ['total', 'precursor', 'mature']

In [64]:
# Pick one cohort and one file type by position in the lists, then list the
# matching per-sample files.
ca = cancer_names[6]
identifier = identifier_list[0]

path = '../../../miRNA/' + ca + '/WashU_pipeline/'
file_pattern = path + '*' + identifier + '.txt.gz'
all_files = glob.glob(file_pattern)

# Report what was selected and how many files were found.
print(ca, identifier, sep='\n')
print('Num files:', len(all_files))

ccRCC
total
Num files: 185


In [29]:
# Combine the per-sample files in all_files (for the selected ca / identifier)
# into one table with miRNAs as rows and Patient_IDs as columns.

# Patient_ID patterns searched for in the file names. Raw strings are used so
# that \w, \d and \. reach the regex engine instead of being (invalid) Python
# string escapes. The second pattern is only tried for LUAD files.
patient_id_re = re.compile(r'\w3\w-\d\d\d\d\d\.[AT]')
luad_alt_id_re = re.compile(r'\d\dLU\d\d\d\.[AT]')

# Annotation columns (they become the row index) and the raw-count column to
# drop depend only on the file type.
if identifier == 'precursor':
    index_cols = ['Name', 'ID', 'Alias']
    raw_count_col = 'primary_miR_Count_Raw'
elif identifier == 'mature':
    index_cols = ['Name', 'ID', 'Alias', 'Derives_from']
    raw_count_col = 'miR_Count_Raw'
else:
    index_cols = ['Name', 'ID', 'Alias']
    raw_count_col = 'Count_Raw'

if not all_files:
    raise FileNotFoundError('No files to combine for '+ca+' '+identifier)

sample_dfs = []
# Sorted so that replicate suffixes are assigned reproducibly (glob order is arbitrary).
for filename in sorted(all_files):
    match = patient_id_re.search(filename)
    if match is None and ca == 'LUAD':
        match = luad_alt_id_re.search(filename)
    if match is None:
        raise ValueError('Could not find a Patient_ID in file name: '+filename)
    patient = match.group()

    df = pd.read_csv(filename, sep = '\t', index_col=index_cols)
    df = df.drop(columns = [raw_count_col])

    # One row per sample, labelled with its Patient_ID.
    df = df.transpose()
    df['Patient_ID'] = patient
    df = df.set_index('Patient_ID')
    sample_dfs.append(df)

# Concatenate once instead of appending inside the loop
# (DataFrame.append is quadratic and was removed in pandas 2.0).
all_df = pd.concat(sample_dfs)

# Create unique identifiers for duplicate Patient_IDs
all_df = rename_duplicate_index(all_df)

# sort values: tumor samples first, then normal, each sorted by Patient_ID.
# Normal IDs end in '.A', optionally followed by a replicate suffix ('.A.1');
# the dot is escaped so it only matches a literal '.'.
is_normal = all_df.index.str.contains(r'\.A(?:\.\d+)?$')
tumor = all_df.loc[~is_normal].sort_index()   # '.T' and '.T.<i>' replicates
normal = all_df.loc[is_normal].sort_index()   # '.A' and '.A.<i>' replicates
all_df = pd.concat([tumor, normal])

# Create files for datafreeze (identifier cols, miRNA rows)
all_df = all_df.transpose()
#all_df.to_csv('~/Documents/miRNA/datafreeze/'+ca+'_'+identifier+'_miRNA_combined.tsv', sep = '\t', na_rep = 'NA') #yize

In [31]:
# Number of samples: after the transpose above, each column is one Patient_ID.
len(all_df.columns)

99

# Test created files

In [4]:
# Cohorts and file types whose written datafreeze files can be checked below.
cancer_names = [
    'GBM', 'HNSCC', 'LSCC',
    'LUAD', 'EC', 'PDA', 'ccRCC',
]
identifier_list = [
    'total', 'precursor', 'mature',
]

In [69]:
# Choose the combined file to check: cohort and file type by list position.
ca = cancer_names[6]
identifier = identifier_list[2]

# Directory holding the combined files, and the file name for this selection.
path = '../../../miRNA/datafreeze/'
fn = '{}_{}_miRNA_combined.tsv'.format(ca, identifier)

In [68]:
# Read one combined file back in and sanity-check it.
# The annotation columns form the row index; mature files have one extra.
index_cols = ['Name', 'ID', 'Alias']
if identifier == 'mature':
    index_cols.append('Derives_from')
all_df = pd.read_csv(path+fn, delimiter = '\t', index_col = index_cols)
all_df = all_df.T  # rows = Patient_IDs

print(ca, identifier)

# check tumor/normal counts
# Normal IDs end in '.A', optionally followed by a replicate suffix ('.A.1').
# The dot is escaped so it matches a literal '.' rather than any character.
is_normal = all_df.index.str.contains(r'\.A(?:\.\d+)?$')
tumor = all_df.loc[~is_normal]
print('Tumor:', len(tumor))
normal = all_df.loc[is_normal]
print('Normal:', len(normal))

# check for duplicate indices
if all_df.index.duplicated().any():
    print('Duplicates present')
else:
    print('NO dup index')

all_df

ccRCC mature
Tumor: 110
Normal: 75
NO dup index


Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9851-5p,hsa-miR-9898,hsa-miR-9899,hsa-miR-9901,hsa-miR-9903,hsa-miR-9985,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0048639,MIMAT0039318,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0048639,MIMAT0039318,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0039502,MI0031826,MI0031827,MI0031829,MI0031831,MI0032313,MI0000101,MI0000101,MI0000746,MI0000746
C3L-00004.T,,57.081886,48.300058,33300.694204,32962.593801,33630.012778,127.336515,12610.705927,,6862.999082,...,,,,,,30.736400,48.300058,9343.865673,61.472800,2415.002876
C3L-00010.T,,41.253197,75.005813,46274.836300,46473.601704,45971.062757,138.760754,17836.382320,,8441.904248,...,,,,,,33.752616,30.002325,8141.880996,112.508719,2430.188340
C3L-00011.T,,22.135035,38.736311,38481.757964,38005.854717,37568.687780,420.565661,58525.031957,,7155.149993,...,,,,11.067517,,,,1610.323780,55.337587,254.552900
C3L-00026.T,,43.049636,32.287227,30000.215248,30511.429678,29881.828749,91.480477,10783.933876,,5811.700891,...,,5.381205,,16.143614,,37.668432,69.955659,7130.096001,150.673727,3024.236945
C3L-00079.T,,57.970268,96.617113,42144.384613,43033.262051,44224.873110,161.028521,14924.123361,,5513.616572,...,,,,,,25.764563,57.970268,2228.634736,64.411409,193.234226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646.A,,90.432266,83.475938,32757.349361,33320.811943,33310.377450,139.126563,14114.389860,10.434492,5161.595503,...,,,,6.956328,,48.694297,69.563282,5373.763513,48.694297,163.473712
C3N-01648.A,,84.388186,111.238972,30690.448792,30644.418872,30767.165324,99.731492,12159.570387,11.507480,4188.722670,...,,,,,,38.358266,65.209053,5082.470272,46.029919,161.104718
C3N-01649.A,,59.224665,71.915664,33555.002792,34659.119752,33774.980117,287.662656,15051.525458,4.230333,5283.686143,...,,4.230333,,,,50.763998,67.685331,5419.056805,71.915664,249.589658
C3N-01651.A,,93.273638,76.617631,35097.537576,35377.358491,35450.644921,193.209679,13211.544611,3.331201,4563.745869,...,,3.331201,,,,36.643215,26.649611,4437.160217,76.617631,169.891270


In [61]:
# Show the rows whose Patient_ID contains a literal '.1' (the first replicate suffix).
# Raw string: '\.' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on recent Python versions.
test = r'\.1'
all_df.loc[all_df.index.str.contains(test)]

Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9899,hsa-miR-9901,hsa-miR-9903,hsa-miR-9983-3p,hsa-miR-9985,hsa-miR-9986,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039319,MIMAT0039321,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0039766,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0031827,MI0031829,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000101,MI0000746,MI0000746


In [171]:
# check na (first col in LSCC has lots of na)
# no na in file, but na appear when combining when a miRNA was not included in some files
#
# Label the first miRNA column by its Name only, without modifying all_df:
# dropping column levels from all_df in place made this cell fail when run a
# second time and for non-mature files (which have no 'Derives_from' level).
first_mirna = all_df.iloc[:, 0].dropna()
first_mirna.name = all_df.columns.get_level_values('Name')[0]
first_mirna

Patient_ID
C3L-01663       4.727194
C3L-03678       4.113043
C3L-04014       8.284798
C3N-02283       7.457622
C3N-02288      19.479634
C3N-03441       5.603591
C3L-00923.N     3.308104
C3L-02130.N     2.990037
C3L-02358.N     2.943982
C3L-02552.N     4.337586
C3L-02625.N     4.738640
C3L-02968.N     2.691254
C3N-04124.N     8.639831
Name: hsa-let-7a-2-3p, dtype: float64

In [75]:
# Count tumor and normal samples in every cohort's combined file for one file type.
ident = 'total'

# Leading annotation columns of the file; every other column is a sample.
if ident == 'mature':
    annotation_cols = ['Name', 'ID', 'Alias', 'Derives_from']
else:
    annotation_cols = ['Name', 'ID', 'Alias']

# Normal samples end in '.A' (the suffix this notebook writes) or '.N' (the
# suffix this check originally looked for), optionally followed by a
# replicate suffix such as '.1'.
normal_re = re.compile(r'\.[AN](?:\.\d+)?$')

for can in cancer_names:
    df = pd.read_csv('~/Documents/miRNA/datafreeze/'+can+'_'+ ident +'_miRNA_combined.tsv', delimiter = '\t')
    print(can)
    # check tumor/normal counts
    samples = [col for col in df.columns if col not in annotation_cols]
    tumor = [col for col in samples if not normal_re.search(col)]
    print('Tumor:', len(tumor))
    normal = [col for col in samples if normal_re.search(col)]
    print('Normal:', len(normal))
    #print(df.head())

GBM
Tumor: 99
Normal: 0
HNSCC
Tumor: 111
Normal: 61
LSCC
Tumor: 109
Normal: 97
LUAD
Tumor: 110
Normal: 101
EC
Tumor: 95
Normal: 15
PDA
Tumor: 145
Normal: 38
ccRCC
Tumor: 110
Normal: 75


In [35]:
# total and precursor have same num of tumor and normal
# mature is off by a couple

In [43]:
# Check patient_IDs
# Gather the sample column names of every combined file, keyed by
# '<cancer>_<identifier>', so the file types can be compared with each other.
new = {}
for can in cancer_names:
    for i in identifier_list:
        combined_path = '~/Documents/miRNA/datafreeze/'+can+'_'+ i +'_miRNA_combined.tsv'
        df = pd.read_csv(combined_path, delimiter = '\t')
        # Skip the leading annotation columns: 'Name', 'ID', 'Alias'
        # (plus 'Derives_from' for mature files).
        first_sample = 4 if i == 'mature' else 3
        new[can+'_'+i] = list(df.columns[first_sample:])
print('done')

done


In [71]:
# Compare the sample IDs of the three file types for a single cohort.
#for can in cancer_names:
can = 'PDA'
t, m, p = (new[can + '_' + kind] for kind in ('total', 'mature', 'precursor'))

# Same lists keyed by a short label, in the order they are reported.
l3 = {'total': t, 'pre': p, 'mature': m}

for l, ids in l3.items():
    print(l, len(ids))

total 183
pre 183
mature 183


In [72]:
# IDs present in exactly one of the two lists, for each pair of file types.
# An empty set means both files contain the same samples.
n = set(t).symmetric_difference(p)
print('total vs pre:', n, '\n')

n2 = set(t).symmetric_difference(m)
print('total vs mature:', n2, '\n')

n3 = set(m).symmetric_difference(p)
print('mature vs pre:', n3)

total vs pre: set() 

total vs mature: set() 

mature vs pre: set()


In [73]:
# Display the sample IDs read from the precursor file of the selected cohort.
p

['C3L-00017',
 'C3L-00102',
 'C3L-00189',
 'C3L-00277',
 'C3L-00401',
 'C3L-00589',
 'C3L-00598',
 'C3L-00599',
 'C3L-00622',
 'C3L-00625',
 'C3L-00640',
 'C3L-00819',
 'C3L-00881',
 'C3L-00928',
 'C3L-01031',
 'C3L-01036',
 'C3L-01037',
 'C3L-01051',
 'C3L-01052',
 'C3L-01053',
 'C3L-01054',
 'C3L-01124',
 'C3L-01328',
 'C3L-01453',
 'C3L-01598',
 'C3L-01637',
 'C3L-01662',
 'C3L-01687',
 'C3L-01689',
 'C3L-01703',
 'C3L-01971',
 'C3L-02109',
 'C3L-02112',
 'C3L-02115',
 'C3L-02116',
 'C3L-02118',
 'C3L-02463',
 'C3L-02604',
 'C3L-02606',
 'C3L-02610',
 'C3L-02613',
 'C3L-02701',
 'C3L-02809',
 'C3L-02890',
 'C3L-02897',
 'C3L-02899',
 'C3L-03123',
 'C3L-03129',
 'C3L-03356',
 'C3L-03371',
 'C3L-03388',
 'C3L-03394',
 'C3L-03395',
 'C3L-03628',
 'C3L-03630',
 'C3L-03632',
 'C3L-03635',
 'C3L-03639',
 'C3L-03743',
 'C3L-04027',
 'C3L-04072',
 'C3L-04080',
 'C3L-04473',
 'C3L-04475',
 'C3L-04479',
 'C3L-04495',
 'C3L-04848',
 'C3L-04853',
 'C3N-00198',
 'C3N-00249',
 'C3N-00302',
 'C3N-

In [29]:
# Take one label that differs between the total and mature ID lists and report
# which lists contain it. n2 is empty when the lists agree, so fall back to
# None instead of raising IndexError on list(n2)[0].
dif = next(iter(n2), None)

if dif is None:
    print('no difference between total and mature')
else:
    if dif in t:
        print('in total')
    if dif in m:
        print('in mature')
    if dif in p:
        print('in pre')

in total
in pre


In [28]:
# Display the label picked from the total-vs-mature difference set.
dif

'Derives_from'