## miRNA for datafreeze

This notebook combines the miRNA data from the WashU pipeline. The format of the output file is for the datafreeze according to Yize's instructions: rows are miRNA identifiers and columns are Patient_IDs. Normal samples have a ".A" appended to the Patient_ID and tumor samples have a ".T". Replicates have a ".1" appended to the Patient_ID.

In [1]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re
import glob

In [2]:
''' Adds '_i' to duplicate col names where the int i increases by one with each duplicate 
Returns a df with unique column names. '''

def rename_duplicate_index(df):
    cols = pd.Series(df.index[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.index=cols
    return df

In [3]:
cancer_names = ['GBM','HNSCC','LSCC','LUAD','EC','PDA','ccRCC']
identifier_list = ['total','precursor', 'mature']
#identifier_list = ['total']

# Make all combined files

In [None]:
# Create all files for yize
for identifier in identifier_list:
    print('making '+identifier+' files')
    for ca in cancer_names:
        print(ca)
        path = '../../../miRNA/'+ca +'/WashU_pipeline/' 
        all_files = glob.glob(path + '*'+identifier+'.txt.gz') 
        
        all_df = pd.DataFrame()

        for filename in all_files:
            if ca == 'LUAD':
                if re.search('\w3\w-\d\d\d\d\d\.[AT]', filename) != None:
                    patient = re.search('\w3\w-\d\d\d\d\d\.[AT]', filename).group()
                else: 
                    patient = re.search('\d\dLU\d\d\d\.[AT]', filename).group()
            else:
                patient = re.search('\w3\w-\d\d\d\d\d\.[AT]', filename).group()

            if identifier == 'precursor':
                df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias'])
                df = df.drop(columns = ['primary_miR_Count_Raw']) 
            elif identifier == 'mature':
                df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias', 'Derives_from'])
                df = df.drop(columns = ['miR_Count_Raw'])
            else:
                df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias'])
                df = df.drop(columns = ['Count_Raw'])
                
            df = df.transpose()
            df['Patient_ID'] = patient
            df = df.set_index('Patient_ID')
            all_df = all_df.append(df) # change to join?
        
        # Create unique identifiers for duplicate Patient_IDs
        all_df = rename_duplicate_index(all_df)

        # sort values
        normal = all_df.loc[all_df.index.str.contains('.A$')]
        normal = normal.sort_values(by=["Patient_ID"])
        tumor = all_df.loc[~ all_df.index.str.contains('.A$')] # doesn't contain A (includes .T.1 for replicates)
        tumor = tumor.sort_values(by=["Patient_ID"])
        all_df = tumor.append(normal)

        # Create files for datafreeze (identifier cols, miRNA rows)
        transposed_df = all_df.transpose()
        transposed_df.to_csv('~/Documents/miRNA/datafreeze/'+ca+'_'+identifier+'_miRNA_combined.tsv', sep = '\t', na_rep = 'NA') #yize

making total files
GBM
HNSCC
LSCC
LUAD
EC
PDA
ccRCC
making precursor files
GBM
HNSCC
LSCC
LUAD
EC
PDA
ccRCC
making mature files
GBM
HNSCC
LSCC


# Test individually

In [4]:
cancer_names = ['GBM','HNSCC','LSCC','LUAD','EC','PDA','ccRCC']
identifier_list = ['total','precursor', 'mature']

In [33]:
ca = cancer_names[2]
identifier = identifier_list[0] 

path = '../../../miRNA/'+ca +'/WashU_pipeline/' 
all_files = glob.glob(path + '*'+identifier+'.txt.gz') 
print(ca)
print(identifier)
print('Num files:', len(all_files))

LSCC
total
Num files: 206


In [29]:
all_df = pd.DataFrame()

for filename in all_files:
    if ca == 'LUAD':
        if re.search('\w3\w-\d\d\d\d\d\.[AT]', filename) != None:
            patient = re.search('\w3\w-\d\d\d\d\d\.[AT]', filename).group()
        else: 
            patient = re.search('\d\dLU\d\d\d\.[AT]', filename).group()
    else:
        patient = re.search('\w3\w-\d\d\d\d\d\.[AT]', filename).group()

    if identifier == 'precursor':
        df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias'])
        df = df.drop(columns = ['primary_miR_Count_Raw']) 
    elif identifier == 'mature':
        df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias', 'Derives_from'])
        df = df.drop(columns = ['miR_Count_Raw'])
    else:
        df = pd.read_csv(filename, sep = '\t', index_col=['Name', 'ID', 'Alias'])
        df = df.drop(columns = ['Count_Raw'])

    df = df.transpose()
    df['Patient_ID'] = patient
    df = df.set_index('Patient_ID')
    all_df = all_df.append(df) # change to join?

# Create unique identifiers for duplicate Patient_IDs
all_df = rename_duplicate_index(all_df)

# sort values
normal = all_df.loc[all_df.index.str.contains('.A$')]
normal = normal.sort_values(by=["Patient_ID"])
tumor = all_df.loc[~ all_df.index.str.contains('.A$')] # doesn't contain A (includes .T.1 for replicates)
tumor = tumor.sort_values(by=["Patient_ID"])
all_df = tumor.append(normal)

# Create files for datafreeze (identifier cols, miRNA rows)
all_df = all_df.transpose()
#all_df.to_csv('~/Documents/miRNA/datafreeze/'+ca+'_'+identifier+'_miRNA_combined.tsv', sep = '\t', na_rep = 'NA') #yize

In [31]:
len(all_df.columns)

99

# Test created files

In [4]:
cancer_names = ['GBM','HNSCC','LSCC','LUAD','EC','PDA','ccRCC']
identifier_list = ['total','precursor', 'mature']

In [42]:
ca = cancer_names[3]
identifier = identifier_list[2] 

path = '../../../miRNA/datafreeze/'
fn = ca+'_'+identifier+'_miRNA_combined.tsv'

In [43]:
if identifier == 'mature':
    all_df = pd.read_csv(path+fn, delimiter = '\t', index_col = ['Name', 'ID','Alias', 'Derives_from'])
else:
    all_df = pd.read_csv(path+fn, delimiter = '\t', index_col = ['Name', 'ID','Alias'])
all_df = all_df.T

print(ca, identifier)

# check tumor/normal counts
tumor = all_df.loc[~ all_df.index.str.contains('.A$')]
print('Tumor:', len(tumor))
normal = all_df.loc[all_df.index.str.contains('.A$')]
print('Normal:', len(normal))

# check for duplicate indices 
if True in (all_df.index.duplicated()):
    print('Duplicates present')
else:
    print('NO dup index')

all_df

LUAD mature
Tumor: 110
Normal: 101
NO dup index


Name,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,...,hsa-miR-9900,hsa-miR-9901,hsa-miR-9902,hsa-miR-9903,hsa-miR-9983-3p,hsa-miR-9985,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
ID,MIMAT0010195,MIMAT0004481,MIMAT0004481_1,MIMAT0000062,MIMAT0000062_1,MIMAT0000062_2,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039320,MIMAT0039321,MIMAT0039322,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Alias,MIMAT0010195,MIMAT0004481,MIMAT0004481,MIMAT0000062,MIMAT0000062,MIMAT0000062,MIMAT0004482,MIMAT0000063,MIMAT0026472,MIMAT0000064,...,MIMAT0039320,MIMAT0039321,MIMAT0039322,MIMAT0039323,MIMAT0041993,MIMAT0039763,MIMAT0004511,MIMAT0000097,MIMAT0004678,MIMAT0000689
Derives_from,MI0000061,MI0000062,MI0000060,MI0000061,MI0000062,MI0000060,MI0000063,MI0000063,MI0000064,MI0000064,...,MI0031828,MI0031829,MI0031830,MI0031831,MI0033670,MI0032313,MI0000101,MI0000101,MI0000746,MI0000746
11LU013.T,,23.317497,64.123116,17791.250109,18076.889446,17686.321373,75.781865,6721.268472,,1923.693492,...,,5.829374,,,,11.658748,5.829374,1195.021714,81.611239,174.881227
11LU016.T,,66.384787,81.704353,19072.859857,18965.622894,19210.735952,102.130441,5795.902527,5.106522,1746.430541,...,,,,,,15.319566,5.106522,1005.984844,56.171743,81.704353
11LU022.T,,46.374172,42.158338,19650.001476,19637.353974,20463.657404,92.748344,6155.117390,,1694.765199,...,,,,,,21.079169,4.215834,645.022576,54.805840,109.611680
11LU035.T,,45.486386,72.778217,21596.936037,21697.006086,22379.301875,118.264603,6695.596008,,2529.043057,...,,9.097277,,,,18.194554,18.194554,1664.801725,36.389109,45.486386
C3L-00001.T,,33.351677,25.013758,26060.166425,26418.696950,25718.311738,212.616939,12327.613521,,4877.682725,...,,33.351677,,,,66.703354,33.351677,20152.750680,29.182717,2242.900262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02582.A,,64.844535,68.447009,53295.003008,53442.704449,53900.218670,252.173193,17558.459150,3.602474,7788.549176,...,,,,,,75.651958,111.676700,6945.570218,28.819793,230.558347
C3N-02586.A,,71.007318,78.897020,60273.378173,60131.363538,59567.249847,193.297698,28174.125722,3.944851,9743.781929,...,,,,,,55.227914,39.448510,5278.210616,51.283063,193.297698
C3N-02587.A,,59.187941,78.917255,48922.121822,49290.402346,49218.061529,187.428481,17480.172040,3.288219,6957.871339,...,,,,,,46.035066,49.323285,6369.280143,49.323285,243.328204
C3N-02588.A,,51.857134,35.651779,49001.750178,49734.232190,48794.321644,217.151747,16503.532767,,7178.971932,...,,6.482142,,,,58.339275,61.580346,5376.936540,35.651779,171.776755


In [11]:
test = '\.1'
all_df.loc[all_df.index.str.contains(test)]

Name,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-9900,hsa-mir-9901,hsa-mir-9902-1,hsa-mir-9902-2,hsa-mir-9903,hsa-mir-9983,hsa-mir-9985,hsa-mir-9986,hsa-mir-99a,hsa-mir-99b
ID,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
Alias,MI0000060,MI0000061,MI0000062,MI0000063,MI0000064,MI0000065,MI0000066,MI0000067,MI0000068,MI0000433,...,MI0031828,MI0031829,MI0031830,MI0041071,MI0031831,MI0033670,MI0032313,MI0032316,MI0000101,MI0000746
C3N-02788.T.1,29631.059528,29128.361042,29003.691818,12394.533858,5742.827498,7399.721706,8899.773987,18274.095344,18125.296592,5493.489049,...,,12.064764,,,,,8.043176,,6004.23071,152.82034


In [171]:
# check na (first col in LSCC has lots of na)
# no na in file, but na appear when combining when a miRNA was not included in some files
all_df.columns = all_df.columns.droplevel(['ID','Alias','Derives_from'])
all_df.iloc[:,0].dropna()

Patient_ID
C3L-01663       4.727194
C3L-03678       4.113043
C3L-04014       8.284798
C3N-02283       7.457622
C3N-02288      19.479634
C3N-03441       5.603591
C3L-00923.N     3.308104
C3L-02130.N     2.990037
C3L-02358.N     2.943982
C3L-02552.N     4.337586
C3L-02625.N     4.738640
C3L-02968.N     2.691254
C3N-04124.N     8.639831
Name: hsa-let-7a-2-3p, dtype: float64

In [75]:
ident = 'total'
for can in cancer_names:
    df = pd.read_csv('~/Documents/miRNA/datafreeze/'+can+'_'+ ident +'_miRNA_combined.tsv', delimiter = '\t')
    print(can)
    # check tumor/normal counts
    tumor = [col for col in df.columns if '.N' not in col]
    if ident == 'mature':
        print('Tumor:', len(tumor) - 4)
    else:
        print('Tumor:', len(tumor) - 3)
    normal = [col for col in df.columns if '.N' in col]
    print('Normal:', len(normal))
    #print(df.head())

GBM
Tumor: 99
Normal: 0
HNSCC
Tumor: 111
Normal: 61
LSCC
Tumor: 109
Normal: 97
LUAD
Tumor: 110
Normal: 101
EC
Tumor: 95
Normal: 15
PDA
Tumor: 145
Normal: 38
ccRCC
Tumor: 110
Normal: 75


In [35]:
# total and precursor have same num of tumor and normal
# mature is off by a couple

In [43]:
# Check patient_IDs
new = {}
for can in cancer_names:
    #print(can)
    for i in identifier_list:
        #print(i)
        df = pd.read_csv('~/Documents/miRNA/datafreeze/'+can+'_'+ i +'_miRNA_combined.tsv', delimiter = '\t')
        if i == 'mature':
            new[can+'_'+i] = list(df.columns[4:]) # 'Name', 'ID', 'Alias', 'Derives_from'
        else:
            new[can+'_'+i] = list(df.columns[3:])
        #print(list(df.columns[0:6]))
print('done')

done


In [71]:
#for can in cancer_names:
can = 'PDA'
t = new[can+'_total'] 
m = new[can+'_mature'] 
p = new[can+'_precursor']

l3 = {'total':t,'pre':p, 'mature':m}

for l in l3:
    print(l, len(l3[l]))

total 183
pre 183
mature 183


In [72]:
n = set(t) ^ set(p) 
print('total vs pre:', n, '\n')

n2 = set(t) ^ set(m) 
print('total vs mature:', n2, '\n')

n3 = set(m) ^ set(p) 
print('mature vs pre:', n3)

total vs pre: set() 

total vs mature: set() 

mature vs pre: set()


In [73]:
p

['C3L-00017',
 'C3L-00102',
 'C3L-00189',
 'C3L-00277',
 'C3L-00401',
 'C3L-00589',
 'C3L-00598',
 'C3L-00599',
 'C3L-00622',
 'C3L-00625',
 'C3L-00640',
 'C3L-00819',
 'C3L-00881',
 'C3L-00928',
 'C3L-01031',
 'C3L-01036',
 'C3L-01037',
 'C3L-01051',
 'C3L-01052',
 'C3L-01053',
 'C3L-01054',
 'C3L-01124',
 'C3L-01328',
 'C3L-01453',
 'C3L-01598',
 'C3L-01637',
 'C3L-01662',
 'C3L-01687',
 'C3L-01689',
 'C3L-01703',
 'C3L-01971',
 'C3L-02109',
 'C3L-02112',
 'C3L-02115',
 'C3L-02116',
 'C3L-02118',
 'C3L-02463',
 'C3L-02604',
 'C3L-02606',
 'C3L-02610',
 'C3L-02613',
 'C3L-02701',
 'C3L-02809',
 'C3L-02890',
 'C3L-02897',
 'C3L-02899',
 'C3L-03123',
 'C3L-03129',
 'C3L-03356',
 'C3L-03371',
 'C3L-03388',
 'C3L-03394',
 'C3L-03395',
 'C3L-03628',
 'C3L-03630',
 'C3L-03632',
 'C3L-03635',
 'C3L-03639',
 'C3L-03743',
 'C3L-04027',
 'C3L-04072',
 'C3L-04080',
 'C3L-04473',
 'C3L-04475',
 'C3L-04479',
 'C3L-04495',
 'C3L-04848',
 'C3L-04853',
 'C3N-00198',
 'C3N-00249',
 'C3N-00302',
 'C3N-

In [29]:
dif = list(n2)[0]

if dif in t:
    print('in total')
if dif in m:
    print('in mature')
if dif in p:
    print('in pre')

in total
in pre


In [28]:
dif

'Derives_from'