# Preprocess miRNA data

In [1]:
import os
import pandas as pd
import glob
import re
import numpy as np

Set the path where output files are saved (PATH), the folder containing the input data files (DATAPATH), and the cancer type to process (ctype).

In [13]:
# Root folder under which the output file is written (see the save cell).
PATH = "/data/malvika/cTaG2.0"
# Root folder that holds the input data (consensus miRNA list and GDC downloads).
DATAPATH = "/data/malvika/data/cTaG2.0"
# Alternative locations, currently disabled:
# PATH = "D:/Projects/cTaG2.0"
# DATAPATH = "D:/Projects/data/cTaG2.0"
# Cancer type: filters the consensus miRNA list and names the GDC_<ctype>
# folders and the output file.
ctype = "LUAD"

### Load the cancer miRNA domain data

In [14]:
# Work from the folder that contains the consensus miRNA list.
mirna_dir = DATAPATH + "/miRNA"
os.chdir(mirna_dir)
fname = "consensus_miRNA.tsv"
# Read the table and keep only the rows for the selected cancer type.
onco_mirna = (
    pd.read_csv(fname, sep="\t", comment="#", header=0)
    .loc[lambda frame: frame["Cancer-type"] == ctype]
)

In [15]:
# Preview the first rows of the consensus miRNA list for this cancer type.
onco_mirna.head(5)

Unnamed: 0,miRNA,Cancer-type
5,hsa-mir-184,LUAD
6,hsa-mir-3154,LUAD


### Load the file with the aliquot-to-sample mapping

In [16]:
# Work from the GDC miRNA download folder of the selected cancer type.
gdc_mirna_dir = DATAPATH + "/GDC_{}/miRNA".format(ctype)
os.chdir(gdc_mirna_dir)
fname = "gdc_sample_sheet.2021-12-07.tsv"
# The second column of the sheet becomes the index (index_col=1), so rows can
# be looked up by it later. Keep only primary-tumour rows and, for each
# "Sample ID", only the first row encountered.
data_aliq = (
    pd.read_csv(fname, sep="\t", comment="#", header=0, index_col=1)
    .loc[lambda sheet: sheet["Sample Type"] == "Primary Tumor"]
    .drop_duplicates(subset=["Sample ID"])
)

In [17]:
# Preview the first rows of the filtered sample sheet.
data_aliq.head(5)

Unnamed: 0_level_0,File ID,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
File Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
28aeff63-ff53-417c-868c-156b9ee973c9.mirbase21.mirnas.quantification.txt,c1de89ed-d37c-4d19-9f04-579905c81120,Transcriptome Profiling,miRNA Expression Quantification,TCGA-LUAD,TCGA-44-7672,TCGA-44-7672-01A,Primary Tumor
736f7d76-a7af-4d67-be9c-d7f94d568474.mirbase21.mirnas.quantification.txt,c678f47f-3de5-495f-9a5d-ca0436288a71,Transcriptome Profiling,miRNA Expression Quantification,TCGA-LUAD,TCGA-50-5072,TCGA-50-5072-01A,Primary Tumor
3efaf7a3-ffcb-42d1-bd80-ebb7cf87b2a5.mirbase21.mirnas.quantification.txt,c298ebea-5856-4622-bcdf-45c149d37a0b,Transcriptome Profiling,miRNA Expression Quantification,TCGA-LUAD,TCGA-05-4425,TCGA-05-4425-01A,Primary Tumor
089c5bc7-08b0-47c9-8ddd-e3c006e6b063.mirbase21.mirnas.quantification.txt,3eac86e8-3bb1-4ef2-8979-557afa68662f,Transcriptome Profiling,miRNA Expression Quantification,TCGA-LUAD,TCGA-69-A59K,TCGA-69-A59K-01A,Primary Tumor
79a1f665-4862-411c-a7f1-d43310707b39.mirbase21.mirnas.quantification.txt,0f6fd768-d21c-4be7-94bf-beb60f021c22,Transcriptome Profiling,miRNA Expression Quantification,TCGA-LUAD,TCGA-69-7761,TCGA-69-7761-01A,Primary Tumor


### Load miRNA data

In [18]:
%%time
os.chdir(DATAPATH + "/GDC_{}/miRNA".format(ctype))
data = [None] * len(glob.glob("*/*.mirnas.quantification.txt"))
for idx, file in enumerate(glob.glob("*/*.mirnas.quantification.txt")):
    if file.split("/")[1] in data_aliq.index:
        temp = pd.read_csv(file, sep="\t", comment="#", header=0)
        temp["Sample ID"] = [data_aliq.loc[file.split("/")[1], "Sample ID"]] * len(temp)
        data[idx] = temp[temp["miRNA_ID"].isin(onco_mirna.miRNA)]

CPU times: user 2.16 s, sys: 108 ms, total: 2.26 s
Wall time: 2.26 s


In [19]:
%%time
data = pd.concat(data)

CPU times: user 173 ms, sys: 8 ms, total: 181 ms
Wall time: 178 ms


Save the filtered miRNA data to a file

In [20]:
# Switch to the output folder for this cancer type; it must already exist.
out_dir = PATH + "/data/GDC_{}/miRNA".format(ctype)
os.chdir(out_dir)
fname = "{}_miRNA.tsv".format(ctype)
# Write a tab-separated file with a header row and without the index column.
data.to_csv(path_or_buf=fname, sep="\t", index=False, header=True)

In [21]:
# Preview the first rows of the combined miRNA table.
data.head(5)

Unnamed: 0,miRNA_ID,read_count,reads_per_million_miRNA_mapped,cross-mapped,Sample ID
230,hsa-mir-184,1,0.387757,N,TCGA-44-3919-01A
406,hsa-mir-3154,0,0.0,N,TCGA-44-3919-01A
230,hsa-mir-184,3,0.610196,N,TCGA-L9-A8F4-01A
406,hsa-mir-3154,0,0.0,N,TCGA-L9-A8F4-01A
230,hsa-mir-184,29,4.526039,N,TCGA-97-A4M6-01A


In [22]:
# (rows, columns) of the combined miRNA table.
data.shape

(1032, 5)