In [1]:
import pandas as pd


In [2]:
# Load the tab-separated disease table, keep the three columns used below,
# and normalise the miRNA identifier column (renamed to "name").
disease_columns = ["miRNA", "disease", "description"]
df = (
    pd.read_csv("data/disease.txt", sep="\t")[disease_columns]
    .rename(columns={"miRNA": "name"})
    # Keep only names starting with "hsa-". na=False: a missing name makes
    # startswith return NaN, and .loc rejects a boolean mask containing NaN,
    # so rows without a name are dropped instead of raising.
    .loc[lambda x: x["name"].str.startswith("hsa-", na=False)]
    # Literal replacement of every "mir" substring with "miR"; regex=False
    # makes this independent of the pandas default for `regex`.
    .assign(name=lambda x: x["name"].str.replace("mir", "miR", regex=False))
)


In [3]:
# Preview the first five rows of the filtered disease table.
df.iloc[:5]

Unnamed: 0,name,disease,description
0,hsa-miR-40,Liver Cirrhosis,MiR-340 mediates the involvement of high mobil...
1,hsa-miR-29b,Zika Virus Infection,Zika virus NS1 suppresses VE-cadherin via hsa-...
2,hsa-miR-130b,Wound Healing,ADSCs-derived exosomal H19 accelerates cutaneo...
3,hsa-miR-221,Stomach Neoplasms,Downregulation of miR-221-3p promotes the ferr...
4,hsa-miR-497,Pancreatic Neoplasms,Long noncoding RNA CASC9 promotes pancreatic c...


In [4]:
# Count descriptions that contain the substring "miR-" (True) versus those
# that do not (False).
has_mir_mention = df["description"].str.contains("miR-")
has_mir_mention.value_counts()



description
True     30724
False    22651
Name: count, dtype: int64

In [5]:
# Extract the first "miR-<part>-<part>[-<part>...]" token from each description;
# rows whose description contains no such token get NaN.
# NOTE(review): every <part> is matched with \w+, so hyphenated phrases such as
# "miR-21-mediated" are captured as well — confirm this is acceptable.
mir_token_pattern = r'\b(miR-\w+(?:-\w+)*-\w+)\b'
df["miR_words"] = df["description"].str.extract(mir_token_pattern, expand=False)

df.head()

Unnamed: 0,name,disease,description,miR_words
0,hsa-miR-40,Liver Cirrhosis,MiR-340 mediates the involvement of high mobil...,
1,hsa-miR-29b,Zika Virus Infection,Zika virus NS1 suppresses VE-cadherin via hsa-...,miR-29b-3p
2,hsa-miR-130b,Wound Healing,ADSCs-derived exosomal H19 accelerates cutaneo...,miR-130b-3p
3,hsa-miR-221,Stomach Neoplasms,Downregulation of miR-221-3p promotes the ferr...,miR-221-3p
4,hsa-miR-497,Pancreatic Neoplasms,Long noncoding RNA CASC9 promotes pancreatic c...,miR-497-5p


In [6]:
# Prefix each extracted token with "hsa-" so it uses the same naming scheme as
# the `name` column. Any existing "hsa-" prefix is stripped first, which makes
# the cell safe to re-run (no "hsa-hsa-..." values). Rows without an extracted
# token stay NaN because both the string accessor and the concatenation
# propagate missing values.
df["miR_words"] = "hsa-" + df["miR_words"].str.removeprefix("hsa-")

df.head()

Unnamed: 0,name,disease,description,miR_words
0,hsa-miR-40,Liver Cirrhosis,MiR-340 mediates the involvement of high mobil...,
1,hsa-miR-29b,Zika Virus Infection,Zika virus NS1 suppresses VE-cadherin via hsa-...,hsa-miR-29b-3p
2,hsa-miR-130b,Wound Healing,ADSCs-derived exosomal H19 accelerates cutaneo...,hsa-miR-130b-3p
3,hsa-miR-221,Stomach Neoplasms,Downregulation of miR-221-3p promotes the ferr...,hsa-miR-221-3p
4,hsa-miR-497,Pancreatic Neoplasms,Long noncoding RNA CASC9 promotes pancreatic c...,hsa-miR-497-5p


In [7]:
# Use the description-derived identifier wherever one was extracted; rows with
# no extracted token keep their existing name.
# NOTE(review): the extracted token is simply the first match in the
# description and is not compared with the row's existing name — confirm the
# two always refer to the same miRNA.
df["name"] = df["miR_words"].fillna(df["name"])

df.head()

Unnamed: 0,name,disease,description,miR_words
0,hsa-miR-40,Liver Cirrhosis,MiR-340 mediates the involvement of high mobil...,
1,hsa-miR-29b-3p,Zika Virus Infection,Zika virus NS1 suppresses VE-cadherin via hsa-...,hsa-miR-29b-3p
2,hsa-miR-130b-3p,Wound Healing,ADSCs-derived exosomal H19 accelerates cutaneo...,hsa-miR-130b-3p
3,hsa-miR-221-3p,Stomach Neoplasms,Downregulation of miR-221-3p promotes the ferr...,hsa-miR-221-3p
4,hsa-miR-497-5p,Pancreatic Neoplasms,Long noncoding RNA CASC9 promotes pancreatic c...,hsa-miR-497-5p


In [8]:
# Keep only the miRNA name and the disease; the description and the helper
# column are not used after this point.
df = df.loc[:, ["name", "disease"]]

df.head()

Unnamed: 0,name,disease
0,hsa-miR-40,Liver Cirrhosis
1,hsa-miR-29b-3p,Zika Virus Infection
2,hsa-miR-130b-3p,Wound Healing
3,hsa-miR-221-3p,Stomach Neoplasms
4,hsa-miR-497-5p,Pancreatic Neoplasms


# miRNAs

In [9]:
# Read only the "name" and "sequence" columns from the TargetScan CSV.
sequence_columns = ["name", "sequence"]
df2 = pd.read_csv("data/targetscan.csv", usecols=sequence_columns)

df2.head()

Unnamed: 0,name,sequence
0,hsa-let-7a-5p,TGAGGTAGTAGGTTGTATAGTT
1,hsa-let-7b-5p,TGAGGTAGTAGGTTGTGTGGTT
2,hsa-let-7c-5p,TGAGGTAGTAGGTTGTATGGTT
3,hsa-let-7d-5p,AGAGGTAGTAGGTTGCATAGTT
4,hsa-let-7e-5p,TGAGGTAGGAGGTTGTATAGTT


In [13]:
# Inner join on "name": only names present in both tables survive, and each
# matching pair of rows yields one output row (sequence columns first).
merged_df = pd.merge(df2, df, how="inner", on="name")

merged_df.head()

Unnamed: 0,name,sequence,disease
0,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,Breast Neoplasms
1,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,Stomach Neoplasms
2,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,"Carcinoma, Non-Small-Cell Lung"
3,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,Breast Neoplasms
4,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,"Leukemia, Myeloid, Acute"


In [11]:
# Number of rows in the disease table.
df.shape[0]

53375

In [14]:
# Number of rows produced by the merge.
merged_df.shape[0]

13414

In [15]:
# Keep the first occurrence of each fully identical row. The original index
# labels are retained, so gaps appear where repeats were removed.
is_repeat = merged_df.duplicated()
merged_df = merged_df.loc[~is_repeat]

merged_df.head()

Unnamed: 0,name,sequence,disease
0,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,Breast Neoplasms
1,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,Stomach Neoplasms
2,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,"Carcinoma, Non-Small-Cell Lung"
4,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,"Leukemia, Myeloid, Acute"
6,hsa-miR-4458,AGAGGTAGGTGTGGAAGAA,gastric adenocarcinoma


In [16]:
# Number of rows in merged_df at this point.
merged_df.shape[0]

8384

In [17]:
# Write merged_df to CSV; index=False leaves the row index out of the file.
merged_df.to_csv(path_or_buf="data/data.csv", index=False)
