In [None]:
import pandas as pd
import numpy as np
from google.colab import drive 

# Google Drive is mounted at an absolute location, so every data path is built
# from that same mount point instead of being resolved against the current
# working directory (which only happens to be /content on a fresh Colab kernel).
MOUNT_POINT = "/content/gdrive"

DATA_PATH = MOUNT_POINT + "/MyDrive/DataMining_project/data/"
PATH_TUMOR = DATA_PATH + "data_tumor_snorna/snoRNA_list_clean.tsv"
PATH_SNODB = DATA_PATH + "data_snodb/snoDB_All_V2.0.tsv"
PATH_OUTPUT = DATA_PATH + "data_tumor_snorna/snoRNA_processed.tsv"

drive.mount(MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
from numpy import isnan
# Load the tumor/snoRNA table and the snoDB reference table (both tab-separated).
ds_tumor = pd.read_csv(PATH_TUMOR, sep="\t", encoding_errors="ignore")
ds_snodb = pd.read_csv(PATH_SNODB, sep="\t")

# Turn the delimited text columns into Python lists:
# "Cancer" is comma+space separated, "snoRNAs" is slash separated.
ds_tumor["Cancer"] = ds_tumor["Cancer"].apply(lambda cancers: cancers.split(", "))
ds_tumor["snoRNAs"] = ds_tumor["snoRNAs"].apply(lambda aliases: aliases.split("/"))

# list_sno_main holds, for each row, the first listed snoRNA name
# (the most frequently used one, e.g. SNORAx / SNORDx).
list_sno_main = [aliases[0] for aliases in ds_tumor["snoRNAs"]]

# Alphabetically sorted list of the distinct cancer abbreviations in the table.
cancer_types = sorted(
    {abbreviation for abbreviations in ds_tumor["Cancer"] for abbreviation in abbreviations}
)

# For every snoDB entry build "gene_name;synonym1;synonym2;..." (just the gene
# name when there are no synonyms) and split it into a list of names.
synonym_suffix = (";" + ds_snodb["synonyms"]).fillna("")
snodb_name_str = ds_snodb["gene_name"] + synonym_suffix
snodb_name = snodb_name_str.str.split(";")

# Conversion table: one row per snoDB entry with its Ensembl id and name list.
snodb_conv = pd.concat(
    [ds_snodb["ensembl_id"], snodb_name],
    axis=1,
    keys=["ensembl_id", "gene_name"],
)

def assignEns(sno_list: list, conv_mat: pd.DataFrame):
    """Map each snoRNA name to the Ensembl ids of the rows that list it.

    A row of ``conv_mat`` matches a name when that name is contained in the
    row's ``gene_name`` cell. In this notebook that cell is the list made of
    the snoDB gene name followed by its synonyms, so a snoRNA is found even
    when it only appears among the synonyms.

    Args:
        sno_list: snoRNA names to look up; repeated names are resolved once.
        conv_mat: frame with an ``ensembl_id`` column and a ``gene_name``
            column holding the collection of names of each row.

    Returns:
        dict mapping every name in ``sno_list`` to the list of non-missing
        Ensembl ids of the matching rows, in row order (empty when no row
        matches).
    """
    # The name cells do not depend on the snoRNA being searched, so they are
    # extracted once. Missing cells become an empty tuple: filling them with ""
    # would make `sno in cell` a substring test that an empty name satisfies.
    name_cells = [
        cell if isinstance(cell, (list, tuple, set, frozenset, str)) else ()
        for cell in conv_mat["gene_name"]
    ]

    res_dict = {}
    for sno in sno_list:
        if sno in res_dict:
            # Same name, same answer: no need to scan the table again.
            continue
        indexer = [sno in cell for cell in name_cells]
        ens_s = conv_mat.loc[indexer, "ensembl_id"].dropna()
        res_dict[sno] = list(ens_s)
    return res_dict

# Resolve every main snoRNA name to its Ensembl ids once, then give each row
# the ids found for its main (first-listed) name.
sno_ass = assignEns(sno_list=list_sno_main, conv_mat=snodb_conv)

ds_tumor["ensembl_id"] = ds_tumor["snoRNAs"].apply(
    lambda aliases: sno_ass[aliases[0]]
)

def assignTumorVal(df: pd.DataFrame):
    """Add one integer-coded expression column per cancer category, in place.

    Every row of ``df`` carries a list of cancer abbreviations in ``Cancer``
    and a reported change in ``Expression_level``. For each category of the
    abbreviation table below, the row's value in the category column is:

    * 1 - one of its abbreviations belongs to the category and the level is
      "Increased";
    * 2 - same, with level "Decreased";
    * 3 - same, with any other level;
    * 0 - none of its abbreviations belongs to the category.

    Rows are addressed by position, so the frame does not need a 0..n-1 index,
    and a category that no row mentions still gets an all-zero column.

    Args:
        df: frame with a ``Cancer`` column (lists of abbreviations) and an
            ``Expression_level`` column. It is modified in place.

    Returns:
        None.
    """
    canc_dict = {
        "Brain": ["GBM", "LGG", "NB", "DIPG"],
        "Prostate": ["PCa"],
        "Osteosarcoma": ["OS", "Sarcoma"],
        "Myeloma": ["MM"],
        "Lung": ["LC", "LUAD", "NSCLC"],
        "Hepatocellular": ["HCC"],
        "Bladder": ["BLCA"],
        "Gallbladder": ["GBC"],
        "Breast_normal": ["BRCA", "TNBC"],
        "Renal": ["ccRCC"],
        "Cervical": ["CACX", "CC"],
        "Gastric": ["GC"],
        "Ovarian": ["OC"],
        "Breast_aggressive": ["BCBM"],
        "Leukemia": ["AML", "APL", "BCP-ALL", "CLL", "Leukemia"],
        "Colon": ["COC", "CRC"]
        }

    # Snapshot both input columns so that adding columns to df while looping
    # cannot interfere with the iteration.
    cancer_lists = df["Cancer"].tolist()
    levels = df["Expression_level"].tolist()

    for pos, (cancList, level) in enumerate(zip(cancer_lists, levels)):
        for cancEl in cancList:
            for canc, abbreviations in canc_dict.items():
                if cancEl not in abbreviations:
                    continue

                if level == "Increased":
                    code = 1
                elif level == "Decreased":
                    code = 2
                else:
                    code = 3

                # Create the column on first use (NaN = "not set yet") so the
                # columns appear in the order the categories are encountered.
                if canc not in df.columns:
                    df[canc] = float("nan")
                # Positional write: independent of the index labels.
                df.iat[pos, df.columns.get_loc(canc)] = code

    for canc in canc_dict:
        # Categories never seen in the data still need their column.
        if canc not in df.columns:
            df[canc] = 0
        df[canc] = df[canc].fillna(0).astype(int)

    return

# Add the per-category expression columns to the tumor table (in place).
assignTumorVal(ds_tumor)

# Flatten the list-valued columns back to "/"-separated text before writing.
slash_join = "/".join
ds_tumor["Cancer"] = ds_tumor["Cancer"].apply(slash_join)
ds_tumor["snoRNAs"] = ds_tumor["snoRNAs"].apply(slash_join)
ds_tumor["ensembl_id"] = ds_tumor["ensembl_id"].apply(slash_join)

# Persist the processed table as a tab-separated file without the index column.
ds_tumor.to_csv(PATH_OUTPUT, sep="\t", index=False)
