In [None]:
import pandas as pd
from google.colab import drive 

In [None]:
# Common prefix of every project file: the Drive folder plus the "data_"
# filename prefix. The path is relative, so it is resolved against the
# current working directory of the runtime.
DATA_PATH = "gdrive/MyDrive/DataMining_project/data/data_"

# Manually curated list of tumor-related snoRNAs
PATH_TUMOR = f"{DATA_PATH}tumor_snorna.csv"
# Filtered expression dataset
PATH_FILTR = f"{DATA_PATH}exp_snorna.csv"
# snoDB dump used for the ensembl_id <-> gene name conversion.
# NOTE(review): this resolves to ".../data/data_data_snodb/..." because the
# prefix already ends with "data_" -- confirm the folder is really named so.
PATH_SNODB = f"{DATA_PATH}data_snodb/snoDB_All_V2.0.tsv"

# Preliminary steps

Allow the script to access the drive in order to load the files; this might take some time. It is assumed that you have added the "DataMining_project" folder to your drive as a shortcut (simply right click on the folder > "Add shortcut to drive").

In [None]:
# Mount Google Drive at /content/gdrive so that the relative paths built on
# DATA_PATH ("gdrive/MyDrive/...") can be read.
# NOTE(review): this presumes the working directory is /content -- confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Load datasets**. Notice that:
- **ds_filtr** contains the filtered expression dataset
- **ds_tumor** contains the list of tumor related snoRNAs
- **ds_snodb** contains the data from snoDB2 (used for id conversion)

In [None]:
# Filtered expression dataset (comma-separated, default decoding)
ds_filtr = pd.read_csv(PATH_FILTR)
# Tumor-related snoRNAs; bytes that cannot be decoded are dropped silently
# instead of raising, so odd characters in this file may disappear.
ds_tumor = pd.read_csv(PATH_TUMOR, encoding_errors= "ignore")
# snoDB table, tab-separated
ds_snodb = pd.read_csv(PATH_SNODB, sep="\t")

**Subset** only the **information** that is needed, for ease of access.

In [None]:
# Ensembl ids of the snoRNAs in the filtered expression dataset
list_filtr = list(ds_filtr["ensembl_id"])

# One group of gene names per row of the tumor table: names sharing a row
# are separated by "/". str() turns a missing cell into the group ["nan"].
list_tumor = [str(entry).split("/") for entry in ds_tumor["snoRNAs"]]

# Official gene name followed by its synonyms, all semi-colon separated.
# A missing synonym contributes nothing; a missing gene name keeps the
# whole entry missing.
snodb_name = (
    ds_snodb["gene_name"]
    + (";" + ds_snodb["synonyms"]).fillna("")
)

# Two-column conversion table: ensembl_id -> semi-colon separated names
snodb_conv = pd.DataFrame(
    {
        "ensembl_id": ds_snodb["ensembl_id"],
        "gene_name": snodb_name,
    }
)

# Processing step

Create a **function** that, given:
- a **snoRNA ensembl_id** from the filtered list
- a **list of gene-name groups** of snoRNAs related to cancer
- a dataframe representing the **conversion matrix** between ensembl_ids and gene names

Returns a dictionary containing:
- **number of groups** of snoRNAs matched
- **row numbers** of the matching references **in "tumor_snorna.csv"**
- **gene names** matched

In [None]:
def get_tumor_mapping(
    ens_id: str,
    tum_list: list,
    conv_mat: pd.DataFrame
) -> dict:

    """
    Retrieve tumor-linkage information position for a snoRNA.

    Given a snoRNA ensembl_id, look up its gene names in the conversion
    matrix and report which groups of the manually curated tumor list share
    at least one name with it.

    Parameters
    ----------
    ens_id : str
        ensembl_id of a snoRNA of interest.
    tum_list : list
        List of lists of gene names, each corresponding to an individual snoRNA
        in an individual paper. Multiple lists may map to the same paper.
    conv_mat : pd.DataFrame
        Pandas dataframe with an "ensembl_id" column and a "gene_name" column
        holding the gene names of that id (semi-colon separated). If the same
        ensembl_id appears on several rows, their names are pooled.

    Returns
    -------
    res_dict : dict
        Dictionary with three keys:

        - "num_matches" (int): number of groups of ``tum_list`` sharing at
          least one gene name with the ensembl_id.
        - "pos_matches" (str): line positions of those groups in the manually
          curated .csv file (semi-colon separated).
        - "name_matches" (str): every gene name found in the matching groups,
          de-duplicated and sorted (semi-colon separated).

        When the ensembl_id is not in ``conv_mat``, or has no valid gene
        name, the result is ``0`` and two empty strings.
    """

    # Result returned as-is whenever nothing can be matched
    res_dict = {
        "num_matches": 0,
        "pos_matches": "",
        "name_matches": ""
    }

    # Retrieve from conversion matrix all gene names associated to ensembl_id.
    # Iterating over the selected rows (instead of calling .item()) copes with
    # ids that are absent from the matrix or listed on more than one row.
    indexer = conv_mat["ensembl_id"] == ens_id
    all_names = set()
    for names in conv_mat.loc[indexer, "gene_name"].dropna():
        all_names.update(str(names).split(";"))

    # "nan" is the text form of a missing value and "" is left behind by
    # stray separators: neither is a gene name to match on.
    all_names -= {"", "nan"}

    # Return empty object if no valid name is available
    if not all_names:
        return res_dict

    # If there is an intersection between filtered snoRNA gene names and...
    # ... tumoral snoRNA gene names, update information
    pos_matches = []
    name_matches = set()
    for i, group in enumerate(tum_list):
        if not all_names.isdisjoint(group):
            pos_matches.append(str(i + 2))  # +2 to match excel line
            name_matches.update(group)

    # Convert to strings; names are sorted because sets have no stable order
    res_dict["num_matches"] = len(pos_matches)
    res_dict["pos_matches"] = ";".join(pos_matches)
    res_dict["name_matches"] = ";".join(sorted(name_matches))

    return res_dict


Apply the function to all snoRNA ensembl_ids in the filtered list.\
Store the results in a dataframe with genes on the rows, information on the columns.

In [None]:
# Map every filtered ensembl_id to its tumor-reference summary
match_dict = dict(
    (ens_id, get_tumor_mapping(ens_id, list_tumor, snodb_conv))
    for ens_id in list_filtr
)

# Build the table with snoRNAs as columns, then flip it so that each snoRNA
# is a row and each summary field is a column
match_ds = pd.DataFrame(match_dict).transpose()

The number of unique values in the "name_matches" column should roughly
correspond to the number of distinct genes; the true number of genes could be
slightly lower, since further manual curation might be needed to merge some clusters.

In [None]:
# Count the distinct name clusters. snoRNAs without any match carry an empty
# string in this column, and that empty string is counted as one value too.
match_ds["name_matches"].nunique()

144

# Export

Export the tumor correlations in .csv format.

In [None]:
# Write the table next to the input files; since DATA_PATH ends with the
# "data_" prefix the file is named "data_tumor_matches.csv". The index
# (the ensembl_ids) is written as the first column.
match_ds.to_csv(DATA_PATH + "tumor_matches.csv")