# Preliminary steps


In [19]:
import os

from google.colab import drive 
import pandas as pd

Allow the notebook to access Google Drive so that the input files can be loaded; mounting might take some time. It is assumed that you added the "DataMining_project" folder to your Drive as a shortcut (right-click the folder > "Add shortcut to Drive").

In [20]:
# Mount Google Drive under /content/gdrive so the project files can be read.
# Re-running this cell when the drive is already mounted only prints a notice.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Define constants

In [21]:
# Root of the shared project folder. The path is absolute and anchored to the
# mount point used by drive.mount('/content/gdrive'), so it stays valid even if
# the working directory of the kernel is changed.
DRIVE_PATH = "/content/gdrive/MyDrive/DataMining_project/"

# Input files
EXP_SNORNA = DRIVE_PATH + "data/data_expansion_input/exp_snorna.txt"
EXP_DATASET = DRIVE_PATH + "data/data_expansion_input/filtered_dataset.csv"
CANON_INT = DRIVE_PATH + "data/data_sno_canonical/ensg_ensg_interactions.csv"

# Directories whose files are the networks to validate
NET_INT_PATH = DRIVE_PATH + "data/data_parsed_networks/networks"
MERGED_INT_PATH = DRIVE_PATH + "data/data_parsed_networks/single_expansions"
INPUT_DIRS = [NET_INT_PATH, MERGED_INT_PATH]

# Output location of the summary table
OUTPUT_DIR = DRIVE_PATH + "data/data_net_cutoffs/"
OUTPUT_SUMMARY = OUTPUT_DIR + "summary_table.csv"


# Function definitions

In [22]:
def make_ref_dict(
    canon_int: pd.DataFrame,
    exp_dataf: pd.DataFrame,
    exp_min: float,
    exp_frac: float,
    info_cols: dict = dict(),
    ):
    """Build the reference of canonical interactions that survive filtering.

    A row label of ``exp_dataf`` is kept when at least
    ``floor(n_columns * exp_frac)`` of its values are ``>= exp_min``.
    For every snoRNA of ``canon_int`` whose id is a kept label, the
    target -> ids mapping is restricted to kept ids; targets left without
    ids, and snoRNAs left without targets, are dropped.

    Parameters
    ----------
    canon_int : pd.DataFrame
        Canonical interactions. Must contain the snoRNA id column and the
        mapping column (see ``info_cols``). The mapping column holds, as
        text, a Python dict literal ``{target_name: [id, ...]}``.
    exp_dataf : pd.DataFrame
        Numeric table whose index holds the ids that snoRNAs and targets
        are matched against (presumably genes x samples -- confirm).
    exp_min : float
        Minimum value for a cell to count towards ``exp_frac``.
    exp_frac : float
        Fraction of columns that must reach ``exp_min``.
    info_cols : dict, optional
        Overrides for the default column names, keyed by ``"gen_col"``,
        ``"lst_col"`` and ``"map_col"``. Other keys are ignored.
        ``"lst_col"`` is accepted but not used by this function.

    Returns
    -------
    tuple[dict, int]
        ``(filt_dict, max_canon)`` where ``filt_dict[sno]`` is
        ``{"lookup": [all kept ids], "interactors": {target: [kept ids]}}``
        and ``max_canon`` is the total number of kept targets over all
        snoRNAs.

    Raises
    ------
    KeyError
        If a required column is missing from ``canon_int``.
    ValueError, SyntaxError
        If a mapping cell is not a plain Python literal.
    """
    # Imported here so the cell does not depend on an import elsewhere.
    import ast

    # Default column names, selectively overridden by the caller
    cols = {
        "gen_col": "gene_of_interest",
        "lst_col": "ensg_target_list",
        "map_col": "target_std_name_to_ensg_map",
    }
    cols.update({key: info_cols[key] for key in cols if key in info_cols})

    # Keep the row labels reaching exp_min in enough columns
    req_samples = exp_dataf.shape[1] * exp_frac // 1
    n_passing = (exp_dataf >= exp_min).sum(axis=1)
    # A set gives constant-time membership tests in the loops below
    expressed = set(n_passing[n_passing >= req_samples].index)

    # Dict to populate with filtered genes
    filt_dict = dict()
    # Maximum number of canonical interactions given the filtered df
    max_canon = 0
    # snoRNA ids already handled: a repeated id must not be counted twice
    seen = set()

    sno_ids = canon_int[cols["gen_col"]].tolist()
    raw_maps = canon_int[cols["map_col"]].tolist()

    for sno, raw_map in zip(sno_ids, raw_maps):

        # Only the first row of a repeated snoRNA id is considered
        if sno in seen:
            continue
        seen.add(sno)

        # Skip if the snoRNA itself was filtered out
        if sno not in expressed:
            continue

        # The mapping is stored as text. literal_eval only accepts Python
        # literals, so file content can never execute code (eval could).
        targets = ast.literal_eval(raw_map)

        # Keep, for each target, only the ids that passed the filter;
        # targets left with no id are dropped
        sno_dict = {}
        for target, ens_ids in targets.items():
            kept = [ens_id for ens_id in ens_ids if ens_id in expressed]
            if kept:
                sno_dict[target] = kept

        # If all interactors were removed, skip the snoRNA
        if not sno_dict:
            continue

        # One canonical interaction per surviving target
        max_canon += len(sno_dict)

        filt_dict[sno] = {
            # Flat list of every kept id, in target order
            "lookup": [ens_id for ids in sno_dict.values() for ens_id in ids],
            "interactors": sno_dict,
        }

    return filt_dict, max_canon


In [23]:
def validate_interactions(
    to_valid: pd.DataFrame,
    ref_dict: dict,
    list_thresh: list,
    ):
    """Count, per threshold, the canonical interactions found in a network.

    For each snoRNA of ``ref_dict`` the edges touching it are scanned in
    row order. The first edge reaching any id of a canonical target
    consumes that target (all its ids stop being searched) and is counted
    for every threshold its strength strictly exceeds.

    Parameters
    ----------
    to_valid : pd.DataFrame
        Edge list with columns ``"x"`` and ``"y"`` (edge endpoints) and a
        strength column: ``"frel"`` when present, otherwise ``"corr"``.
    ref_dict : dict
        ``{sno: {"lookup": [ids], "interactors": {target: [ids]}}}``.
        It is not modified.
    list_thresh : list
        Strength thresholds to evaluate.

    Returns
    -------
    dict
        ``{threshold: number of canonical interactions found}``.

    Raises
    ------
    KeyError
        If ``"x"``, ``"y"`` or the strength column is missing.
    """
    # Initialize results container
    tot_canonical = {thr: 0 for thr in list_thresh}

    # Pick the strength column once, instead of probing every row with a
    # try/except that would also hide unrelated errors
    strength_col = "frel" if "frel" in to_valid.columns else "corr"

    # Iterate for each snoRNA
    for sno, info in ref_dict.items():

        # Ids still to be found; a fresh set so ref_dict is left untouched
        remaining = set(info["lookup"])
        interactors = info["interactors"]

        # Subset the dataframe to the edges touching this snoRNA
        cur_df = to_valid[(to_valid["x"] == sno) | (to_valid["y"] == sno)]
        edges = zip(cur_df["x"], cur_df["y"], cur_df[strength_col])

        for x_val, y_val, strength in edges:

            # Every canonical target has already been found
            if not remaining:
                break

            # The interactor is whichever endpoint is not the snoRNA
            interactor = x_val if x_val != sno else y_val

            # Not canonical, or its target was already consumed
            if interactor not in remaining:
                continue

            # Consume the first target this id belongs to: none of its
            # ids should be matched again
            for ens_ids in interactors.values():
                if interactor in ens_ids:
                    remaining.difference_update(ens_ids)
                    break

            # Count the interaction for each threshold it strictly exceeds
            for thr in tot_canonical:
                if strength > thr:
                    tot_canonical[thr] += 1

    return tot_canonical


In [24]:
def full_validation(
    canon_int: pd.DataFrame,
    exp_dataf: pd.DataFrame,
    to_valid_paths: list,
    post_filt_grid: list,
    threshold_grid: list,
    decimals: int = 2,
    ):
    """Validate every network against every post-filtering setting.

    Parameters
    ----------
    canon_int : pd.DataFrame
        Canonical interactions, forwarded to ``make_ref_dict``.
    exp_dataf : pd.DataFrame
        Expression table, forwarded to ``make_ref_dict``.
    to_valid_paths : list
        Paths of the network CSV files (read with the first column as
        index).
    post_filt_grid : list
        Hashable settings, each unpacked as the trailing positional
        arguments of ``make_ref_dict`` (``exp_min, exp_frac``).
    threshold_grid : list
        Strength thresholds, forwarded to ``validate_interactions``.
    decimals : int, optional
        Rounding applied to the reported fractions.

    Returns
    -------
    dict
        ``{post_filt: {network_name: {"tot_canon": {thr: int},
        "found_canon": {thr: int}, "fraction_canon": {thr: float}}}}``.
        ``network_name`` is the file name up to its first dot, so two
        files sharing that prefix overwrite each other. The fraction is
        NaN when no canonical interaction survives the post filter.
    """
    # The reference only depends on the post filter: build each one once
    references = {
        post_filt: make_ref_dict(canon_int, exp_dataf, *post_filt)
        for post_filt in post_filt_grid
    }

    # Results container
    validations = {post_filt: dict() for post_filt in post_filt_grid}

    # Networks form the outer loop so every file is read a single time
    # instead of once per post filter
    for path in to_valid_paths:
        network = pd.read_csv(path, index_col=0)
        net_name = os.path.basename(path).split(".")[0]

        for post_filt, (ref_dict, max_canon) in references.items():

            # Number of canonical interactions found at each threshold
            num_canon = validate_interactions(network, ref_dict, threshold_grid)

            validations[post_filt][net_name] = {
                "tot_canon": {thr: max_canon for thr in threshold_grid},
                "found_canon": num_canon,
                "fraction_canon": {
                    # With nothing left to recover the fraction is
                    # undefined; report NaN rather than dividing by zero
                    thr: (round(val / max_canon, decimals)
                          if max_canon else float("nan"))
                    for thr, val in num_canon.items()
                },
            }

    return validations


In [25]:
def build_res_table(validation_res: dict):
    """Flatten the nested validation results into a summary table.

    Parameters
    ----------
    validation_res : dict
        ``{post_filter: {network: {"tot_canon": {thr: int},
        "found_canon": {thr: int}, "fraction_canon": {thr: float}}}}``.

    Returns
    -------
    pd.DataFrame
        One row per (post filter, network, threshold). Thresholds with no
        interaction found are omitted, except the lowest threshold of
        each network, which is always reported. The columns are always
        present, even when there are no rows.
    """
    OUT_COLS = [
        "Post filter",
        "Network",
        "Threshold",
        "Total canonical",
        "Found canonical",
        "Fraction canonical",
    ]

    records = []
    # Read the argument, not a notebook-level variable
    for post_k, post_val in validation_res.items():
        for netw_k, netw_val in post_val.items():
            found = netw_val["found_canon"]
            # Lowest threshold of this network, computed once per network
            lowest_thr = min(found, default=None)

            for thresh, n_found in found.items():
                # Drop empty rows, but keep the lowest threshold so every
                # network shows up in the table
                if n_found == 0 and thresh != lowest_thr:
                    continue

                records.append({
                    OUT_COLS[0]: post_k,
                    OUT_COLS[1]: netw_k,
                    OUT_COLS[2]: thresh,
                    OUT_COLS[3]: netw_val["tot_canon"][thresh],
                    OUT_COLS[4]: n_found,
                    OUT_COLS[5]: netw_val["fraction_canon"][thresh],
                })

    return pd.DataFrame(records, columns=OUT_COLS)

# Perform analyses

Load canonical interactions and expression dataframe

In [26]:
# Both tables are read with the first row as header and the first column as
# index. The index of exp_dataf matters downstream: make_ref_dict matches its
# labels against the snoRNA and target ids of the canonical interactions.
canonical_interactions = pd.read_csv(CANON_INT, index_col=0, header=0)
exp_dataf = pd.read_csv(EXP_DATASET, index_col=0, header=0)

Define strength thresholds to test

In [27]:
# Cut-offs applied to the edge strength ("frel", or "corr" when "frel" is
# absent). validate_interactions counts an edge for a cut-off only when its
# strength is strictly greater than it.
strength_thresholds = [0, 0.2, 0.5, 0.9]

Define post filtering thresholds

In [28]:
# Each tuple is unpacked into make_ref_dict as (exp_min, exp_frac): a row of
# the expression table is kept when at least floor(n_columns * exp_frac) of
# its values are >= exp_min. Tuples are also used as keys of the results.
post_filt_thresholds = [(1, 0.2), (5, 0.4), (10, 0.4)]

Define list of paths to networks to validate

In [29]:
# One path per regular file found directly inside each input directory
# (sub-directories are skipped). Entries are sorted because os.listdir
# returns them in arbitrary order, which would otherwise make the row order
# of the summary table change from run to run.
val_paths = [
    os.path.join(input_dir, file_name)
    for input_dir in INPUT_DIRS
    for file_name in sorted(os.listdir(input_dir))
    if os.path.isfile(os.path.join(input_dir, file_name))
]

Compute validations

In [30]:
# Validate every network under every post-filtering setting. The result is a
# nested dict: post filter -> network name -> {"tot_canon", "found_canon",
# "fraction_canon"}, each of the three keyed by strength threshold.
res_validations = full_validation(
    canonical_interactions,
    exp_dataf, 
    val_paths,
    post_filt_thresholds,
    strength_thresholds,
    )

Build and save result table

In [31]:
# to_csv does not create missing folders: make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Flatten the nested results into one row per (post filter, network,
# threshold) and save them; the row index is written as the first column.
res_table = build_res_table(res_validations)
res_table.to_csv(OUTPUT_SUMMARY)