# Preliminary steps


In [1]:
from collections import Counter

from google.colab import drive
import os 
import pandas as pd

In [12]:
# Root of the project data on the mounted drive. The path is relative, so it
# resolves against the current working directory.
DATA_PATH = "gdrive/MyDrive/DataMining_project/data/"

# Full GO annotation table and the two folders holding the host lists.
FULL_GO_PATH = f"{DATA_PATH}data_GSEA/total_go_annot.tsv"
TUMOR_PATH = f"{DATA_PATH}data_GSEA/tumor_related/"
TISSUE_PATH = f"{DATA_PATH}data_GSEA/tissue_specific/"
BOTH_PATHS = [TUMOR_PATH, TISSUE_PATH]

# Files whose first output column is kept when written out; for every other
# file that column is dropped before saving.
REFERENCE_FILES = ["total_snodb_host.txt", "count101_host.txt"]

# Output folders for the two kinds of background list.
OUT_PATH_DUP = f"{DATA_PATH}data_GSEA/duplicate/"
OUT_PATH_NON_DUP = f"{DATA_PATH}data_GSEA/non_duplicate/"

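Allow the notebook to access Google Drive so that the data files can be loaded; mounting may take a moment. This assumes that the "DataMining_project" folder has been added to your drive as a shortcut (right-click the folder > "Add shortcut to Drive"). The data paths above are relative, so the notebook must run from the directory that contains the `gdrive` mount point (here `/content`).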

In [3]:
# Mount Google Drive at /content/gdrive so the data files can be read.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [19]:
# Load the tab-separated GO annotation table once; the background builders
# below look hosts up in its "ensembl_gene_id" and "go_id" columns.
go_annot = pd.read_csv(FULL_GO_PATH, sep="\t")

# Function definition

Function to create background lists WITHOUT duplicates (aside from hostless and not annotated)

In [25]:
def create_bg(file_path: str, go_annot: pd.DataFrame, verbose: bool = True):
    """Build a (GO term, host id) background table with each host counted once.

    The file at ``file_path`` is read as a header-less CSV and its first
    column is taken as the list of host identifiers. Identifiers are
    de-duplicated in order of first appearance, so the returned rows (and the
    ``no_go.N`` numbering) are the same on every run.

    For every distinct host:

    * each non-missing ``go_id`` found for it in ``go_annot`` yields a row
      ``[go_id, host_id]``;
    * a host with no usable GO term yields a single placeholder row:
      ``["Host-less", "hl.<n>"]`` when the identifier is ``"hl"``, otherwise
      ``["Not-annotated host", "no_go.<n>"]`` (the original identifier is
      replaced by the running counter).

    NOTE(review): because identifiers are de-duplicated, ``"hl"`` occurs at
    most once, so the host-less counter never goes past 0. Confirm whether
    repeated host-less entries were meant to be kept here.

    Args:
        file_path: Path of the host list (one identifier per line).
        go_annot: Annotation table with ``ensembl_gene_id`` and ``go_id``
            columns.
        verbose: When True (default), print the file path and every host
            that has no GO term, as the function always did.

    Returns:
        DataFrame with two unnamed columns: 0 = term/placeholder, 1 = id.
    """
    if verbose:
        print(file_path)

    # dict.fromkeys removes repeats but, unlike set(), keeps a stable order.
    host_genes = list(
        dict.fromkeys(pd.read_csv(file_path, header=None)[0].to_list())
    )

    # Index the annotation table once instead of scanning it for every host.
    # Rows with a missing gene id or GO term carry no usable annotation.
    annotated = go_annot.dropna(subset=["ensembl_gene_id", "go_id"])
    terms_by_gene = {}
    for gene, term in zip(annotated["ensembl_gene_id"], annotated["go_id"]):
        terms_by_gene.setdefault(gene, []).append(term)

    full_list = []
    hl_counter = 0
    not_ann_counter = 0

    for ens_id in host_genes:
        got = terms_by_gene.get(ens_id, [])

        if got:
            full_list.extend([term, ens_id] for term in got)
            continue

        if verbose:
            print(ens_id)

        if ens_id == "hl":
            full_list.append(["Host-less", ens_id + "." + str(hl_counter)])
            hl_counter += 1
        else:
            full_list.append(
                ["Not-annotated host", "no_go." + str(not_ann_counter)]
            )
            not_ann_counter += 1

    return pd.DataFrame(full_list)
    

Function to create background lists WITH duplicates allowed

In [21]:
def custom_bg(file_path: str, go_annot: pd.DataFrame):
    """Build a (GO term, host id) background table that keeps repeated hosts.

    The file at ``file_path`` is read as a header-less CSV and the
    occurrences of each identifier in its first column are counted. A host
    seen ``k`` times is expanded into ``k`` distinct ids ``<host>.0`` ...
    ``<host>.<k-1>``, and every one of them is paired with each non-missing
    ``go_id`` found for that host in ``go_annot``.

    A host with no usable GO term (absent from ``go_annot``, or present only
    with missing ``go_id`` values) is paired with a placeholder instead:
    ``"Host-less"`` when the identifier is ``"hl"``, otherwise
    ``"Not-annotated host"``. Missing GO terms are filtered before this
    check so that such hosts are labelled rather than silently dropped.

    Args:
        file_path: Path of the host list (one identifier per line).
        go_annot: Annotation table with ``ensembl_gene_id`` and ``go_id``
            columns.

    Returns:
        DataFrame with two unnamed columns: 0 = term/placeholder, 1 = id.
        Rows are grouped by host in order of first appearance, then by term,
        then by copy number.
    """
    host_counts = Counter(pd.read_csv(file_path, header=None)[0].to_list())

    # Index the annotation table once instead of scanning it for every host.
    # Rows with a missing gene id or GO term carry no usable annotation.
    annotated = go_annot.dropna(subset=["ensembl_gene_id", "go_id"])
    terms_by_gene = {}
    for gene, term in zip(annotated["ensembl_gene_id"], annotated["go_id"]):
        terms_by_gene.setdefault(gene, []).append(term)

    full_list = []

    for ens_id, id_count in host_counts.items():
        # One distinct id per occurrence of the host in the input file.
        copies = [ens_id + "." + str(c) for c in range(id_count)]

        got = terms_by_gene.get(ens_id)
        if not got:
            got = ["Host-less" if ens_id == "hl" else "Not-annotated host"]

        full_list.extend(
            [term, copy_id] for term in got for copy_id in copies
        )

    return pd.DataFrame(full_list)
    

# Generate files

Create lists without duplicates

In [None]:
# Write one background file per input list, with each host counted once.
# Both input folders write into OUT_PATH_NON_DUP, so two inputs that share a
# base name overwrite each other.
os.makedirs(OUT_PATH_NON_DUP, exist_ok=True)  # to_csv does not create folders

for fold in BOTH_PATHS:
    # Sorted so the processing order does not depend on the file system.
    for path in sorted(os.listdir(fold)):
        file_path = os.path.join(fold, path)

        # Skip sub-directories (e.g. notebook checkpoint folders).
        if not os.path.isfile(file_path):
            continue

        file_mat = create_bg(file_path, go_annot)

        # Only the reference files keep the term column; every other list is
        # reduced to its unique ids.
        if path not in REFERENCE_FILES:
            file_mat = file_mat.drop(columns=0).drop_duplicates()

        file_mat.to_csv(
            OUT_PATH_NON_DUP + path.split(".")[0] + ".tsv",
            header=False,
            index=False,
            sep="\t",
        )

Create lists with duplicates

In [27]:
# Write one background file per input list, keeping repeated hosts.
# Both input folders write into OUT_PATH_DUP, so two inputs that share a
# base name overwrite each other.
os.makedirs(OUT_PATH_DUP, exist_ok=True)  # to_csv does not create folders

for fold in BOTH_PATHS:
    # Sorted so the processing order does not depend on the file system.
    for path in sorted(os.listdir(fold)):
        file_path = os.path.join(fold, path)

        # Skip sub-directories (e.g. notebook checkpoint folders).
        if not os.path.isfile(file_path):
            continue

        file_mat = custom_bg(file_path, go_annot)

        # Only the reference files keep the term column; every other list is
        # reduced to its unique ids.
        if path not in REFERENCE_FILES:
            file_mat = file_mat.drop(columns=0).drop_duplicates()

        file_mat.to_csv(
            OUT_PATH_DUP + path.split(".")[0] + ".tsv",
            header=False,
            index=False,
            sep="\t",
        )
