In [15]:
import pandas as pd
import os
from google.colab import drive 

In [16]:
# Locations of the inputs/outputs on Google Drive.
# All paths are relative, so they resolve against the current working
# directory (the drive itself is mounted at /content/gdrive).
DATA_PATH = "gdrive/MyDrive/DataMining_project/data/"
OUTPUT_DIR = "gdrive/MyDrive/temp_outputs_dm_project"

# DATA_PATH already ends with "/": sub-paths are appended without a leading
# slash so that no "//" ends up in the resulting path.
SNO_DB_PATH = DATA_PATH + "data_snodb/snoDB_All_V2.0.tsv"
SNO_OF_INTEREST_PATH = DATA_PATH + "data_tumor_snorna/snoRNA_processed_final.tsv"
ONE_VS_ALL_PATH = DATA_PATH + "data_differential_expression/one_vs_all/csv_format"
DATASET101_PATH = DATA_PATH + "data_reanalysis/count_v101.xlsx"

In [17]:
# Mount Google Drive under /content/gdrive (google.colab helper, Colab only).
# The data paths configured above all start with "gdrive/MyDrive".
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Useful functions

```remove_duplicates``` returns a list without duplicate items

In [None]:
def remove_duplicates(list):
  """Return a new list with duplicates removed, keeping first-seen order.

  Parameters
  ----------
  list : iterable
    Items to de-duplicate. The parameter name shadows the builtin ``list``;
    it is kept unchanged so existing keyword callers keep working.

  Returns
  -------
  A new list containing the first occurrence of every distinct item.
  """
  # materialise once so generators are not half-consumed by the fast path
  items = [*list]
  try:
    # dicts preserve insertion order: linear-time de-duplication
    return [*dict.fromkeys(items)]
  except TypeError:
    # unhashable items (e.g. nested lists): fall back to equality scanning
    unique = []
    for item in items:
      if item not in unique:
        unique.append(item)
    return unique

```flatten_list``` returns a list of individual items starting from a list of lists

In [None]:
def flatten_list(l):
  """Concatenate the items of every sub-iterable of `l` into one flat list.

  Only one level of nesting is removed; the order of the items is preserved.
  """
  flat = []
  for sublist in l:
    flat.extend(sublist)
  return flat

```split_items``` returns a list of lists of ensg_id from an initial complex total_list: each non-empty string entry becomes one inner list (non-string and empty entries are skipped). The parameter ```splitting_term``` sets the separator used for splitting

In [None]:
def split_items(total_list, splitting_term):
  """Split every string entry of `total_list` on `splitting_term`.

  Returns a list of lists: an entry containing the separator contributes its
  split pieces, any other non-empty string contributes a one-item list.
  Non-string entries and empty strings are skipped.
  """
  pieces = []
  for entry in total_list:
    # anything that is not a string is ignored
    if not isinstance(entry, str):
      continue
    if splitting_term in entry:
      pieces.append(entry.split(splitting_term))
    elif entry != "":
      pieces.append([entry])
  return pieces

# Preliminary steps

Load data into dataframes

In [None]:
# tab-separated tables: the tumor-related snoRNAs and the whole snoDB
# (for snoDB the first column is used as the index)
sno_of_interest = pd.read_csv(SNO_OF_INTEREST_PATH, sep="\t")
sno_db = pd.read_csv(SNO_DB_PATH, sep="\t", index_col=0)

# count table of dataset 101 (Excel workbook)
count_101 = pd.read_excel(DATASET101_PATH)

get a clean list of the sno genes present in the entire snoDB. By clean I mean a list of *single* labels, with no separator

In [None]:
# "ensembl_id" cells may hold several ids joined by ";":
# split every cell, flatten the pieces and keep each id only once
# (de-duplication also collapses repeated void values)
total_sno_list = remove_duplicates(
  flatten_list(
    split_items(list(sno_db["ensembl_id"]), ";")
  )
)

### Create a dictionary
in the format
{key : gene_id, value : host_id}
of every single sno present in snoDB. The starting list is indeed ```total_sno_list```

Whenever a sno is not associated to a host, its value in the dictionary becomes ```hl```

In [None]:
# dictionary that connects gene to host
# dictionary that connects gene to host: {sno ensembl id : host gene id}
gene_host = {}

# set of the known sno ids, for constant-time membership tests
known_snos = set(total_sno_list)

# Single pass over snoDB (instead of scanning the whole table once per sno).
# The "ensembl_id" field often stores multiple ENSG ids concatenated with
# ";", so each cell is split and every id is matched exactly (a substring
# test would make an empty id match every row).
for ensembl_ids, host_id in zip(sno_db["ensembl_id"], sno_db["host_gene_id"]):
  # rows without a textual ensembl id cannot contain any sno
  if not isinstance(ensembl_ids, str):
    continue

  # a missing host is not a string: such snos are labelled "hl"
  host = host_id if isinstance(host_id, str) else "hl"

  for sno in ensembl_ids.split(";"):
    if sno in known_snos:
      # when a sno appears in several rows the last row wins
      gene_host[sno] = host

# Get hosts for lists of interest

### SnoDB
Retrieve a complete list of hosts (also counting double values and missing hosts) starting from the complete list of sno present in snoDB

In [None]:
# host of every sno in snoDB, one entry per sno id (a snoRNA listed with
# multiple ensembl ids contributes one entry per id, so hosts may repeat)
total_hosts_from_snodb = [gene_host[sno] for sno in total_sno_list]

len(total_hosts_from_snodb)

995

### Tumor related

get a clean list of the sno genes of which we are interested in (tumor related). By clean I mean a list of *single* labels, with no separator 

Also, retrieve every host from this list through the dictionary

In [None]:
# tumor-related table: ids in one cell are separated by "/";
# split, flatten and de-duplicate (also collapses repeated void values)
sno_list = remove_duplicates(
  flatten_list(
    split_items(list(sno_of_interest["ensembl_id"]), "/")
  )
)

# keep only the snos that are known to snoDB and look up their host
host_of_interest = [
  gene_host[sno]
  for sno in sno_list
  if sno in total_sno_list
]

len(host_of_interest)

303

### Data from GSEA
Retrieve hosts for each tissue from the one_vs_all analyses

In [None]:
# dictionary (key : tissue, val : [hosts of differentially expressed snos])
tissue_DE_sno_host = {}

# read all files
for f in os.listdir(ONE_VS_ALL_PATH):
  complete_file_path = os.path.join(ONE_VS_ALL_PATH, f)
  
  if os.path.isfile(complete_file_path):
    # get name of the tissue
    tissue = f.split("_")[0]
    tissue_DE_sno_host[tissue] = []

    dataframe = pd.read_csv(complete_file_path)

    for index, row in dataframe.iterrows():
      # the gene_id corresponds to one of the sno stored into snodb
      if row.loc["gene_id"] in total_sno_list:
        tissue_DE_sno_host[tissue].append(gene_host[row.loc["gene_id"]])

for tissue in tissue_DE_sno_host.keys():
  print(tissue, len(tissue_DE_sno_host[tissue]))

Brain 311
Testis 196
Skeletalmuscle 48
Liver 44
Prostate 14
Ovary 155
Breast 33


### Data from count101
Retrieve hosts for the overall dataset 101

In [None]:
all_genes = list(count_101["gene_id"])

# a gene of dataset 101 is a sno when it is listed in snoDB:
# collect the host of each of those genes
host_list_101 = [
  gene_host[gene]
  for gene in all_genes
  if gene in total_sno_list
]

len(host_list_101)

976

# Save lists to files

In [None]:
def _write_host_list(file_name, hosts):
  """Write `hosts` to OUTPUT_DIR/file_name, one host per line (no trailing newline)."""
  with open((OUTPUT_DIR + "/" + file_name), 'w') as fp:
    fp.write('\n'.join(hosts))

# create the output folder if needed, otherwise open() fails on a missing directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

_write_host_list("tumor_host.txt", host_of_interest)
_write_host_list("total_host.txt", total_hosts_from_snodb)

# one file per tissue of the one_vs_all analyses
for tissue, hosts in tissue_DE_sno_host.items():
  _write_host_list(tissue + "_DE_hosts.txt", hosts)

_write_host_list("count101_host.txt", host_list_101)

In [19]:
# dump the sno -> host dictionary as its Python text representation,
# followed by a newline
with open(OUTPUT_DIR + "/gene_host_link.txt", 'w') as fp:
    fp.write(str(gene_host) + "\n")