In [None]:
import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import requests
from sklearn.model_selection import train_test_split
import torch
import h5py
import ast

In [None]:
# Data processing for the therapeutic target prediction task.
# Root folder for every input/output file used below, relative to the
# notebook's working directory (one level up, then data/therapeutic_target_data).
data_folder = "../data/therapeutic_target_data"

In [None]:
# Collect every distinct gene symbol that appears in the first two
# whitespace-separated columns of the global PPI edge list, keeping the order
# in which each symbol is first seen.
ppi_edgelist_path = os.path.join(data_folder, "networks/global_ppi_edgelist.txt")

# A dict serves as an insertion-ordered set: membership is O(1) instead of
# re-scanning a growing list for both endpoints of every edge.
seen_genes = {}
with open(ppi_edgelist_path, "r") as ppi_file:
    for line in ppi_file:
        genes = line.strip().split()
        if len(genes) < 2:
            # Blank or truncated line: there is no edge to read from it.
            continue
        seen_genes[genes[0]] = None
        seen_genes[genes[1]] = None

unique_genes = list(seen_genes)


In [None]:
def get_uniprot_id(gene_symbol):
    """Look up the primary UniProt accession for a gene symbol.

    Queries the UniProtKB search endpoint with ``gene:<gene_symbol>`` and
    takes the first entry of the JSON ``results`` list.

    Args:
        gene_symbol: Gene symbol to search for.

    Returns:
        str: One of
            * the ``primaryAccession`` of the first search hit,
            * ``"No UniProt ID found for this gene symbol."`` when the search
              returns no results,
            * ``"Error: <status_code>"`` when the HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not answer within the timeout.
    """
    # NOTE(review): the query is not restricted by organism or review status,
    # so the first hit is whatever UniProt ranks first -- confirm that is intended.
    url = "https://rest.uniprot.org/uniprotkb/search"
    # Passing the query through `params` lets requests URL-encode symbols that
    # contain reserved characters instead of splicing them into the URL raw.
    query_params = {"query": f"gene:{gene_symbol}", "format": "json"}
    # A timeout keeps one stalled connection from hanging the whole mapping loop.
    response = requests.get(url, params=query_params, timeout=30)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    results = response.json().get("results")
    if not results:
        return "No UniProt ID found for this gene symbol."

    # Extract the first entry's UniProt ID.
    return results[0]["primaryAccession"]

In [None]:
# Map every gene symbol from the PPI edge list to the value returned by
# get_uniprot_id (one HTTP request per gene). Values are either a UniProt
# accession or one of that function's sentinel/error strings.
protein_dict = {gene: get_uniprot_id(gene) for gene in tqdm(unique_genes)}

# Optional: persist the gene -> UniProt mapping.
#with open("protein_dict.pkl", "wb") as saved_file:
    #pickle.dump(protein_dict, saved_file)


In [None]:
# Gene symbols whose UniProt search came back with no results.
unmatched_genes = [
    gene
    for gene, uniprot_id in protein_dict.items()
    if uniprot_id == "No UniProt ID found for this gene symbol."
]
    

In [None]:
def get_amino_acid_sequence(uniprot_id):
    """Download the FASTA record for a UniProt ID and return its sequence.

    Args:
        uniprot_id: Identifier interpolated into the UniProt FASTA URL.

    Returns:
        str: Every line of the response after the first (header) line, joined
        into a single string, or the literal ``"Error"`` when the HTTP status
        is not 200.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not answer within the timeout.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    # A timeout keeps one stalled download from hanging the whole fetch loop.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        return "Error"

    # Skip the first line (FASTA header) and join the remaining lines.
    return "".join(response.text.splitlines()[1:])


In [None]:
# Map each UniProt ID to the value returned by get_amino_acid_sequence.
aa_dict = {}
uniprot_ids = protein_dict.values()

# Several genes can resolve to the same UniProt ID; dict.fromkeys drops the
# repeats (keeping first-seen order) so each ID is downloaded only once.
# NOTE(review): only the "no result" sentinel is filtered here, so any
# "Error: <status>" string stored in protein_dict is still passed on as an ID.
for uni_id in tqdm(dict.fromkeys(uniprot_ids)):
    if uni_id != "No UniProt ID found for this gene symbol.":
        aa_sequence = get_amino_acid_sequence(uni_id)
        aa_dict[uni_id] = aa_sequence

#Saving the dictionary that maps proteins to amino acids
#with open("aa_dict.pkl", "wb") as saved_file:
    #pickle.dump(aa_dict, saved_file)

In [None]:
# Tabular view of aa_dict: one row per (key, sequence) pair.
# NOTE(review): the keys of aa_dict are UniProt IDs, so the "Gene" column
# holds protein accessions rather than gene symbols.
columns = ["Gene", "AA Sequence"]
genes = list(aa_dict)
sequences = list(aa_dict.values())

sequence_df = pd.DataFrame(list(aa_dict.items()), columns=columns)

In [None]:
# Proteins from `all_proteins` that have no entry in aa_dict.
# NOTE(review): `all_proteins` is not defined in any cell visible here --
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
not_included = [protein for protein in all_proteins if protein not in aa_dict]

In [None]:
# Checking for duplicate proteins: the first gene seen for a UniProt ID puts it
# in unique_proteins; every later gene mapping to the same ID is recorded in
# duplicates_dict (gene -> protein).
unique_proteins = []
duplicates_dict = {}
# Set mirror of unique_proteins so the membership test is O(1) rather than a
# scan of the list for every gene.
seen_proteins = set()

for gene, protein in protein_dict.items():
    if protein == "No UniProt ID found for this gene symbol.":
        continue
    if protein in seen_proteins:
        duplicates_dict[gene] = protein
    else:
        seen_proteins.add(protein)
        unique_proteins.append(protein)


In [None]:
# Dictionary that maps genes to amino acid sequences (gene -> UniProt ID -> sequence).
gene_aa_dict = {}
# Genes that have a protein_dict entry but no usable sequence; kept so the
# dropped genes can be inspected instead of disappearing silently.
genes_without_sequence = []

for gene in unique_genes:
    uniprot_id = protein_dict[gene]
    if uniprot_id == "No UniProt ID found for this gene symbol.":
        continue

    aa_sequence = aa_dict.get(uniprot_id)
    # get_uniprot_id returns "Error: <status>" and get_amino_acid_sequence
    # returns "Error" on a failed request; neither is a real sequence, so such
    # genes are left out rather than stored as if they were.
    lookup_failed = (
        aa_sequence is None
        or aa_sequence == "Error"
        or uniprot_id.startswith("Error: ")
    )
    if lookup_failed:
        genes_without_sequence.append(gene)
        continue

    gene_aa_dict[gene] = aa_sequence

In [None]:
# Persist the gene -> amino-acid-sequence dictionary under the data folder.
gene_aa_dict_path = os.path.join(data_folder, "gene_aa_dict.pkl")

with open(gene_aa_dict_path, "wb") as pickle_out:
    pickle.dump(gene_aa_dict, pickle_out)


In [None]:
# Load the PINNACLE labels dictionary and index the cell-type contexts.
labels_dict_path = os.path.join(data_folder, "pinnacle_embeds/pinnacle_labels_dict.txt")

with open(labels_dict_path, "r") as f:
    labels_text = f.read()

try:
    # Original path: turn single quotes into double quotes and parse as JSON.
    labels_dict = json.loads(labels_text.replace("'", '"'))
except json.JSONDecodeError:
    # The blanket quote swap corrupts the text whenever a label itself contains
    # an apostrophe or a double quote. Fall back to parsing the untouched text
    # as a Python literal, which is how the sibling mg labels file is read.
    labels_dict = ast.literal_eval(labels_text)

# Entries of "Cell Type" that start with "CCI" are the cell-type contexts; each
# name (the text after "CCI_") is mapped to its position among those entries.
celltypes = [c for c in labels_dict["Cell Type"] if c.startswith("CCI")]
celltype_dict = {ct.split("CCI_")[1]: i for i, ct in enumerate(celltypes)}

In [None]:
# Keep only the (cell type, name) pairs whose cell-type label does not start
# with one of these prefixes; the remaining rows are treated as proteins.
excluded_prefixes = ("BTO", "CCI", "Sanity")

protein_names = []
protein_celltypes = []
for celltype_label, protein_name in zip(labels_dict["Cell Type"], labels_dict["Name"]):
    if not celltype_label.startswith(excluded_prefixes):
        protein_names.append(protein_name)
        protein_celltypes.append(celltype_label)

# Group the protein names by cell type: {cell type: [protein names]}.
proteins = pd.DataFrame.from_dict({"target": protein_names, "cell type": protein_celltypes})
celltype_protein_dict = (
    proteins
    .pivot_table(values="target", index="cell type", aggfunc={"target": list})
    .to_dict()["target"]
)

In [None]:
# Persist both cell-type lookups as pickles under the data folder.
celltype_dict_path = os.path.join(data_folder, "celltype_dict.pkl")
celltype_protein_dict_path = os.path.join(data_folder, "celltype_protein_dict.pkl")

for pickle_path, payload in (
    (celltype_dict_path, celltype_dict),
    (celltype_protein_dict_path, celltype_protein_dict),
):
    with open(pickle_path, "wb") as pickle_out:
        pickle.dump(payload, pickle_out)


In [None]:
# Read the meta-graph labels file and parse its contents as a Python literal.
mg_labels_dict_path = os.path.join(data_folder, "pinnacle_embeds/pinnacle_mg_labels_dict.txt")
with open(mg_labels_dict_path, "r") as labels_file:
    mg_labels_text = labels_file.read()
mg_labels_dict = ast.literal_eval(mg_labels_text)


In [None]:
# Build {cell-context name: embedding} from the first NUM_CELL_CONTEXTS
# entries of the meta-graph labels/embeddings and save it as a pickle.
# NOTE(review): `mg_embeds` is not defined in any cell visible here -- confirm
# where it is loaded, otherwise this cell fails on a fresh kernel.

#156 is the number of different cell-contexts
NUM_CELL_CONTEXTS = 156

mg_labels = mg_labels_dict["Cell Type"]
cell_embeds = {}
for i in range(NUM_CELL_CONTEXTS):
    # Split only on the first underscore so everything after the prefix is
    # kept. A plain split("_")[1] would truncate a name that itself contains
    # an underscore, unlike celltype_dict, which keeps all text after "CCI_".
    label = mg_labels[i].split("_", 1)[1]
    embedding = mg_embeds[i]
    cell_embeds[label] = embedding

cell_embeds_path = os.path.join(data_folder, "cell_embeds.pkl")
with open(cell_embeds_path, "wb") as saved_file:
    pickle.dump(cell_embeds, saved_file)
