<a href="https://colab.research.google.com/github/SVJLucas/GraphMining/blob/main/GNN/GNN_pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install LangChain and its OpenAI integration; %%capture hides pip's output for this cell.
!pip install langchain
!pip install langchain-openai

In [None]:
!pip uninstall typing_extensions
!pip install typing_extensions

Found existing installation: typing_extensions 4.10.0
Uninstalling typing_extensions-4.10.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/typing_extensions-4.10.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/typing_extensions.py
Proceed (Y/n)? 

In [None]:
import os
import re
import json
import heapq
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from tqdm import tqdm
from google.colab import userdata
from collections import defaultdict
from langchain_openai import ChatOpenAI

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used below live under this mount point.
drive.mount('/content/drive')

In [None]:
# Root data folder on the mounted Drive; raw tables and the processed_data sub-folder are read from here.
PATH = '/content/drive/MyDrive/Projetos/GNN-Gene-Disease/Data'

## Tables View

In [None]:
# Load the DOID disease table (tab-separated).
df_diseaseDOID = pd.read_csv(PATH + '/D-DoMiner_miner-diseaseDOID.tsv', sep='\t')
# Title-case the names so they can be joined by name with the other tables.
# `.str.title()` is the vectorised form of `x.title()` and keeps missing names
# as NaN instead of raising on a non-string value.
df_diseaseDOID['Name'] = df_diseaseDOID['Name'].str.title()

In [None]:
# Preview the first rows of the DOID disease table.
df_diseaseDOID.head()

In [None]:
# Load the MeSH disease table (tab-separated).
df_disease = pd.read_csv(PATH + '/D-MeshMiner_miner-disease.tsv', sep='\t')
# Title-case the names with the NaN-safe string accessor (a missing name stays NaN
# instead of raising, as a per-element `x.title()` call would).
df_disease['Name'] = df_disease['Name'].str.title()
df_disease.head()

In [None]:
# Load the disease-class table (comma-separated).
df_disease_class = pd.read_csv(PATH + '/D-DoPathways_diseaseclasses.csv')
# Title-case the names with the NaN-safe string accessor (a missing name stays NaN
# instead of raising, as a per-element `x.title()` call would).
df_disease_class['Disease Name'] = df_disease_class['Disease Name'].str.title()
df_disease_class.head()

In [None]:
# Load the HUGO gene table (tab-separated) and keep only the columns used as gene features.
df_geneHUGO = pd.read_csv(PATH + '/G-SynMiner_miner-geneHUGO.tsv', sep='\t')
df_geneHUGO = df_geneHUGO[['entrez_id', 'hgnc_id', 'name', 'locus_group', 'locus_type', 'location', 'gene_family', 'gene_family_id']]
# (A bare `df_geneHUGO['entrez_id']` expression used to sit here; it was not the
# last line of the cell, so its value was never shown or stored.)
df_geneHUGO.head()

In [None]:
# Load the disease-gene association table (tab-separated).
df_disease_gene = pd.read_csv(PATH + '/DG-AssocMiner_miner-disease-gene.tsv', sep='\t')
# Title-case the names with the NaN-safe string accessor (a missing name stays NaN
# instead of raising, as a per-element `x.title()` call would).
df_disease_gene['Disease Name'] = df_disease_gene['Disease Name'].str.title()
df_disease_gene.head()

## Disease Feature Table


In [None]:
# Unique (disease id, disease name) pairs taken from the disease-gene association table.
df_disease_ = df_disease_gene[['# Disease ID', 'Disease Name']].drop_duplicates()

# Rename '# Disease ID' to 'Disease ID' so it matches the key used in the merge below.
df_disease_.columns=['Disease ID', 'Disease Name']

# Merge with disease class
# Left join: every disease is kept, even when the class table has no entry for it.
df_disease_ = df_disease_.merge(df_disease_class, on='Disease ID', how='left')
# Both frames carry 'Disease Name', so the merge suffixes them _x/_y; the class table's copy is dropped.
df_disease_ = df_disease_.drop(columns=['Disease Name_y'])

# Positional rename — assumes the three remaining columns are id, name, class in that order (TODO confirm).
df_disease_.columns=['Disease ID', 'Disease Name', 'Disease Class']

df_disease_

In [None]:
# Merge with miner disease
# Left join on the title-cased disease name: every row of df_disease_ is kept and
# rows without a name match get NaN in the columns coming from df_disease.
df_disease_2 = df_disease_.merge(df_disease, left_on='Disease Name', right_on = 'Name', how='left')

df_disease_2

In [None]:
# Merge with DOID disease
# Left join on the title-cased disease name. Both sides now carry a 'Name' column,
# so the result holds them as 'Name_x' / 'Name_y'.
df_disease_3 = df_disease_2.merge(df_diseaseDOID, left_on='Disease Name', right_on = 'Name', how='left')

df_disease_3

In [None]:
# Count the missing values per column to see how many diseases found no match in the merges above.
nan_sum = df_disease_3.isna().sum()
print(nan_sum)

In [None]:
# Display the merged table without the helper columns.
# NOTE: the result is not assigned, so df_disease_3 itself still contains these columns.
df_disease_3.drop(columns=['Name_x', 'Name_y', '# MESH_ID'])

## Match Test (not used)

In [None]:
# Keep only the DOID id, synonym and name columns.
# `.copy()` makes this an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
df_names_doid = df_diseaseDOID[['# Disease(DOID)', 'Synonym', 'Name']].copy()


def _join_names(synonym, name):
  """Return 'synonym, name', leaving out whichever part is missing ('' if both are)."""
  # pd.isna is used instead of `is np.nan`: an identity check only recognises the
  # np.nan singleton itself, not every NaN / None value a frame may hold.
  return ', '.join(part for part in (synonym, name) if not pd.isna(part))


# One combined "synonyms, name" string per DOID row, used as the text to match against.
names_syn_doid = [
  _join_names(synonym, name)
  for synonym, name in zip(df_names_doid['Synonym'], df_names_doid['Name'])
]

df_names_doid['Complete_Name'] = names_syn_doid


In [None]:
def preprocess_text(text):
    """Normalise ``text`` for word-level comparison.

    The text is lowercased, every character outside ``a-z`` becomes a space,
    and runs of whitespace are collapsed to a single space with the ends
    trimmed.
    """
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

def match_score(a, b):
  """Fraction of the words of ``a`` that also occur in ``b``.

  Both strings are normalised with ``preprocess_text`` first. A word repeated
  in ``a`` is counted every time it appears. Returns 0 when ``a`` contains no
  words.
  """
  words_a = preprocess_text(a).split()
  vocabulary_b = set(preprocess_text(b).split())

  if not words_a:
    return 0

  hits = sum(1 for word in words_a if word in vocabulary_b)
  return hits / len(words_a)

# Example usage
a = "Salivary Gland Neoplasms"
b = "Cancer of Salivary Gland|Cancer of the Salivary Gland|Cancer, Salivary Gland|Cancers, Salivary Gland|Gland Neoplasm, Salivary|Gland Neoplasms, Salivary|Neoplasm, Salivary Gland|Neoplasms, Salivary Gland|Salivary Gland Cancer|Salivary Gland Cancers|Salivary Gland Neoplasm"
# The synonym list above is immediately overridden by an unrelated string, so the
# score printed below is the one for a candidate that shares no word with `a`.
b = 'uaghaghaghagha'
score = match_score(a, b)
print(score)

0.0


In [None]:
# Unique (id, name) pairs of the diseases that need a DOID counterpart.
df_disease_correct_names = df_disease_gene[['# Disease ID', 'Disease Name']].drop_duplicates()
best_matches = {}

# The candidate (complete name, DOID) pairs are the same for every disease, so
# they are built once instead of re-zipping both columns on each iteration.
doid_candidates = list(zip(df_names_doid['Complete_Name'], df_names_doid['# Disease(DOID)']))

# Only the name is needed in the loop (the id was unused and shadowed the builtin `id`).
for name in tqdm(df_disease_correct_names['Disease Name']):
  # (score, candidate name, DOID) tuples compare by score first, then name, then DOID.
  score_name = [(match_score(name, complete_name), complete_name, doid) for complete_name, doid in doid_candidates]
  # heapq.nlargest accepts any iterable, so no heapify() call is needed beforehand.
  best_matches[name] = heapq.nlargest(5, score_name)


519it [01:27,  5.95it/s]


In [None]:
name = 'Amnesia'

# Normalised names of the top-ranked candidates for this disease, followed by an
# explicit opt-out choice, one option per line.
candidate_names = [preprocess_text(match[1]) for match in best_matches[name]]
options = '\n'.join(candidate_names + ['None of the before'])

# Assemble the question sent to the model: key disease, the options, then the request.
prompt_text = (
    "You are a specialist in medical diseases. I need you to find the correspondent disease between the following list. The key disease is "
    + name
    + "\nand your options are the following: "
    + options
    + "\nWhat is the correspondent option? justify"
)

prompt_text

'You are a specialist in medical diseases. I need you to find the correspondent disease between the following list. The key disease is Amnesia\nand your options are the following: psychogenic amnesia exact csp dissociative amnesia\ntransient global amnesia\nretrograde amnesia\nanterograde amnesia\nzaspopathy exact myofibrillar myopathy\nNone of the before\nWhat is the correspondent option? justify'

## Load and Adjust Gene Loc

In [None]:
# Folder holding the earlier version of the processed tables.
processed_data_old = PATH + '/processed_data/Old'

# Load the previously processed gene table; its 'location' column is split into parts in the next cell.
df_gene_pp = pd.read_csv(processed_data_old + '/genes_table.csv')
df_gene_pp.head()

In [None]:
# Split the 'location' column
df_gene_pp['Start Chromossome'] = df_gene_pp['location'].str.extract(r'(\d+)')
df_gene_pp['Start Chromossome Arm'] = df_gene_pp['location'].str.extract(r'\d+([pq])')
df_gene_pp['Start Chromossome Loc'] = df_gene_pp['location'].str.extract(r'[pq](\d+)')
df_gene_pp['Start Chromossome SubLoc'] = df_gene_pp['location'].str.extract(r'\.(\d+)')
df_gene_pp['End Chromossome Arm'] = df_gene_pp['location'].str.extract(r'-([pq])')
df_gene_pp['End Chromossome Loc'] = df_gene_pp['location'].str.extract(r'-(?:p|q)?(\d+)\.\d+')
df_gene_pp['End Chromossome SubLoc'] = df_gene_pp['location'].str.extract(r'-(?:p|q)?\d+\.(\d+)')
df_gene_pp

In [None]:
# Save the gene table together with the parsed location columns.
# The destination is built from PATH (it resolves to the same file as the previously
# hard-coded absolute path) so the data root is defined in one place only.
df_gene_pp.to_csv(PATH + '/processed_data/genes_table_final.csv')

## Add Disease Features with an LLM

In [None]:
# Folder holding the processed tables.
processed_data = PATH + '/processed_data'

# Load the processed disease table; its 'Disease Name' and 'Definitions' columns feed the LLM prompt below.
df_disease_pp = pd.read_csv(processed_data + '/diseases_table.csv')
df_disease_pp.head()

Unnamed: 0,Disease ID,Disease Name,Disease Class,Definitions
0,C0043459,Zellweger Syndrome,inherited metabolic disorder,An autosomal recessive disorder due to defects...
1,C0033860,Psoriasis,integumentary system disease,"A common genetically determined, chronic, infl..."
2,C0027726,Nephrotic Syndrome,urinary system disease,A condition characterized by severe PROTEINURI...
3,C0236969,Substance-Related Disorders,substance-related disorder,Disorders related to substance abuse.
4,C0002878,"Anemia, Hemolytic",immune system disease,A condition of inadequate circulating red bloo...


In [None]:
# Read the OpenAI API key through google.colab's userdata instead of hard-coding it in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [None]:
# Chat model client used to generate the extra disease features (gpt-3.5-turbo).
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY,model='gpt-3.5-turbo')

In [None]:
# Ask the model for four structured features per disease, one parsed JSON object per row.
response_list = []
for disease, definition in zip (df_disease_pp['Disease Name'], df_disease_pp['Definitions']):
  prompt = f'''
  The {disease} has the following definition: {definition}
  Based on your available knowledge and in the definition provided, give me information about {disease} only in JSON format on:
                {{
                  "main_symptom": "",
                  "risk_factors": "",
                  "disease_class": "",
                  "main_system_affected": ""
                }}
  Go straight to the point: only list the important terms and don't talk too much
  '''
  # Query the model for this disease. This call was commented out, which left
  # `response` undefined on a fresh kernel (or stale from an earlier run).
  response = llm.invoke(prompt).content
  try:
    data = json.loads(response)
  except json.JSONDecodeError as err:
    # Keep exactly one entry per disease so the rows still line up with
    # df_disease_pp when the two tables are concatenated column-wise.
    print(f'Could not parse the response for {disease!r} as JSON: {err}')
    data = {}
  print(data)
  response_list.append(data)

{'main_symptom': 'dysmorphic skull, muscle hypotonia, sensorineural hearing loss, visual compromise, seizures', 'risk_factors': 'autosomal recessive inheritance', 'disease_class': 'autosomal recessive disorder', 'main_system_affected': 'peroxisome biogenesis, kidneys, liver'}
{'main_symptom': 'rounded erythematous, dry, scaling patches', 'risk_factors': 'genetic factors', 'disease_class': 'chronic, inflammatory skin disease', 'main_system_affected': 'skin'}
{'main_symptom': 'Severe proteinuria', 'risk_factors': 'Chronic kidney dysfunction', 'disease_class': 'Nephrotic syndrome', 'main_system_affected': 'Renal system'}
{'main_symptom': 'Substance abuse', 'risk_factors': 'Genetic predisposition, family history of substance abuse, trauma, mental health disorders', 'disease_class': 'Mental health disorders', 'main_system_affected': 'Central nervous system'}
{'main_symptom': 'inadequate circulating red blood cells or insufficient hemoglobin', 'risk_factors': 'premature destruction of red bl

In [None]:
# One row per model response; the JSON keys become columns and get display-style titles.
json_df = pd.DataFrame(response_list).rename(columns={
    'main_symptom': 'Main Symptom',
    'risk_factors': 'Risk Factors',
    'disease_class': 'Disease Class GPT',
    'main_system_affected': 'Main System Affected'
})
json_df

Unnamed: 0,Main Symptom,Risk Factors,Disease Class GPT,Main System Affected
0,"dysmorphic skull, muscle hypotonia, sensorineu...",autosomal recessive inheritance,autosomal recessive disorder,"peroxisome biogenesis, kidneys, liver"
1,"rounded erythematous, dry, scaling patches",genetic factors,"chronic, inflammatory skin disease",skin
2,Severe proteinuria,Chronic kidney dysfunction,Nephrotic syndrome,Renal system
3,Substance abuse,"Genetic predisposition, family history of subs...",Mental health disorders,Central nervous system
4,inadequate circulating red blood cells or insu...,premature destruction of red blood cells,Anemia,Blood
...,...,...,...,...
202,Abnormal vaginal bleeding,"Obesity, diabetes, hormone therapy, family his...",Tumors or cancer of the endometrium,Reproductive system
203,Pelvic pain,"Family history of endometriosis, early onset o...",Gynecological disorder,Reproductive system
204,Difficulty swallowing,"Tobacco and alcohol use, obesity, GERD, Barret...",Cancer,Digestive system
205,Esophageal mucosal eosinophilia,IgE-mediated hypersensitivity to food or inhal...,Chronic esophagitis,Gastrointestinal system


In [None]:
# Append the LLM-generated columns to the disease table.
# axis=1 aligns the two frames on their index — assumes json_df has one row per
# disease, in the same order as df_disease_pp (TODO confirm the lengths match).
df_disease_final = pd.concat([df_disease_pp, json_df], axis=1)
df_disease_final.head()

Unnamed: 0,Disease ID,Disease Name,Disease Class,Definitions,Main Symptom,Risk Factors,Disease Class GPT,Main System Affected
0,C0043459,Zellweger Syndrome,inherited metabolic disorder,An autosomal recessive disorder due to defects...,"dysmorphic skull, muscle hypotonia, sensorineu...",autosomal recessive inheritance,autosomal recessive disorder,"peroxisome biogenesis, kidneys, liver"
1,C0033860,Psoriasis,integumentary system disease,"A common genetically determined, chronic, infl...","rounded erythematous, dry, scaling patches",genetic factors,"chronic, inflammatory skin disease",skin
2,C0027726,Nephrotic Syndrome,urinary system disease,A condition characterized by severe PROTEINURI...,Severe proteinuria,Chronic kidney dysfunction,Nephrotic syndrome,Renal system
3,C0236969,Substance-Related Disorders,substance-related disorder,Disorders related to substance abuse.,Substance abuse,"Genetic predisposition, family history of subs...",Mental health disorders,Central nervous system
4,C0002878,"Anemia, Hemolytic",immune system disease,A condition of inadequate circulating red bloo...,inadequate circulating red blood cells or insu...,premature destruction of red blood cells,Anemia,Blood


In [None]:
# Save the disease table together with the LLM-generated columns.
# The destination is built from `processed_data` (it resolves to the same file as the
# previously hard-coded absolute path) so the data root is defined in one place only.
df_disease_final.to_csv(processed_data + '/diseases_table_final.csv')

## Graph

In [None]:
# Build a graph whose edges are the first 100 disease-gene associations.
G = nx.from_pandas_edgelist(df_disease_gene[:100], '# Disease ID', 'Gene ID')

# Gene IDs are assumed to be integers: integer nodes are drawn blue (genes),
# every other node red (diseases).
node_colors = ['blue' if isinstance(node, int) else 'red' for node in G.nodes()]

# Draw without labels, using small markers.
plt.figure(figsize=(10, 6))
nx.draw(G, node_color=node_colors, node_size=1, edge_color='gray', with_labels=False)
plt.title('Disease-Gene Relations Graph')
plt.show()

In [None]:
# Build a graph whose edges are the first 2000 disease-gene associations.
G = nx.from_pandas_edgelist(df_disease_gene[:2000], '# Disease ID', 'Gene ID')

# Gene IDs are assumed to be integers: integer nodes are drawn blue (genes),
# every other node red (diseases).
node_colors = ['blue' if isinstance(node, int) else 'red' for node in G.nodes()]

# Draw without labels, using small markers.
plt.figure(figsize=(10, 6))
nx.draw(G, node_color=node_colors, node_size=1, edge_color='gray', with_labels=False)
plt.title('Disease-Gene Relations Graph')
plt.show()