# Generative AI for Drug Repurposing in Anaplastic Thyroid Cancer (ATC)
This notebook generates scientific hypotheses for drug repurposing in ATC using open-source LLMs like BioGPT.

In [None]:
# Install necessary libraries
!pip install transformers
!pip install torch
!pip install requests


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m897.5/897.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForMaskedLM
import requests
import pandas as pd
import re

model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
unmasker = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# List of repurposed drugs for ATC
drugs = {
    "Auranofin": ["TXNRD1", "NF-Œ∫B", "ROS"],
    "Sorafenib": ["CRAF", "BRAF", "VEGFR-2/-3", "PDGFR-Œ≤", "c-Kit", "RET"],
    "Lenvatinib": ["VEGFR1-3", "FGFR1-4", "PDGFRŒ±", "RET", "KIT"],
    "Cabozantinib": ["MET", "VEGFR2", "RET", "AXL", "KIT"],
    "Vandetanib": ["VEGFR", "EGFR", "RET"],
    "Everolimus": ["mTORC1"],
    "Nivolumab": ["PD-1 "],
    "Doxorubicin": ["DNA", "topoisomerase II", "ROS "],
    "Epirubicin" : ["DNA", "topoisomerase II", "ROS"],
    "Pazopanib" : ["VEGFR", "PDGFR", "c-Kit"]
}


In [None]:
def clean_prediction(word):
    # Keep terms with at least 2 characters
    return len(word) >= 2

results = {}

In [None]:
biomedical_verbs = {
    "suppress", "inhibit", "block", "reduce", "impair", "slow", "control",
    "reverse", "downregulate", "prevent", "regress", "stop", "modulate", "restrict"
}

results = {}
# The original code used drug_targets, which was not defined.
# This line is changed to use 'drugs' instead, which is the dictionary containing the drug and target information.
for drug, targets in drugs.items():
    results[drug] = {}
    for target in targets:
        prompt = f"{drug} may inhibit {target}, which can {tokenizer.mask_token} tumor progression in anaplastic thyroid cancer."
        predictions = unmasker(prompt)
        filtered_preds = [p['token_str'] for p in predictions if p['token_str'].lower() in biomedical_verbs]
        results[drug][target] = filtered_preds[:3] if filtered_preds else ["No biomedical match"]

In [None]:
for drug, targets in drugs.items(): # Changed drug_targets to drugs
    results[drug] = {}
    for target in targets:
        prompt = f"{drug} may inhibit {target}, which can [MASK] tumor progression in anaplastic thyroid cancer."
        predictions = unmasker(prompt)
        clean_preds = [p['token_str'] for p in predictions if clean_prediction(p['token_str'])]
        results[drug][target] = clean_preds[:3]  # Take top 3 cleaned predictions

In [None]:
for drug, targets in drugs.items():
    print(f"\nüî¨ {drug}")
    for target in targets:
        print(f"  ‚û§ {target}: {results.get(drug, {}).get(target)}")


üî¨ Auranofin
  ‚û§ TXNRD1: ['##rophe', '##real', '##rop']
  ‚û§ NF-Œ∫B: ['##rophe', '##rome', '##real']
  ‚û§ ROS: ['##rophe', '##real', '##rop']

üî¨ Sorafenib
  ‚û§ CRAF: ['##rophe', '##rop', '##real']
  ‚û§ BRAF: ['##rophe', '##rop', '##real']
  ‚û§ VEGFR-2/-3: ['##rophe', '##rop', '##real']
  ‚û§ PDGFR-Œ≤: ['##rophe', '##real', '##rop']
  ‚û§ c-Kit: ['##rophe', '##real', '##rop']
  ‚û§ RET: ['##real', '##rophe', '##rome']

üî¨ Lenvatinib
  ‚û§ VEGFR1-3: ['##rophe', '##real', '##rop']
  ‚û§ FGFR1-4: ['##rophe', '##real', '##rop']
  ‚û§ PDGFRŒ±: ['##rophe', '##real', '##plied']
  ‚û§ RET: ['##rophe', '##real', '##rome']
  ‚û§ KIT: ['##rophe', '##real', '##rome']

üî¨ Cabozantinib
  ‚û§ MET: ['##rophe', '##rome', '##real']
  ‚û§ VEGFR2: ['##rophe', '##rop', '##real']
  ‚û§ RET: ['##rophe', '##real', '##rome']
  ‚û§ AXL: ['##rophe', '##rop', '##real']
  ‚û§ KIT: ['##rophe', '##rome', '##real']

üî¨ Vandetanib
  ‚û§ VEGFR: ['##rophe', '##rop', '##real']
  ‚û§ EGFR: ['##rophe', '#

In [None]:
import pprint
pprint.pprint(results)

{'Auranofin': {'NF-Œ∫B': ['##rophe', '##rome', '##real'],
               'ROS': ['##rophe', '##real', '##rop'],
               'TXNRD1': ['##rophe', '##real', '##rop']},
 'Cabozantinib': {'AXL': ['##rophe', '##rop', '##real'],
                  'KIT': ['##rophe', '##rome', '##real'],
                  'MET': ['##rophe', '##rome', '##real'],
                  'RET': ['##rophe', '##real', '##rome'],
                  'VEGFR2': ['##rophe', '##rop', '##real']},
 'Doxorubicin': {'DNA': ['##rop', '##rophe', '##real'],
                 'ROS ': ['##rop', '##rophe', '##real'],
                 'topoisomerase II': ['##rop', '##rophe', '##real']},
 'Epirubicin': {'DNA': ['##rophe', '##real', '##rop'],
                'ROS': ['##rophe', '##rop', '##real'],
                'topoisomerase II': ['##rop', '##rophe', '##real']},
 'Everolimus': {'mTORC1': ['##rophe', '##real', '##rop']},
 'Lenvatinib': {'FGFR1-4': ['##rophe', '##real', '##rop'],
                'KIT': ['##rophe', '##real', '##rome'],
  

## Next Step: Literature Validation
Use PubMed or Semantic Scholar APIs to validate each hypothesis against recent literature.

In [None]:
# Install Biopython for PubMed access
!pip install biopython




In [None]:
from Bio import Entrez
Entrez.email = "nida.amir0083@gmail.com"  # Replace with your email for NCBI Entrez

def search_pubmed(query, max_results=5):
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"]

def fetch_abstracts(id_list):
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
    results = handle.read()
    handle.close()
    return results


In [None]:
# Validate hypotheses with PubMed search
for drug, targets in results.items():
    print(f"\nValidating hypotheses for {drug}:")
    for target, hypotheses_list in targets.items(): # Renamed variable to hypotheses_list
        print(f"\nHypotheses for {target}: {hypotheses_list}") # Updated print statement
        for hypothesis in hypotheses_list: # Iterate through each hypothesis in the list
            search_results = search_pubmed(hypothesis)
            if search_results:
                abstracts = fetch_abstracts(search_results)
                print("Matching PubMed abstracts:")
                print(abstracts[:1000])  # Print first 1000 characters
            else:
                print(f"No matches found on PubMed for hypothesis: {hypothesis}") # Updated print statement


Validating hypotheses for Auranofin:

Hypotheses for TXNRD1: ['##rophe', '##real', '##rop']
Matching PubMed abstracts:
1. Neuroimage Clin. 2020;27:102282. doi: 10.1016/j.nicl.2020.102282. Epub 2020
May  26.

Dynamic association between AT(N) profile and cognition mediated by cortical 
thickness in Alzheimer's continuum.

Jang JW(1), Kim Y(2), Kim S(3), Park SW(4), Kwon SO(5), Park YH(6), Lim JS(7), 
Youn YC(8), Hun Kim S(9), Kim S(10); Alzheimer's Disease Neuroimaging 
Initiative.

Author information:
(1)Department of Neurology, Kangwon National University Hospital, Chuncheon, 
Republic of Korea; Kangwon National University School of Medicine, Chuncheon, 
Republic of Korea. Electronic address: light26@kangwon.ac.kr.
(2)Department of Neurology, Kangwon National University Hospital, Chuncheon, 
Republic of Korea; Kangwon National University School of Medicine, Chuncheon, 
Republic of Korea. Electronic address: yeshins@gmail.com.
(3)Department of Neurology, Kangwon National University Ho