In [5]:
import requests
import pandas as pd
import json

# Europe PMC API: Search for recent anti-obesity articles
def fetch_epmc_articles(query, from_year=2024, to_year=2025, max_results=1000):
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {
        'query': f'{query} AND PUB_YEAR:[{from_year} TO {to_year}]',
        'format': 'json',
        'pageSize': max_results,
        'resultType': 'core'
    }
    response = requests.get(url, params=params)
    if response.ok:
        results = response.json()
        articles = results.get('resultList', {}).get('result', [])
        #for art in articles[:5]:
        #    print("Article keys:", list(art.keys()))
        # Extract titles and abstracts
        data = [{ 
                'title': art.get('title', ''), 
                'abstract': art.get('abstractText', art.get('abstract', '')), 
                'pubYear': art.get('pubYear', '')
            } for art in articles]
        return pd.DataFrame(data)
    else:
        print("Europe PMC request failed:", response.status_code)
        return pd.DataFrame()

# Open Targets Platform API: Get latest targets for obesity
def fetch_opentargets_targets(disease_efo="EFO_0001073", size=50):
    # EFO_0001073 is the code for "obesity"
    url = "https://api.platform.opentargets.org/api/v4/graphql"
    query = """
    query TargetSearch($efoId: String!, $size: Int!) {
      disease(efoId:$efoId){
          id
          name
          associatedTargets(page: {size: $size, index: 0}){
              count
              rows{
                  target{
                      id
                      approvedSymbol
                  }
                  score
                  datasourceScores{
                      id
                      score
                  }
              }
          }
      }
    }  
    """
    variables = {"efoId": disease_efo, "size": size}
    response = requests.post(url, json={"query": query, "variables": variables})
    print(response.status_code)
    print(response.text)

    if response.ok:
        results = response.json()
        data = []
        for item in results['data']['disease']['associatedTargets']['rows']:
            gene = item['target']['approvedSymbol']
            score = item['score']
            data.append({'target': gene, 'score': score})
        return pd.DataFrame(data)
    else:
        print("Open Targets request failed:", response.status_code)
        return pd.DataFrame()

# Example usage:
# Fetch literature
df_articles = fetch_epmc_articles("obesity target", 2023, 2025)
print(df_articles.head())

# Fetch targets
df_targets = fetch_opentargets_targets()
print(df_targets.head())


                                               title  \
0  Obesity promotes ARDS by modulating ceramide t...   
1  Adipose Tissue as a Target for Precision Medic...   
2  A blueprint of synergistic effect in Crataegus...   
3  Nanomaterials for the treatment and monitoring...   
4  Reflection of therapy progress in virtual real...   

                                            abstract pubYear  
0  Obesity is an independent risk factor for acut...    2025  
1  Following the trends of the adult obesity epid...    2025  
2  <h4>Background</h4>Current obesity treatments ...    2025  
3  Obesity represents a growing global health cri...    2025  
4  <h4>Purpose</h4>Obesity is a major health chal...    2025  
200
{"data":{"disease":{"id":"EFO_0001073","name":"obesity","associatedTargets":{"count":2976,"rows":[{"target":{"id":"ENSG00000166603","approvedSymbol":"MC4R"},"score":0.831734824240775,"datasourceScores":[{"id":"eva","score":0.9405285610759455},{"id":"gene_burden","score":0.93154469

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

def chunk_text(text, tokenizer, max_length=510):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        chunk = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk)
    return chunks

# Step 1: Load BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_genetic_ner")
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Step 2: Run NER on your abstracts/titles
def extract_targets_biobert(text_list, top_n=20):
    entity_freq = {}
    for text in text_list:
        token_count = len(tokenizer.tokenize(text))
        if token_count > 510:
            text_chunks = chunk_text(text, tokenizer)
        else:
            text_chunks = [text]
        for chunk in text_chunks:
            ner_results = nlp_ner(chunk)
            for entity in ner_results:
                # Check your model's field name! If it uses entity['entity'], use that
                ent_grp = entity.get('entity_group', entity.get('entity', ''))
                if ent_grp in ['GENE', 'PROTEIN','GENETIC']:
                    ent = entity['word']
                    entity_freq[ent] = entity_freq.get(ent, 0) + 1
    sorted_entities = sorted(entity_freq.items(), key=lambda x: x[1], reverse=True)
    return sorted_entities[:top_n]

# Example usage:
top_targets = extract_targets_biobert(df_articles["abstract"].dropna().tolist())
print(top_targets)
df_articles['targets'] = str(top_targets)
df_articles.to_csv("output.csv", index=False)


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('insulin', 378), ('leptin', 93), ('glp - 1', 57), ('ampk', 54), ('g', 47), ('fto', 46), ('h', 45), ('hba1c', 43), ('glp - 1 ras', 35), ('sirt1', 33), ('pparγ', 32), ('ucp1', 32), ('il - 6', 31), ('s', 30), ('mtor', 28), ('glucagon - like peptide - 1 receptor', 28), ('adiponectin', 28), ('glp - 1 receptor', 28), ('tnf - α', 28), ('akt', 26)]


In [None]:
# Attempt using BERN2 for extraction and normalization
#  References:
#  - BERN2:https://github.com/sudha-vijayakumar/BERN2_TigerGraph_BioMedical_KnowledgeGraph/blob/main/bioNLP%202/1_bioNLP_Data.ipynb
#  https://medium.com/@sudha.vijayakumar_74093/implementing-a-biomedical-knowledge-graph-using-bern2-and-tigergraph-56a5e670782a
# There are many other models available at https://huggingface.co/models?search=dmis-lab but chief among them is BioBERT. BERN2 also does normalization.
# and was created as a NER + normalization tool over BioBERT. BioBERT has independently been improved, so we will try that out separately as well.




In [8]:
def extract_entities_biobert(text):
    # Handles long texts by chunking
    token_count = len(tokenizer.tokenize(text))
    if token_count > 510:
        text_chunks = chunk_text(text, tokenizer)
    else:
        text_chunks = [text]
    entities = []
    for chunk in text_chunks:
        ner_results = nlp_ner(chunk)
        for entity in ner_results:
            ent_grp = entity.get('entity_group', entity.get('entity', ''))
            if ent_grp in ['GENE', 'PROTEIN', 'GENETIC']:
                entities.append(entity['word'])
    # Optionally, remove duplicates by converting to set: list(set(entities))  
    return entities

# Apply to dataframe
df_articles['targets'] = df_articles['abstract'].fillna('').apply(extract_entities_biobert)

# Save to CSV
df_articles.to_csv("output.csv", index=False)


In [2]:
!pip uninstall numpy
!pip install numpy==1.26.4 --only-binary=:all:
!pip install scikit-learn
!pip install gilda

^C


ERROR: Could not find a version that satisfies the requirement numpy==1.26.4 (from versions: 2.1.0, 2.1.1, 2.1.2, 2.1.3, 2.2.0, 2.2.1, 2.2.2, 2.2.3, 2.2.4, 2.2.5, 2.2.6, 2.3.0, 2.3.1, 2.3.2, 2.3.3)
ERROR: No matching distribution found for numpy==1.26.4


Collecting gilda
  Using cached gilda-1.4.1-py3-none-any.whl.metadata (9.2 kB)
Collecting boto3 (from gilda)
  Using cached boto3-1.40.44-py3-none-any.whl.metadata (6.7 kB)
Collecting adeft (from gilda)
  Using cached adeft-0.12.3.tar.gz (177 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting flask<4.0,>=3.0 (from gilda)
  Using cached flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting flask-restx>=1.3.0 (from gilda)
  Using cached flask_restx-1.3.2-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting pystow>=0.1.10 (from gilda)
  Using cached pystow-0.7.11-py3-none-any.whl.metadata (17 kB)
Collecting unidecode (from gilda)
  Using cached Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Colle

  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [9 lines of output]
      Collecting setuptools
        Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
      Collecting wheel
        Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
      Collecting Cython>=3.0.8
        Using cached cython-3.1.4-cp313-cp313-win_amd64.whl.metadata (5.1 kB)
      ERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11; 1.26.0 Requires-Python >=3.9,<3.13; 1.26.1 Requires-Python >=3.9,<3.13
      ERROR: Could not find a version that satisfies the requirement numpy==2.0.0rc1 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9

In [11]:
import pandas as pd
import gilda
import re

def extract_term_label(term_obj):
    term_str = repr(term_obj)
    m = re.match(r"Term\(([^,]+)", term_str)
    if m:
        return m.group(1).strip()
    else:
        return None

def normalize_entity_list(entity_list):
    normalized_names = []
    if not isinstance(entity_list, list):
        return normalized_names  # empty if input is not a list
    for term in entity_list:
        norm = gilda.ground(term)
        if norm:
            best = norm[0]
            n_label = extract_term_label(best.term)
            if n_label and n_label not in normalized_names and len(n_label) > 2:
                normalized_names.append(n_label)
    return normalized_names

# Add normalized_text column (list of normalized names for each abstract)
df_articles['normalized_text'] = df_articles['targets'].apply(normalize_entity_list)

# Save as CSV
df_articles.to_csv("output_with_normalized.csv", index=False)


In [26]:
def extract_targets_biobert(ent_list, top_n=20):
    entity_freq = {}
    for ent in ent_list:
        entity_freq[ent] = entity_freq.get(ent, 0) + 1
    sorted_entities = sorted(entity_freq.items(), key=lambda x: x[1], reverse=True)
    return sorted_entities[:top_n]

print(extract_targets_biobert(df_articles['normalized_text'].explode().dropna().tolist(), top_n=500))

[('insulin', 186), ('glucagon like peptide 1 receptor', 32), ('glp 1', 28), ('hba1c', 27), ('ampk', 26), ('leptin', 26), ('glp 1 receptor', 23), ('glucagon like peptide 1', 22), ('mtor', 19), ('pparγ', 19), ('ucp1', 19), ('akt', 18), ('pi3k', 15), ('uncoupling protein 1', 15), ('adiponectin', 14), ('sirt1', 12), ('gip', 12), ('incretin', 10), ('stat3', 9), ('amp activated protein kinase', 9), ('interleukin 6', 9), ('hba1', 8), ('renin', 8), ('protein kinase b', 7), ('nrf2', 7), ('pancreatic lipase', 7), ('ghrelin', 7), ('fgf21', 7), ('pparg', 6), ('fasn', 6), ('nlrp3', 6), ('g protein coupled receptors', 6), ('fto', 6), ('egfr', 6), ('low density lipoprotein', 6), ('mtorc1', 6), ('p38', 6), ('tnf', 5), ('fatty acid synthase', 5), ('tlr4', 5), ('esr1', 5), ('insulin like growth factor 1', 5), ('histone', 5), ('ppar', 5), ('glucagon receptor', 5), ('ldl', 5), ('glucagon', 5), ('c reactive protein', 5), ('angiotensin', 5), ('mc4r', 5), ('jnk', 5), ('gipr', 5), ('sglt2', 5), ('hemoglobin',

In [None]:
import re
from typing import List, Tuple, Dict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModelForSequenceClassification
import torch
import nltk

# 1) Sentence splitter
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# 2) BioBERT NER
ner_tok = AutoTokenizer.from_pretrained("dmis-lab/bern2-ner")
ner_model = AutoModelForTokenClassification.from_pretrained("dmis-lab/bern2-ner")
ner = pipeline("ner", model=ner_model, tokenizer=ner_tok, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)

# 3) Biomedical Relation Classifier (binary: relation vs no relation)
# Replace with a relation model you prefer; here we assume a generic biomed RE binary classifier
re_model_name = "dmis-lab/bern2-ner"  # example placeholder; choose a binary RE model for your use case
re_tok = AutoTokenizer.from_pretrained(re_model_name)
re_model = AutoModelForSequenceClassification.from_pretrained(re_model_name)
re_pipe = pipeline("text-classification", model=re_model, tokenizer=re_tok, return_all_scores=True, device=0 if torch.cuda.is_available() else -1)

# Helper: extract candidate targets (GENE/PROTEIN) from sentence
def extract_targets_from_sentence(sentence: str) -> List[str]:
    ents = ner(sentence)
    targets = []
    for e in ents:
        if e["entity_group"] in ["GENE", "PROTEIN"]:
            targets.append(e["word"])
    # deduplicate
    return list(dict.fromkeys([t.strip() for t in targets]))

# Build RE input: we’ll format as "[SUBJ] {target} [/SUBJ] ... [OBJ] obesity [/OBJ]"
def make_re_text(sentence: str, subj: str, obj: str = "obesity") -> str:
    # simple insertion: highlight mentions; if multiple occurrences, mark the first occurrence
    s = sentence
    # case-insensitive mark of subject
    pat_subj = re.compile(re.escape(subj), re.IGNORECASE)
    s = pat_subj.sub(f"[SUBJ]{subj}[/SUBJ]", s, count=1)
    # mark object "obesity"
    pat_obj = re.compile(r"\bobesity\b", re.IGNORECASE)
    s = pat_obj.sub("[OBJ]obesity[/OBJ]", s, count=1)
    return s

# Score relation per (target, sentence)
def score_relation(sentence: str, target: str) -> float:
    # require "obesity" to be present to attempt RE
    if not re.search(r"\bobesity\b", sentence, re.IGNORECASE):
        return 0.0
    re_text = make_re_text(sentence, target, "obesity")
    scores = re_pipe(re_text)[0]   # list of dicts with 'label' and 'score'
    # Map to a positive relation score; adjust labels depending on model
    # Example assumption: labels ['NEGATIVE','POSITIVE']
    label_scores = {d['label'].upper(): d['score'] for d in scores}
    pos_score = label_scores.get('POSITIVE', 0.0)
    return float(pos_score)

def extract_target_relations_from_abstract(abstract: str) -> List[Tuple[str, str, float]]:
    relations = []
    for sent in sent_tokenize(abstract):
        targets = extract_targets_from_sentence(sent)
        if not targets:
            continue
        if not re.search(r"\bobesity\b", sent, re.IGNORECASE):
            continue
        for tgt in targets:
            score = score_relation(sent, tgt)
            if score > 0:  # keep only positive evidence
                relations.append((tgt.upper(), sent, score))
    return relations

# Aggregate over your dataframe df_articles
def aggregate_relations(df_articles, min_score=0.5):
    evidence = []
    for idx, row in df_articles.dropna(subset=["abstract"]).iterrows():
        abs_text = row["abstract"]
        rels = extract_target_relations_from_abstract(abs_text)
        for tgt, sent, score in rels:
            if score >= min_score:
                evidence.append({"target": tgt, "sentence": sent, "score": score, "pubYear": row.get("pubYear", "")})
    return evidence

# Example
evidence = aggregate_relations(df_articles, min_score=0.5)
# Summarize by target
from collections import defaultdict
agg = defaultdict(lambda: {"count":0, "max_score":0.0})
for e in evidence:
    agg[e["target"]]["count"] += 1
    agg[e["target"]]["max_score"] = max(agg[e["target"]]["max_score"], e["score"])

top_targets_by_rel = sorted([(t, v["count"], v["max_score"]) for t, v in agg.items()], key=lambda x: (-x[1], -x[2]))
print(top_targets_by_rel[:20])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dm

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\aksha/nltk_data'
    - 'c:\\Users\\aksha\\anaconda3\\envs\\psp\\nltk_data'
    - 'c:\\Users\\aksha\\anaconda3\\envs\\psp\\share\\nltk_data'
    - 'c:\\Users\\aksha\\anaconda3\\envs\\psp\\lib\\nltk_data'
    - 'C:\\Users\\aksha\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [27]:
import streamlit as st
import pandas as pd

# Load harmonized extraction results
df_articles = pd.read_csv("output_with_normalized.csv")

# Collect all unique harmonized targets
all_extracted_targets = set()
for targets_str in df_articles['harmonized_text'].dropna():
    if isinstance(targets_str, str):
        targets = eval(targets_str)  # Only use eval on trusted data
    else:
        targets = targets_str
    all_extracted_targets.update(targets)
all_extracted_targets = sorted(list(all_extracted_targets))

# Define the known drug targets (could come from a curated list, e.g. UniProt, HGNC, etc.)
known_drug_targets = [
    "insulin receptor", "glucagon receptor", "GLP1R", "MTOR", "PPARγ", "TNF", "LEPR",
    # ...extend with your own list of drug targets
]

# --- APP UI ---
st.title("Gene/Protein Target Selection App")

st.subheader("Extracted Targets From Data")
st.write("These are all harmonized targets found in your dataset (read-only):")
st.write(all_extracted_targets)

st.subheader("Select Known Drug Creation Targets")
selected_targets = st.multiselect(
    "Choose drug targets for prioritization or further analysis:",
    known_drug_targets
)

st.write("### Your Selected Drug Targets")
if selected_targets:
    st.write(selected_targets)
else:
    st.write("No drug targets selected.")



ModuleNotFoundError: No module named 'streamlit'

In [2]:
!pip install mygene bioservices
import pandas as pd
import gilda
import mygene
from bioservices import UniProt

# Set up lookup services
mg = mygene.MyGeneInfo()
u = UniProt()

def lookup_gene_symbol(database, id_):
    """
    Given HGNC database and ID, return canonical gene symbol or name.
    """
    if database == "HGNC" and pd.notnull(id_):
        try:
            result = mg.getgene(id_, fields="symbol,name")
            if result is not None:
                # Prefer symbol, fallback to name
                return result.get("symbol") or result.get("name")
        except Exception as e:
            print(f"Gene lookup error for {id_}: {e}")
    return None

def lookup_protein_name(database, id_):
    """
    Given UniProt db (UP) and ID, return recommended protein name.
    """
    if database == "UP" and pd.notnull(id_):
        try:
            # The response is a tab-separated table: ID\tProtein names
            result = u.search(id_, frmt="tab", columns="id,protein names", limit=1)
            lines = result.strip().split("\n")
            if len(lines) > 1:
                # Second line is data row
                return lines[1].split("\t")[1]
        except Exception as e:
            print(f"UniProt lookup error for {id_}: {e}")
    return None

def harmonize_name(row):
    """
    Returns the harmonized entity name, using external services if needed.
    """
    db, id_ = row["database"], row["id"]
    # Gene harmonization
    if db == "HGNC":
        canonical = lookup_gene_symbol(db, id_)
        if canonical:
            return canonical
    # UniProt protein harmonization
    if db == "UP":
        canonical = lookup_protein_name(db, id_)
        if canonical:
            return canonical
    # FPLX, CHEBI, MESH, EFO fallbacks: use normalized_name if present
    if db in ["FPLX", "CHEBI", "MESH", "EFO"]:
        return row["normalized_name"] or row["official_name"] or row["original"]
    # Fallback: use whatever is available
    return row["official_name"] or row["normalized_name"] or row["original"]

# Your entity list (replace with your actual source)
# entities = [term for term, freq in top_targets]

results = []
official_names = {}

for term in entities:
    norm = gilda.ground(term)
    if norm:
        best = norm[0]
        entry_obj = getattr(best, 'entry', None)
        official_name = None
        if entry_obj is not None:
            official_name = getattr(entry_obj, "name", None) or getattr(entry_obj, "label", None)
        if not official_name and hasattr(entry_obj, "names"):
            official_name = entry_obj.names[0] if entry_obj.names else None
        if not official_name:
            official_name = str(entry_obj)  # fallback

        print(f"DEBUG: {term} => official_name: {official_name}; repr(entry): {repr(entry_obj)}")
        groundings = best.get_groundings()
        if groundings:
            db, id_ = next(iter(groundings))
        else:
            db, id_ = None, None

        # Try to extract official name from match entry
        official_name = getattr(best.entry, 'name', None) if hasattr(best, 'entry') else None

        results.append({
            'original': term,
            'normalized_name': best.term.name if hasattr(best.term, "name") else str(best.term),
            'database': db,
            'id': id_,
            'score': best.score,
            'official_name': official_name
        })

        key = (db, id_)
        if official_name and key not in official_names:
            official_names[key] = official_name

    else:
        results.append({
            'original': term,
            'normalized_name': None,
            'database': None,
            'id': None,
            'score': None,
            'official_name': None
        })

df_norm = pd.DataFrame(results)

# Harmonized names using external lookup or fallbacks
df_norm['harmonized_name'] = df_norm.apply(harmonize_name, axis=1)

print(df_norm)
df_norm.to_csv("normalized_targets_harmonized.csv", index=False)


Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting bioservices
  Downloading bioservices-1.12.1-py3-none-any.whl.metadata (19 kB)
Collecting biothings-client>=0.2.6 (from mygene)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Collecting beautifulsoup4<5.0.0,>=4.12.3 (from bioservices)
  Downloading beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting colorlog<7.0.0,>=6.9.0 (from bioservices)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting easydev<0.14.0,>=0.13.3 (from bioservices)
  Downloading easydev-0.13.3-py3-none-any.whl.metadata (4.0 kB)
Collecting grequests<0.8.0,>=0.7.0 (from bioservices)
  Downloading grequests-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting lxml<6.0.0,>=5.3.0 (from bioservices)
  Downloading lxml-5.4.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting matplotlib>=3.9 (from bioservices)
  Downloading matplotlib-3.10.6-cp311-cp311-win_amd64.w

NameError: name 'entities' is not defined

In [30]:
# Test on a sample abstract
sample_text = df_articles["abstract"].dropna().iloc[0]
ner_results = nlp_ner(sample_text)
print(ner_results)

# Print all unique entity groups found
unique_entities = set([entity['entity_group'] for entity in ner_results])
print("Unique entity types found:", unique_entities)


[{'entity_group': 'GENETIC', 'score': np.float32(0.9999895), 'word': 'pulmonary ceramide transfer protein', 'start': 719, 'end': 754}, {'entity_group': 'GENETIC', 'score': np.float32(0.99985635), 'word': 'cert', 'start': 756, 'end': 760}, {'entity_group': 'GENETIC', 'score': np.float32(0.99662757), 'word': 'cer', 'start': 832, 'end': 835}, {'entity_group': 'GENETIC', 'score': np.float32(0.9998863), 'word': 'cert', 'start': 907, 'end': 911}, {'entity_group': 'GENETIC', 'score': np.float32(0.999957), 'word': 'cert', 'start': 1005, 'end': 1009}, {'entity_group': 'GENETIC', 'score': np.float32(0.98633677), 'word': 'cer', 'start': 1040, 'end': 1043}, {'entity_group': 'GENETIC', 'score': np.float32(0.99900615), 'word': 'cer', 'start': 1074, 'end': 1077}, {'entity_group': 'GENETIC', 'score': np.float32(0.99995786), 'word': 'cert', 'start': 1243, 'end': 1247}, {'entity_group': 'GENETIC', 'score': np.float32(0.99996424), 'word': 'cert', 'start': 1379, 'end': 1383}, {'entity_group': 'GENETIC', '

In [17]:
print(model.config.id2label)


{0: 'O', 1: 'B-Activity', 2: 'B-Administration', 3: 'B-Age', 4: 'B-Area', 5: 'B-Biological_attribute', 6: 'B-Biological_structure', 7: 'B-Clinical_event', 8: 'B-Color', 9: 'B-Coreference', 10: 'B-Date', 11: 'B-Detailed_description', 12: 'B-Diagnostic_procedure', 13: 'B-Disease_disorder', 14: 'B-Distance', 15: 'B-Dosage', 16: 'B-Duration', 17: 'B-Family_history', 18: 'B-Frequency', 19: 'B-Height', 20: 'B-History', 21: 'B-Lab_value', 22: 'B-Mass', 23: 'B-Medication', 24: 'B-Non[biological](Detailed_description', 25: 'B-Nonbiological_location', 26: 'B-Occupation', 27: 'B-Other_entity', 28: 'B-Other_event', 29: 'B-Outcome', 30: 'B-Personal_[back](Biological_structure', 31: 'B-Personal_background', 32: 'B-Qualitative_concept', 33: 'B-Quantitative_concept', 34: 'B-Severity', 35: 'B-Sex', 36: 'B-Shape', 37: 'B-Sign_symptom', 38: 'B-Subject', 39: 'B-Texture', 40: 'B-Therapeutic_procedure', 41: 'B-Time', 42: 'B-Volume', 43: 'B-Weight', 44: 'I-Activity', 45: 'I-Administration', 46: 'I-Age', 47: 