In [3]:
from weaviate.util import generate_uuid5
from weaviate.classes.init import Auth
import weaviate
import json
import pandas as pd

client = weaviate.connect_to_weaviate_cloud(
    cluster_url="https://xxxxx.weaviate.cloud",  # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key("xxxx")
)



In [4]:
df = pd.read_csv("PubMed Multi Label Text Classification Dataset Processed.csv")

In [5]:
#Filter the dataset if needed
df = df[:1000]

In [6]:
import numpy as np
# Replace infinity values with NaN and then fill NaN values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna('', inplace=True)

# Convert columns to string type
df['Title'] = df['Title'].astype(str)
df['abstractText'] = df['abstractText'].astype(str)
df['meshMajor'] = df['meshMajor'].astype(str)

In [7]:
import urllib.parse
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal


# Function to create a valid URI
def create_valid_uri(base_uri, text):
    if pd.isna(text):
        return None
    # Encode text to be used in URI
    sanitized_text = urllib.parse.quote(text.strip().replace(' ', '_').replace('"', '').replace('<', '').replace('>', '').replace("'", "_"))
    return URIRef(f"{base_uri}/{sanitized_text}")


# Function to create a valid URI for Articles
import re
from urllib.parse import quote
def create_article_uri(title, base_namespace="http://example.org/article/"):
    if pd.isna(title):
        return None
    # Replace non-word characters with underscores
    sanitized_title = re.sub(r'\W+', '_', title.strip())
    # Condense multiple underscores into a single underscore
    sanitized_title = re.sub(r'_+', '_', sanitized_title)
    # URL-encode the term
    encoded_title = quote(sanitized_title)
    # Concatenate with base_namespace without adding underscores
    uri = f"{base_namespace}{encoded_title}"
    return URIRef(uri)

# Add a new column to the DataFrame for the article URIs
df['Article_URI'] = df['Title'].apply(lambda title: create_valid_uri("http://example.org/article", title))


In [8]:
# Function to clean and parse MeSH terms
def parse_mesh_terms(mesh_list):
    if pd.isna(mesh_list):
        return []
    return [
        term.strip().replace(' ', '_')
        for term in mesh_list.strip("[]'").split(',')
    ]

# Function to create a valid URI for MeSH terms
def create_valid_uri(base_uri, text):
    if pd.isna(text):
        return None
    sanitized_text = urllib.parse.quote(
        text.strip()
        .replace(' ', '_')
        .replace('"', '')
        .replace('<', '')
        .replace('>', '')
        .replace("'", "_")
    )
    return f"{base_uri}/{sanitized_text}"

# Extract and process all MeSH terms
#mdf = df.head(5000)
all_mesh_terms = []
for mesh_list in df["meshMajor"]:
    all_mesh_terms.extend(parse_mesh_terms(mesh_list))

# Deduplicate terms
unique_mesh_terms = list(set(all_mesh_terms))

# Create a DataFrame of MeSH terms and their URIs
mesh_df = pd.DataFrame({
    "meshTerm": unique_mesh_terms,
    "URI": [create_valid_uri("http://example.org/mesh", term) for term in unique_mesh_terms]
})

# Display the DataFrame
print(mesh_df)


                                    meshTerm  \
0                     'Staining_and_Labeling   
1                                Endosseous'   
2                                  Afferent'   
3                                  Hormonal'   
4                              'Arthrodesis'   
...                                      ...   
5303              'Polydeoxyribonucleotides'   
5304                   'Cerebral_Ventricles'   
5305  'Sodium-Potassium-Chloride_Symporters'   
5306                  'Lung_Transplantation'   
5307                 'Myocardial_Infarction'   

                                                    URI  
0        http://example.org/mesh/_Staining_and_Labeling  
1                   http://example.org/mesh/Endosseous_  
2                     http://example.org/mesh/Afferent_  
3                     http://example.org/mesh/Hormonal_  
4                 http://example.org/mesh/_Arthrodesis_  
...                                                 ...  
5303  http://exam

In [11]:
keywords = [
   "cancer", "tumor", "tumour", "neoplasm", "carcinoma", "sarcoma", "leukemia",
    "lymphoma", "melanoma", "adenoma", "metastasis", "malignancy", "oncology",
    "blastoma", "myeloma", "lymphosarcoma", "glioma", "astrocytoma", "retinoblastoma",
    "teratoma", "myosarcoma", "fibrosarcoma", "mesothelioma", "adenocarcinoma",
    "squamous cell", "basal cell", "glioblastoma", "neuroblastoma", "rhabdomyosarcoma",
    "osteosarcoma", "chondrosarcoma", "angiosarcoma", "hepatoma", "hepatocellular",
    "cholangiocarcinoma", "medulloblastoma", "ependymoma", "Wilms tumor", "Burkitt",
    "Kaposi", "myelodysplastic", "myeloproliferative", "plasmacytoma", "thymoma",
    "pheochromocytoma", "paraganglioma", "seminoma", "dysgerminoma", "yolk sac tumor",
    "choriocarcinoma", "germ cell tumor", "granulosa cell tumor", "serous carcinoma",
    "mucinous carcinoma", "clear cell carcinoma", "endometrioid carcinoma",
    "small cell carcinoma", "large cell carcinoma", "anaplastic", "in situ", "metastatic",
    "malignant", "benign tumor", "carcinosarcoma", "sarcomatoid", "Paget", "Ewing",
    "Langerhans cell histiocytosis", "Langerhans cell sarcoma", "desmoid tumor",
    "liposarcoma", "leiomyosarcoma", "hemangiosarcoma", "hemangiopericytoma",
    "schwannoma", "neurofibroma", "neurofibromatosis", "oligodendroglioma",
    "pineoblastoma", "pineocytoma", "pituitary adenoma", "craniopharyngioma",
    "chordoma", "ameloblastoma", "odontogenic tumor", "myxoma", "cystadenoma",
    "cystadenocarcinoma", "papilloma", "papillary carcinoma", "follicular carcinoma",
    "Hurthle cell carcinoma", "medullary carcinoma", "anaplastic carcinoma",
    "Merkel cell carcinoma", "cutaneous T-cell lymphoma", "cutaneous B-cell lymphoma",
    "mycosis fungoides", "Sezary syndrome", "lymphoproliferative disorder",
    "plasmacytoma", "multiple myeloma", "extramedullary myeloma",
    "lymphomatoid granulomatosis", "lymphomatoid papulosis", "Castleman disease",
    "Waldenstrom macroglobulinemia", "hairy cell leukemia", "chronic lymphocytic leukemia",
    "acute lymphoblastic leukemia", "acute myeloid leukemia", "chronic myeloid leukemia",
    "polycythemia vera", "essential thrombocythemia", "primary myelofibrosis",
    "myeloproliferative neoplasm", "myelodysplastic syndrome",
    "blastic plasmacytoid dendritic cell neoplasm"
        # Broader and related terms
    "neoplastic", "hyperplasia", "dysplasia", "precancerous", "preneoplastic", "intraepithelial neoplasia", "lesion", "mass", "growth", "nodule", "polyp", "cyst",
    "hyperplastic", "metaplasia", "carcinogenesis", "sarcomagenesis", "tumor marker", "tumor suppressor", "proto-oncogene", "oncoprotein", "neoplastic syndrome",
    "paraneoplastic", "cancerous", "tumorous", "neoplasia", "neoplastic process", "neoplastic disease",

    # Organ/location-specific cancers
    "breast cancer", "lung cancer", "prostate cancer", "colorectal cancer", "colon cancer", "rectal cancer", "pancreatic cancer", "liver cancer", "gastric cancer",
    "stomach cancer", "esophageal cancer", "thyroid cancer", "ovarian cancer", "cervical cancer", "endometrial cancer", "uterine cancer", "testicular cancer",
    "bladder cancer", "renal cancer", "kidney cancer", "skin cancer", "oral cancer", "mouth cancer", "head and neck cancer", "nasopharyngeal cancer", "brain cancer",
    "central nervous system tumor", "spinal tumor", "eye cancer", "retinal cancer", "bone cancer", "soft tissue sarcoma", "muscle cancer", "adrenal cancer",
    "thymic cancer", "lymph node cancer", "lymphatic cancer", "hematologic cancer", "blood cancer", "germ cell tumor", "placental tumor", "placental site trophoblastic tumor",

    # Common abbreviations
    "ALL", "AML", "CLL", "CML", "DLBCL", "NHL", "HL", "GBM", "HCC", "NSCLC", "SCLC", "MM", "MDS", "MPN", "GIST", "CNS",

    # Other related disease terms
    "leukoplakia", "erythroplakia", "actinic keratosis", "Bowen disease", "Paget disease", "carcinomatosis", "sarcomatosis", "lymphadenopathy", "paraneoplastic syndrome",
    "tumor lysis syndrome", "tumor necrosis factor", "oncocytoma", "angioma", "hemangioma", "lymphangioma", "neuroendocrine tumor", "pheochromocytoma", "paraganglioma",
    "schwannoma", "neurofibroma", "fibroma", "lipoma", "myoma", "adenofibroma", "cystadenoma", "cystadenocarcinoma", "papilloma"
]

import re
pattern = "|".join(keywords)

disease_terms_df = mesh_df[
    mesh_df["meshTerm"].str.contains(pattern, case=False, na=False, regex=True)
].drop_duplicates(subset=["meshTerm"])

print(f"Number of unique disease-related terms: {len(disease_terms_df)}")
print(disease_terms_df.head())

Number of unique disease-related terms: 336
              meshTerm                                         URI
24            'Glioma'            http://example.org/mesh/_Glioma_
29  'Medical_Oncology'  http://example.org/mesh/_Medical_Oncology_
37    Tumor-Associated    http://example.org/mesh/Tumor-Associated
50            Adenoma'            http://example.org/mesh/Adenoma_
54            Alleles'            http://example.org/mesh/Alleles_


In [68]:
client.collections.delete("Article")

In [69]:
from weaviate.classes.config import Configure


#define the collection
articles = client.collections.create(
    name = "Article",
    vectorizer_config=Configure.Vectorizer.text2vec_weaviate(),  # Hugging Face model  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    generative_config=None  
)

In [34]:
print(client.is_ready())

True


In [35]:
    print(client.collections.list_all())

{'Article': _CollectionConfigSimple(name='Article', description=None, generative_config=None, properties=[_Property(name='meshMajor', description="This property was generated by Weaviate's auto-schema feature on Wed Jul 16 12:51:05 2025", data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=False), vectorizer='text2vec-huggingface', vectorizer_configs=None), _Property(name='title', description="This property was generated by Weaviate's auto-schema feature on Wed Jul 16 12:51:05 2025", data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=False), vectorizer='text2vec-huggingface', vectorizer_c

In [20]:
     articles = client.collections.get("Article")
     articles.data.insert(
         properties={
             "title": "Test Title",
             "abstractText": "Test Abstract",
             "Article_URI": "http://example.org/article/test",
             "meshMajor": "Test Mesh"
         }
     )

UUID('776f6e33-8af3-4ffb-8efc-66b1eba9b51d')

In [14]:
terms = client.collections.get("term")
terms.data.insert(
    properties={
        "meshTerm": "Test Term",
        "URI": "http://example.org/mesh/test_term"
    }
)

UUID('b7fdad01-32a4-4aef-9b63-5c98ef587f67')

In [51]:
#define the collection
terms = client.collections.create(
    name = "term",
    vectorizer_config=Configure.Vectorizer.text2vec_weaviate(),  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    generative_config=None  
)

In [None]:
existing_terms = set()
batch_size = 100
offset = 0

while True:
    # Query a batch of objects
    results = terms.query.fetch_objects(
        limit=batch_size,
        offset=offset,
        return_properties=["meshTerm"]
    )
    if not results.objects:
        break
    for obj in results.objects:
        existing_terms.add(obj.properties["meshTerm"])
    offset += batch_size

In [17]:
import time

for _, row in disease_terms_df.iterrows():
    mesh_term = str(row["meshTerm"])
    if mesh_term in existing_terms:
        continue  # Skip already imported
    terms.data.insert(
        properties={
            "meshTerm": mesh_term,
            "URI": str(row["URI"]),
        }
    )
    time.sleep(0.1)  # 100ms delay between inserts

In [18]:
#Check that it works
from weaviate.classes.query import MetadataQuery

response = terms.query.near_text(
    query="mouth cancer",
    limit=10,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(f"UUID: {o.uuid}")
    print(o.properties)
    print(o.metadata.distance)


UUID: 7e269644-f751-4888-b926-c94b4ddc8d26
{'meshTerm': "'Mouth_Neoplasms'", 'uRI': 'http://example.org/mesh/_Mouth_Neoplasms_'}
0.6161625385284424
UUID: cc6123ab-b3aa-4f2b-86a8-4d78f540d6a6
{'meshTerm': "'Tongue_Neoplasms", 'uRI': 'http://example.org/mesh/_Tongue_Neoplasms'}
0.6701860427856445
UUID: 69761bb9-a52b-4dbb-8c00-a6b47def8391
{'meshTerm': 'Adenocarcinoma', 'uRI': 'http://example.org/mesh/Adenocarcinoma'}
0.7214798331260681
UUID: f8543051-ff27-47a5-b894-64b9546ce05f
{'uRI': 'http://example.org/mesh/Adenocarcinoma_', 'meshTerm': "Adenocarcinoma'"}
0.722684919834137
UUID: 8c9fb117-577c-4c09-ba05-75af226f59a4
{'meshTerm': "'Esophageal_Squamous_Cell_Carcinoma'", 'uRI': 'http://example.org/mesh/_Esophageal_Squamous_Cell_Carcinoma_'}
0.7333685159683228
UUID: 621ea70d-f5ee-4dc8-b3be-e7ea72e5e73b
{'meshTerm': "'Submandibular_Gland_Neoplasms", 'uRI': 'http://example.org/mesh/_Submandibular_Gland_Neoplasms'}
0.7388957142829895
UUID: ee2d1c01-7c85-46f8-ab65-522e4adcd17a
{'meshTerm': "Ad

In [21]:
#Check that it works
response = articles.query.near_text(
    query="mouth cancer",
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(f"UUID: {o.uuid}")
    print(o.properties)
    print(o.metadata.distance)


UUID: 0fe9b9cc-83bf-4076-9a43-ce5eaf3caefc
{'meshMajor': "['Antineoplastic Agents', 'Apoptosis', 'Autophagy', 'Carcinoma, Squamous Cell', 'Cell Cycle Checkpoints', 'Cell Line, Tumor', 'Cell Proliferation', 'Cell Survival', 'Humans', 'I-kappa B Kinase', 'Mitosis', 'Mouth Neoplasms', 'Phosphoproteins', 'Protein-Serine-Threonine Kinases', 'Proto-Oncogene Proteins c-akt', 'Pyrimidines', 'Signal Transduction', 'Thiophenes']", 'abstractText': 'TANK-binding kinase 1 (TBK1), a member of IêB Kinase (IKK)-related kinases, plays a role in regulating innate immunity, inflammation and oncogenic signaling. This study aims to investigate the role of BX795, an inhibitor of TBK1, in a panel of oral squamous cell carcinoma (OSCC) cell lines. The antitumor effects and mechanisms of BX795 were assessed by MTT assays, flow cytometry, Western blotting, and confocal microscopy. BX795 exhibited a dose-responsive antiproliferative effect on OSCC cells with relative sparing of normal human oral keratinocytes. T

In [22]:
aggregation = articles.aggregate.over_all(total_count=True)
print(aggregation.total_count)

1989


In [24]:
#Check that similarity works
response = articles.query.near_object(
    near_object="0fe9b9cc-83bf-4076-9a43-ce5eaf3caefc",  # A UUID of an object (e.g. "56b9449e-65db-5df4-887b-0a4773f52aa7")
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'meshMajor': "['Antineoplastic Agents', 'Apoptosis', 'Autophagy', 'Carcinoma, Squamous Cell', 'Cell Cycle Checkpoints', 'Cell Line, Tumor', 'Cell Proliferation', 'Cell Survival', 'Humans', 'I-kappa B Kinase', 'Mitosis', 'Mouth Neoplasms', 'Phosphoproteins', 'Protein-Serine-Threonine Kinases', 'Proto-Oncogene Proteins c-akt', 'Pyrimidines', 'Signal Transduction', 'Thiophenes']", 'abstractText': 'TANK-binding kinase 1 (TBK1), a member of IêB Kinase (IKK)-related kinases, plays a role in regulating innate immunity, inflammation and oncogenic signaling. This study aims to investigate the role of BX795, an inhibitor of TBK1, in a panel of oral squamous cell carcinoma (OSCC) cell lines. The antitumor effects and mechanisms of BX795 were assessed by MTT assays, flow cytometry, Western blotting, and confocal microscopy. BX795 exhibited a dose-responsive antiproliferative effect on OSCC cells with relative sparing of normal human oral keratinocytes. The compound caused apoptosis as evidenced b

### Knowledge Graph


In [27]:
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, XSD
import pandas as pd
import urllib.parse
import random
from datetime import datetime, timedelta
import re
from urllib.parse import quote

# --- Initialization ---
g = Graph()

# Define namespaces
schema = Namespace('http://schema.org/')
ex = Namespace('http://example.org/')
prefixes = {
    'schema': schema,
    'ex': ex,
    'skos': SKOS,
    'xsd': XSD
}
for p, ns in prefixes.items():
    g.bind(p, ns)

# Define classes and properties
Article = URIRef(ex.Article)
MeSHTerm = URIRef(ex.MeSHTerm)
g.add((Article, RDF.type, RDFS.Class))
g.add((MeSHTerm, RDF.type, RDFS.Class))

title = URIRef(schema.name)
abstract = URIRef(schema.description)
date_published = URIRef(schema.datePublished)
access = URIRef(ex.access)

g.add((title, RDF.type, RDF.Property))
g.add((abstract, RDF.type, RDF.Property))
g.add((date_published, RDF.type, RDF.Property))
g.add((access, RDF.type, RDF.Property))

# Function to clean and parse MeSH terms
def parse_mesh_terms(mesh_list):
    if pd.isna(mesh_list):
        return []
    return [term.strip() for term in mesh_list.strip("[]'").split(',')]

# Enhanced convert_to_uri function
def convert_to_uri(term, base_namespace="http://example.org/mesh/"):
    if pd.isna(term):
        return None  # Handle NaN or None terms gracefully
    
    # Step 1: Strip existing leading and trailing non-word characters (including underscores)
    stripped_term = re.sub(r'^\W+|\W+$', '', term)
    
    # Step 2: Replace non-word characters with underscores (one or more)
    formatted_term = re.sub(r'\W+', '_', stripped_term)
    
    # Step 3: Replace multiple consecutive underscores with a single underscore
    formatted_term = re.sub(r'_+', '_', formatted_term)
    
    # Step 4: URL-encode the term to handle any remaining special characters
    encoded_term = quote(formatted_term)
    
    # Step 5: Add single leading and trailing underscores
    term_with_underscores = f"_{encoded_term}_"
    
    # Step 6: Concatenate with base_namespace without adding an extra underscore
    uri = f"{base_namespace}{term_with_underscores}"

    return URIRef(uri)

# Function to generate a random date within the last 5 years
def generate_random_date():
    start_date = datetime.now() - timedelta(days=5*365)
    random_days = random.randint(0, 5*365)
    return start_date + timedelta(days=random_days)

# Function to generate a random access value between 1 and 10
def generate_random_access():
    return random.randint(1, 10)

# Function to create a valid URI for Articles
def create_article_uri(title, base_namespace="http://example.org/article"):
    if pd.isna(title):
        return None
    # Encode text to be used in URI
    sanitized_text = urllib.parse.quote(title.strip().replace(' ', '_').replace('"', '').replace('<', '').replace('>', '').replace("'", "_"))
    return URIRef(f"{base_namespace}/{sanitized_text}")

# Loop through each row in the DataFrame and create RDF triples
for index, row in df.iterrows():
    article_uri = create_article_uri(row['Title'])
    if article_uri is None:
        continue
    
    # Add Article instance
    g.add((article_uri, RDF.type, Article))
    g.add((article_uri, title, Literal(row['Title'], datatype=XSD.string)))
    g.add((article_uri, abstract, Literal(row['abstractText'], datatype=XSD.string)))
    
    # Add random datePublished and access
    random_date = generate_random_date()
    random_access = generate_random_access()
    g.add((article_uri, date_published, Literal(random_date.date(), datatype=XSD.date)))
    g.add((article_uri, access, Literal(random_access, datatype=XSD.integer)))
    
    # Add MeSH Terms
    mesh_terms = parse_mesh_terms(row['meshMajor'])
    for term in mesh_terms:
        term_uri = convert_to_uri(term, base_namespace="http://example.org/mesh/")
        if term_uri is None:
            continue
        
        # Add MeSH Term instance
        g.add((term_uri, RDF.type, MeSHTerm))
        g.add((term_uri, RDFS.label, Literal(term.replace('_', ' '), datatype=XSD.string)))
        
        # Link Article to MeSH Term
        g.add((article_uri, schema.about, term_uri))

# Serialize the graph to a file (optional)
g.serialize(destination='PubMedGraph.ttl', format='turtle')


<Graph identifier=N30cceac156784a82b7ae320f3d617d39 (<class 'rdflib.graph.Graph'>)>

In [None]:
import os

# Get the Desktop path
desktop = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop, "PubMedGraph.ttl")

# Ensure the directory exists (usually not needed for Desktop, but good practice)
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save the graph
g.serialize(destination=file_path, format='turtle')

print(f"File saved at {file_path}")