### Embed PubMed journal articles into Weaviate

Source data: the PubMed MultiLabel Text Classification Dataset (MeSH) from Kaggle: https://www.kaggle.com/datasets/owaiskhan9654/pubmed-multilabel-text-classification

In [None]:
from weaviate.util import generate_uuid5
import weaviate
import json
import pandas as pd


In [None]:
import ast
import json
import os
import re

import pandas as pd
import weaviate
from weaviate.classes.init import Auth
from weaviate.util import generate_uuid5

# Connect to Weaviate Cloud.
# Credentials are read from environment variables so real keys never have to be typed
# into (or saved with) the notebook. The "XXX" placeholders are only used when a
# variable is unset, so editing the placeholders in place still works as before.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ.get("WEAVIATE_URL", "XXX"),  # Weaviate Cloud URL
    auth_credentials=Auth.api_key(os.environ.get("WEAVIATE_API_KEY", "XXX")),  # Weaviate Cloud key
    headers={'X-OpenAI-Api-key': os.environ.get("OPENAI_API_KEY", "XXX")}  # OpenAI API key
)

In [None]:
# Load the pre-processed PubMed table through the Spark session and convert it to pandas
source_table = "workspace.default.pub_med_multi_label_text_classification_dataset_processed"
df = spark.sql(f"SELECT * FROM {source_table}").toPandas()

  if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):


In [None]:
# Filter the dataset if needed: keep only the first 1000 rows
df = df.iloc[:1000]

In [None]:
import numpy as np

# Clean the frame in one non-mutating chain. `df` is a slice of the loaded table at this
# point, so in-place edits and column assignment on it are the SettingWithCopy pattern;
# building a new frame avoids that and makes the cell safe to re-run.
df = (
    df
    # Replace infinity values with NaN ...
    .replace([np.inf, -np.inf], np.nan)
    # ... and then fill every missing value with an empty string
    .fillna('')
    # Convert the text columns to string type
    .astype({'Title': str, 'abstractText': str, 'meshMajor': str})
)

In [None]:
import urllib.parse
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal


# Function to create a valid URI
def create_valid_uri(base_uri, text):
    """
    Build a URIRef of the form ``<base_uri>/<sanitized text>``.

    The text is trimmed, spaces become underscores, double quotes and angle
    brackets are removed, apostrophes become underscores, and the result is
    percent-encoded.

    Args:
        base_uri (str): Prefix of the URI, without a trailing slash.
        text (str): Free text to turn into the last path segment.

    Returns:
        URIRef or None: The URI, or None when ``text`` is missing (None/NaN).
    """
    if pd.isna(text):
        return None
    segment = text.strip().replace(' ', '_')
    for dropped in ('"', '<', '>'):
        segment = segment.replace(dropped, '')
    segment = segment.replace("'", "_")
    # Percent-encode whatever is still not URI-safe
    return URIRef(base_uri + "/" + urllib.parse.quote(segment))


# Function to create a valid URI for Articles
def create_article_uri(title, base_namespace="http://example.org/article/"):
    """
    Creates a URI for an article by replacing non-word characters with underscores and URL-encoding.

    Note: the Article_URI column in this cell is built with create_valid_uri, and a
    later cell defines another create_article_uri with different sanitisation, so the
    URIs produced here are not interchangeable with those.

    Args:
        title (str): The title of the article.
        base_namespace (str): The base namespace for the article URI (used as-is, so it
            should end with a separator such as "/").

    Returns:
        URIRef: The formatted article URI, or None when the title is missing (None/NaN).
    """
    if pd.isna(title):
        return None
    # Replace each run of non-word characters with a single underscore
    sanitized_title = re.sub(r'\W+', '_', title.strip())
    # Condense multiple underscores into a single underscore
    sanitized_title = re.sub(r'_+', '_', sanitized_title)
    # URL-encode the term. `urllib.parse` is imported at the top of this cell; the bare
    # `quote` name is only imported much later in the notebook.
    encoded_title = urllib.parse.quote(sanitized_title)
    # Concatenate with base_namespace without adding underscores
    return URIRef(f"{base_namespace}{encoded_title}")

# Add a new column to the DataFrame holding the URI derived from each article title
df['Article_URI'] = df['Title'].map(
    lambda article_title: create_valid_uri("http://example.org/article", article_title)
)


In [None]:
# Function to clean and parse MeSH terms
def parse_mesh_terms(mesh_list):
    """
    Parse a meshMajor cell into a list of clean MeSH terms.

    The column holds the text form of a Python list, e.g.
    "['Histiocytoma, Benign Fibrous', 'Humans']". Splitting that text on commas breaks
    terms that contain a comma and leaves stray quote characters on the pieces, so the
    text is parsed as a literal first; comma splitting is only a fallback for text that
    is not a valid list literal.

    Args:
        mesh_list (str | list | tuple | None): Raw cell value.

    Returns:
        list[str]: Terms with surrounding quotes/whitespace removed and spaces replaced
        by underscores. Empty terms are dropped; missing or empty input gives [].
    """
    if isinstance(mesh_list, (list, tuple)):
        candidates = mesh_list
    elif pd.isna(mesh_list):
        return []
    else:
        text = str(mesh_list).strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            candidates = parsed
        else:
            # Not a list literal: fall back to a plain comma-separated reading
            candidates = text.strip("[]").split(',')

    terms = []
    for candidate in candidates:
        term = str(candidate).strip().strip("'\"").strip()
        if term:
            terms.append(term.replace(' ', '_'))
    return terms


def _mesh_uri_slug(term):
    """Normalise a term with the same steps convert_to_uri (knowledge-graph section) applies."""
    slug = re.sub(r'^\W+|\W+$', '', term)  # drop leading/trailing non-word characters
    slug = re.sub(r'\W+', '_', slug)       # runs of non-word characters -> one underscore
    return re.sub(r'_+', '_', slug)        # collapse repeated underscores


# Function to create a valid URI for MeSH terms
def create_valid_uri(base_uri, text):
    """
    Build ``<base_uri>/<sanitized text>`` as a plain string.

    This definition replaces the create_valid_uri defined in the previous cell for the
    rest of the session; that one wrapped the result in a URIRef, this one returns str.

    Args:
        base_uri (str): Prefix of the URI, without a trailing slash.
        text (str): Text for the last path segment.

    Returns:
        str or None: The URI, or None when ``text`` is missing (None/NaN).
    """
    if pd.isna(text):
        return None
    sanitized_text = urllib.parse.quote(
        text.strip()
        .replace(' ', '_')
        .replace('"', '')
        .replace('<', '')
        .replace('>', '')
        .replace("'", "_")
    )
    return f"{base_uri}/{sanitized_text}"

# Extract and process all MeSH terms
all_mesh_terms = []
for mesh_list in df["meshMajor"]:
    all_mesh_terms.extend(parse_mesh_terms(mesh_list))

# Deduplicate terms; sorted so the row order is the same on every run
unique_mesh_terms = sorted(set(all_mesh_terms))

# Create a DataFrame of MeSH terms and their URIs.
# Every URI uses the "_Term_" shape (single leading and trailing underscore) that
# convert_to_uri produces in the knowledge-graph section, so the two sets of URIs agree
# for every term instead of depending on where a term sat in its list.
mesh_df = pd.DataFrame({
    "meshTerm": unique_mesh_terms,
    "URI": [
        create_valid_uri("http://example.org/mesh", f"_{_mesh_uri_slug(term)}_")
        for term in unique_mesh_terms
    ],
})

# Display the DataFrame
print(mesh_df)


                    meshTerm                                             URI
0                                                   http://example.org/mesh/
1               'Motivation'            http://example.org/mesh/_Motivation_
2      'Amino_Acid_Sequence'   http://example.org/mesh/_Amino_Acid_Sequence_
3                 Meningeal'              http://example.org/mesh/Meningeal_
4                   Natural'                http://example.org/mesh/Natural_
...                      ...                                             ...
5291  'Respiratory_Mechanics  http://example.org/mesh/_Respiratory_Mechanics
5292              'Syndrome'              http://example.org/mesh/_Syndrome_
5293         'Nanoparticles'         http://example.org/mesh/_Nanoparticles_
5294                 'Masks'                 http://example.org/mesh/_Masks_
5295     'Diagnostic_Errors'     http://example.org/mesh/_Diagnostic_Errors_

[5296 rows x 2 columns]


In [None]:
from weaviate.classes.config import Configure


# Define the "Article" collection
article_settings = {
    "name": "Article",
    # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "vectorizer_config": Configure.Vectorizer.text2vec_openai(),
    # Ensure the `generative-openai` module is used for generative queries
    "generative_config": Configure.Generative.openai(),
}
articles = client.collections.create(**article_settings)

In [None]:
# Add objects
articles = client.collections.get("Article")

with articles.batch.dynamic() as batch:
    for index, row in df.iterrows():
        batch.add_object({
            "title": row["Title"],
            "abstractText": row["abstractText"],
            "Article_URI": row["Article_URI"],
            "meshMajor": row["meshMajor"],
        })

# The batch collects objects that failed to import instead of raising, so report them
# here rather than letting a partial import pass unnoticed.
failed_articles = articles.batch.failed_objects
if failed_articles:
    print(f"{len(failed_articles)} article(s) failed to import. First failure: {failed_articles[0]}")

In [None]:
# Define the "term" collection
term_settings = {
    "name": "term",
    # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "vectorizer_config": Configure.Vectorizer.text2vec_openai(),
    # Ensure the `generative-openai` module is used for generative queries
    "generative_config": Configure.Generative.openai(),
}
terms = client.collections.create(**term_settings)

In [None]:
# Add objects
terms = client.collections.get("term")

with terms.batch.dynamic() as batch:
    for index, row in mesh_df.iterrows():
        batch.add_object({
            "meshTerm": row["meshTerm"],
            "URI": row["URI"],
        })

# The batch collects objects that failed to import instead of raising, so report them
# here rather than letting a partial import pass unnoticed.
failed_terms = terms.batch.failed_objects
if failed_terms:
    print(f"{len(failed_terms)} term(s) failed to import. First failure: {failed_terms[0]}")

In [None]:
# Check that it works: semantic search over the MeSH term collection
from weaviate.classes.query import MetadataQuery

response = terms.query.near_text(
    query="mouth cancer",
    limit=10,
    return_metadata=MetadataQuery(distance=True),
)

# One block of three lines per hit: UUID, stored properties, vector distance
for hit in response.objects:
    print(f"UUID: {hit.uuid}", hit.properties, hit.metadata.distance, sep="\n")

UUID: 1acdd564-4ca0-47f9-94bc-381c54c39d78
{'meshTerm': "'Melanoma'", 'uRI': 'http://example.org/mesh/_Melanoma_'}
0.5611209869384766
UUID: 380ddcff-34a4-419c-a26e-e1ef2e6d9106
{'meshTerm': "'Mouth_Neoplasms'", 'uRI': 'http://example.org/mesh/_Mouth_Neoplasms_'}
0.5695425868034363
UUID: 8489e332-bfa2-4004-b859-d687702ef674
{'meshTerm': 'Carcinoma', 'uRI': 'http://example.org/mesh/Carcinoma'}
0.5727190375328064
UUID: 227adc65-58fa-4e2b-aeab-73034309244c
{'meshTerm': "Adenocarcinoma_of_Lung'", 'uRI': 'http://example.org/mesh/Adenocarcinoma_of_Lung_'}
0.5770481824874878
UUID: 2bb491bb-14d7-45c9-870c-bd967d363744
{'meshTerm': "Tumor'", 'uRI': 'http://example.org/mesh/Tumor_'}
0.5853508710861206
UUID: 3f2f74b2-c6f5-4c73-951f-3b6af2f3250f
{'meshTerm': 'Adenocarcinoma', 'uRI': 'http://example.org/mesh/Adenocarcinoma'}
0.5880469083786011
UUID: 5bb11fb8-ff0f-45f7-b5d1-c6005c15d79f
{'meshTerm': "'Lung_Neoplasms'", 'uRI': 'http://example.org/mesh/_Lung_Neoplasms_'}
0.5928151607513428
UUID: 344356

In [None]:
# Check that it works: semantic search over the Article collection
response = articles.query.near_text(
    query="mouth cancer",
    limit=2,
    return_metadata=MetadataQuery(distance=True),
)

# One block of three lines per hit: UUID, stored properties, vector distance
for hit in response.objects:
    print(f"UUID: {hit.uuid}", hit.properties, hit.metadata.distance, sep="\n")

UUID: 9083872f-f1e4-42cb-9a30-12d6177dc7f8
{'title': 'Malignant fibrous histiocytoma of the pharynx.', 'meshMajor': "['Histiocytoma, Benign Fibrous', 'Humans', 'Male', 'Middle Aged', 'Pharyngeal Neoplasms', 'Tomography, X-Ray Computed']", 'abstractText': 'Malignant fibrous histiocytoma (MFH) is the most common soft-tissue sarcoma of late adult life, but is relatively uncommon in the head and neck region. That region has been reported to be the origin of malignant fibrous histiocytoma in 3-10% of cases. Only one case of the tumor occurring in the pharynx has been reported. Histologically it is sometimes hard to distinguish this tumor from some sarcomas and pleomorphic carcinomas. The treatment of choice is a large surgical resection, while radiotherapy and chemotherapy are reserved for recurrences. The authors present a case of oropharyngeal malignant fibrous histiocytoma. The patient complained dysphagia and dyslalia progressively worsening in six months. Pharyngo-laryngoscopy revealed

In [None]:
# Count the objects currently stored in the Article collection
aggregation = articles.aggregate.over_all(total_count=True)
print(aggregation.total_count)

1000


In [None]:
# Check that similarity works
# The ingestion cell passes no uuid to batch.add_object, so object IDs differ from one
# ingestion to the next and a UUID pasted from an earlier run stops existing once the
# collection is rebuilt. Resolve the reference article at run time instead: the top hit
# for "mouth cancer" is the article the pasted UUID pointed to in the recorded output.
reference_hits = articles.query.near_text(query="mouth cancer", limit=1).objects
if not reference_hits:
    raise LookupError("The Article collection returned no objects; load the articles before running this check.")
reference_uuid = reference_hits[0].uuid

response = articles.query.near_object(
    near_object=reference_uuid,  # UUID of the object to find neighbours of
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'title': 'Malignant fibrous histiocytoma of the pharynx.', 'meshMajor': "['Histiocytoma, Benign Fibrous', 'Humans', 'Male', 'Middle Aged', 'Pharyngeal Neoplasms', 'Tomography, X-Ray Computed']", 'abstractText': 'Malignant fibrous histiocytoma (MFH) is the most common soft-tissue sarcoma of late adult life, but is relatively uncommon in the head and neck region. That region has been reported to be the origin of malignant fibrous histiocytoma in 3-10% of cases. Only one case of the tumor occurring in the pharynx has been reported. Histologically it is sometimes hard to distinguish this tumor from some sarcomas and pleomorphic carcinomas. The treatment of choice is a large surgical resection, while radiotherapy and chemotherapy are reserved for recurrences. The authors present a case of oropharyngeal malignant fibrous histiocytoma. The patient complained dysphagia and dyslalia progressively worsening in six months. Pharyngo-laryngoscopy revealed a mass of the left lateral wall of oro and

In [None]:
# Single-prompt RAG: the prompt is applied to each retrieved article; {title} is a
# placeholder left for the generative call to fill in.
prompt = "Please explain this article {title} like you would to someone without a medical degree."

response = articles.generate.near_text(
    query="Malignant fibrous histiocytoma of the pharynx.",
    limit=1,
    single_prompt=prompt,
)

# print source properties and generated responses
for hit in response.objects:
    print(hit.properties, hit.generated, sep="\n")

{'title': 'Malignant fibrous histiocytoma of the pharynx.', 'meshMajor': "['Histiocytoma, Benign Fibrous', 'Humans', 'Male', 'Middle Aged', 'Pharyngeal Neoplasms', 'Tomography, X-Ray Computed']", 'abstractText': 'Malignant fibrous histiocytoma (MFH) is the most common soft-tissue sarcoma of late adult life, but is relatively uncommon in the head and neck region. That region has been reported to be the origin of malignant fibrous histiocytoma in 3-10% of cases. Only one case of the tumor occurring in the pharynx has been reported. Histologically it is sometimes hard to distinguish this tumor from some sarcomas and pleomorphic carcinomas. The treatment of choice is a large surgical resection, while radiotherapy and chemotherapy are reserved for recurrences. The authors present a case of oropharyngeal malignant fibrous histiocytoma. The patient complained dysphagia and dyslalia progressively worsening in six months. Pharyngo-laryngoscopy revealed a mass of the left lateral wall of oro and

In [None]:
# Grouped RAG: one generated answer for the whole set of retrieved articles

task = "Summarize the key information here in bullet points. Make it understandable to someone without a medical degree."

response = articles.generate.near_text(query="mouth cancer", limit=3, grouped_task=task)

# print the generated response
grouped_summary = response.generated
print(grouped_summary)

- Malignant fibrous histiocytoma (MFH) is a rare soft-tissue sarcoma that can occur in the head and neck region, with only one reported case in the pharynx.
- Treatment for MFH typically involves surgical resection, with radiotherapy and chemotherapy used for recurrences.
- A case of oropharyngeal MFH presented with symptoms of dysphagia and dyslalia, which were successfully treated with surgical excision.
- Low malignant mucoepidermoid carcinoma with stromal fibrosis and eosinophilic infiltration is a subtype of mucoepidermoid carcinoma that may be misdiagnosed as a more aggressive form.
- Accurate histological diagnosis is important for determining prognosis and treatment options for this subtype of mucoepidermoid carcinoma.
- Primary leiomyosarcoma in the trachea is rare, with only six cases reported in the English literature, making clinical diagnosis and classification challenging.


### Turn metadata into a knowledge graph (KG)

In [None]:
from rdflib import Graph, RDF, RDFS, Namespace, URIRef, Literal
from rdflib.namespace import SKOS, XSD
import pandas as pd
import urllib.parse
import random
from datetime import datetime, timedelta
import re
from urllib.parse import quote

# --- Initialization ---
g = Graph()

# Namespaces, bound to short prefixes so the serialized output stays readable
schema = Namespace('http://schema.org/')
ex = Namespace('http://example.org/')
prefixes = dict(schema=schema, ex=ex, skos=SKOS, xsd=XSD)
for p, ns in prefixes.items():
    g.bind(p, ns)

# Classes
Article = URIRef(ex.Article)
MeSHTerm = URIRef(ex.MeSHTerm)
for rdf_class in (Article, MeSHTerm):
    g.add((rdf_class, RDF.type, RDFS.Class))

# Properties attached to articles
title = URIRef(schema.name)
abstract = URIRef(schema.description)
date_published = URIRef(schema.datePublished)
access = URIRef(ex.access)
for rdf_property in (title, abstract, date_published, access):
    g.add((rdf_property, RDF.type, RDF.Property))

# Function to clean and parse MeSH terms
def parse_mesh_terms(mesh_list):
    """
    Parse a meshMajor cell into a list of clean MeSH terms.

    The column holds the text form of a Python list, e.g.
    "['Histiocytoma, Benign Fibrous', 'Humans']". Splitting that text on commas breaks
    terms that contain a comma and leaves stray quote characters on the pieces, so the
    text is parsed as a literal first; comma splitting is only a fallback for text that
    is not a valid list literal.

    Args:
        mesh_list (str | list | tuple | None): Raw cell value.

    Returns:
        list[str]: Terms with surrounding quotes/whitespace removed. Empty terms are
        dropped; missing or empty input gives [].
    """
    if isinstance(mesh_list, (list, tuple)):
        candidates = mesh_list
    elif pd.isna(mesh_list):
        return []
    else:
        text = str(mesh_list).strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            candidates = parsed
        else:
            # Not a list literal: fall back to a plain comma-separated reading
            candidates = text.strip("[]").split(',')

    terms = []
    for candidate in candidates:
        term = str(candidate).strip().strip("'\"").strip()
        if term:
            terms.append(term)
    return terms

# Enhanced convert_to_uri function
def convert_to_uri(term, base_namespace="http://example.org/mesh/"):
    """
    Converts a MeSH term into a standardized URI by replacing spaces and special characters with underscores,
    ensuring it starts and ends with a single underscore, and URL-encoding the term.

    Args:
        term (str): The MeSH term to convert.
        base_namespace (str): The base namespace for the URI (used as-is, so it should
            end with a separator such as "/").

    Returns:
        URIRef: The formatted URI, or None when the term is missing (None/NaN) or has no
        word characters left after cleaning (e.g. "" or "'"). Without that guard every
        such term would collapse into the same meaningless ".../__" node.
    """
    if pd.isna(term):
        return None  # Handle NaN or None terms gracefully

    # Step 1: Strip leading and trailing non-word characters (quotes, spaces, brackets, ...)
    stripped_term = re.sub(r'^\W+|\W+$', '', term)

    # Step 2: Replace non-word characters with underscores (one or more)
    formatted_term = re.sub(r'\W+', '_', stripped_term)

    # Step 3: Replace multiple consecutive underscores with a single underscore
    formatted_term = re.sub(r'_+', '_', formatted_term)

    # Nothing usable is left: let the caller skip this term
    if not formatted_term:
        return None

    # Step 4: URL-encode the term to handle any remaining special characters
    encoded_term = quote(formatted_term)

    # Step 5: Add single leading and trailing underscores, then
    # Step 6: concatenate with base_namespace without adding an extra underscore
    return URIRef(f"{base_namespace}_{encoded_term}_")

# Function to generate a random date within the last 5 years
def generate_random_date():
    """
    Return a random datetime between five years (5 * 365 days) ago and now.

    The offset is a whole number of days drawn with ``random.randint``, so the
    time-of-day part equals the time of the call.
    """
    window_days = 5 * 365
    window_start = datetime.now() - timedelta(days=window_days)
    offset = timedelta(days=random.randint(0, window_days))
    return window_start + offset

# Function to generate a random access value between 1 and 10
def generate_random_access():
    """Return a random integer access level from 1 to 10, both ends included."""
    return random.randrange(1, 11)

# Function to create a valid URI for Articles
def create_article_uri(title, base_namespace="http://example.org/article"):
    """
    Build the URI that identifies an article from its title.

    The title is trimmed, spaces become underscores, double quotes and angle brackets
    are removed, apostrophes become underscores, and the result is percent-encoded
    before being appended to ``base_namespace`` after a "/".

    Args:
        title (str): The title of the article.
        base_namespace (str): The base namespace for the article URI, without a trailing slash.

    Returns:
        URIRef: The formatted article URI, or None when the title is missing (None/NaN).
    """
    if pd.isna(title):
        return None
    slug = title.strip().replace(' ', '_')
    for dropped in ('"', '<', '>'):
        slug = slug.replace(dropped, '')
    slug = slug.replace("'", "_")
    # Percent-encode whatever is still not URI-safe
    return URIRef(base_namespace + "/" + urllib.parse.quote(slug))

# Walk the DataFrame and emit the RDF triples for every article
for row_label, record in df.iterrows():
    article_uri = create_article_uri(record['Title'])
    if article_uri is None:
        continue

    # Random datePublished and access values (date first, then access)
    random_date = generate_random_date()
    random_access = generate_random_access()

    # Article instance and its literal properties
    article_facts = (
        (RDF.type, Article),
        (title, Literal(record['Title'], datatype=XSD.string)),
        (abstract, Literal(record['abstractText'], datatype=XSD.string)),
        (date_published, Literal(random_date.date(), datatype=XSD.date)),
        (access, Literal(random_access, datatype=XSD.integer)),
    )
    for predicate, value in article_facts:
        g.add((article_uri, predicate, value))

    # MeSH terms: one node per term, linked from the article
    for mesh_term in parse_mesh_terms(record['meshMajor']):
        mesh_uri = convert_to_uri(mesh_term, base_namespace="http://example.org/mesh/")
        if mesh_uri is None:
            continue

        # MeSH term instance with a human-readable label
        g.add((mesh_uri, RDF.type, MeSHTerm))
        g.add((mesh_uri, RDFS.label, Literal(mesh_term.replace('_', ' '), datatype=XSD.string)))

        # Link Article to MeSH Term
        g.add((article_uri, schema.about, mesh_uri))

# Serialize the graph to a file (optional)
# Writes the graph in Turtle format to PubMedGraph.ttl, a path relative to the current
# working directory. As the last expression of the cell, the call's return value is
# echoed as the cell output.
g.serialize(destination='PubMedGraph.ttl', format='turtle')


  """


<Graph identifier=Nb0e4bfd11a6c43338668c3ed1facc6e9 (<class 'rdflib.graph.Graph'>)>



In [None]:
# Path to save the file
# NOTE(review): absolute path, presumably the Databricks workspace root - confirm it
# exists and is writable in the environment where this notebook runs.
file_path = "/Workspace/PubMedGraph.ttl"

# Save the file (same Turtle serialization as above, written to file_path)
g.serialize(destination=file_path, format='turtle')

print(f"File saved at {file_path}")

File saved at /Workspace/PubMedGraph.ttl
