In [1]:
%pip install torch torchvision torchaudio
%pip install transformers
%pip install groq
%pip install transformers sentencepiece accelerate
%pip install -q transformers accelerate bitsandbytes sentencepiece torch
%pip install openai
%pip install pillow

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-1.0.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m


In [None]:
import os
os.environ["GROQ_API_KEY"] = "GROQ_API_KEY"
OWNER = "Springblade"
REPO = "Project-LLM"
ROOT_PATH = "extracted_data"

TOKEN = "GITHUB_TOKEN"   

HEADERS = {"Authorization": f"token {TOKEN}"}

API_URL = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{ROOT_PATH}"

In [4]:
import requests
API_URL = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{ROOT_PATH}"

def get_md_files(url, md_files):
    response = requests.get(url, headers=HEADERS)
    items = response.json()

    # Handle GitHub error cleanly
    if isinstance(items, dict) and items.get("message"):
        print("GitHub API error:", items["message"])
        return md_files   # return what we have instead of None

    for item in items:
        if item["type"] == "file" and item["name"].endswith(".md"):
            raw_url = item["download_url"]
            text = requests.get(raw_url, headers=HEADERS).text
            md_files[item["path"]] = text

        elif item["type"] == "dir":
            get_md_files(item["url"], md_files)

    return md_files

In [6]:
all_md = get_md_files(API_URL, {})

print("Downloaded:", len(all_md), "markdown files")
for path, content in all_md.items():
    print("----", path, "----")
    print(content[:200], "...")  # preview first 200 chars

Downloaded: 93 markdown files
---- extracted_data/1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi/1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.md ----
1

# A 20-Year-Old Woman from Sudan With Fever, Haemorrhage and Shock

DANIEL G. BAUSCH

# Clinical Presentation

# History

A 20-year-old housewife presents to a hospital in northern Uganda with a 2- ...
---- extracted_data/10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-/10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.md ----
# 10

## A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis

BART J. CURRIE AND JAMES MCCARTHY

#### Clinical Presentation

#### History

You are working in a  ...
---- extracted_data/11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T/11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T.md ----
# 11

# A 45-Y

In [7]:
len(all_md)

93

In [8]:
#particion the orginal dictionary to prevent wasting tokens if error happen
def chunk_dict_contiguous(d, parts=4):
    items = list(d.items())
    n = len(items)
    q, r = divmod(n, parts)
    result = []
    start = 0
    for i in range(parts):
        size = q + (1 if i < r else 0)   # distribute remainder to first r parts
        chunk_items = items[start:start+size]
        result.append(dict(chunk_items))
        start += size
    return result

# Example
#d = {f"k{i}": i for i in range(11)}
#a, b, c, d4 = chunk_dict_contiguous(d, 4)
#print(len(a), len(b), len(c), len(d4))  # -> balanced sizes like 3,3,3,2

In [9]:
batch1, batch2, batch3, batch4 = chunk_dict_contiguous(all_md, 4)

In [10]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ["GROQ_API_KEY"], base_url="https://api.groq.com/openai/v1")

def run_llama(prompt, model="llama-3.1-8b-instant",pre_prompt_instruction='You are a help medical assistant'):

    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": pre_prompt_instruction},
            {"role": "user", "content": prompt},
        ],
    )
    return completion.choices[0].message.content

In [11]:
user_pre_prompt_instruction = """
           You are TropID-Extractor, an expert clinical information extractor for tropical & infectious diseases.  ,
           Your task is to create a short descripition about the disease based on provided document and your base knowledge. The output must follow below json format, no comment or deviation from instructed format:
           {
                "Disease_summary": "Sufficient information about the disease includes primary area of occurance (optional), symptom (mandatory), clinical presentation (optional) , differential diagnosis (manadatory if it is included in the context), key finding to diagnose this exact disease (Lab test, scanning result, blood test, etc,mandatory)",
                "Keyword": "Name of the disease, short tag to identify the disease"
           }
           Example output
           {
                "Disease_summary": "Ebola virus disease primarily occurs in sub-Saharan Africa and typically presents with acute fever, severe headache, myalgia, vomiting, and diarrhea, with hemorrhagic manifestations in severe cases. Clinically, it is characterized by rapid progression to gastrointestinal illness, dehydration, and multiorgan dysfunction. Important differential diagnoses include malaria, typhoid fever, Lassa fever, and dengue hemorrhagic fever. The diagnosis is confirmed by detection of Ebola virus RNA using RT-PCR from blood or other body fluids, supported by relevant epidemiological exposure.",
                "Keyword": "Ebola"
           }
          """


In [12]:
from openai import OpenAI, RateLimitError, APIError
import time

def summarize_text(Batch: dict, pre_prompt_instruction: str = ''):
    outputSummarised = {}
    error_list = {}
    prompt_template = (
        "Now do the assigned task from this clinical case:"
    )

    for file_name, raw_text in Batch.items():
        print(f"Working with {file_name}\n")
        user_prompt = f"{prompt_template}\nCase report:\n{raw_text}"

        try:
            summary = run_llama(
                    user_prompt,
                    pre_prompt_instruction=pre_prompt_instruction
                )
            outputSummarised[file_name] = summary

        except Exception as e:
            print(f"Unexpected error on {file_name}: {e}. Skipping this iteration")
            error_list[file_name] = raw_text


    return outputSummarised,error_list


In [13]:
batch3_summary,error_list_3 = summarize_text(batch3,pre_prompt_instruction = user_pre_prompt_instruction)

Working with extracted_data/53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical/53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical.md

Working with extracted_data/54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-/54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-.md

Working with extracted_data/55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop/55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop.md

Working with extracted_data/56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica/56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica.md

Working with extracted_data/57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-/57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-.md

Working with extracted_data/58---A-25-Year-Old-Woman-from-Egypt-With-S

In [14]:
batch4_summary,error_list_4 = summarize_text(batch4,pre_prompt_instruction = user_pre_prompt_instruction)

Working with extracted_data/74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop/74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop.md

Working with extracted_data/75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica/75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica.md

Working with extracted_data/76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic/76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic.md

Working with extracted_data/77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i/77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i.md

Working with extracted_data/78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi/78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi.md

Working with extracted_data/79---A-34-Year-Old-Male-Immigrant-from-Per

In [15]:
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image
import torch
import numpy as np
import base64, io

# Load Jina CLIP model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-clip-v1")
model = AutoModel.from_pretrained("jinaai/jina-clip-v1")
processor = AutoProcessor.from_pretrained("jinaai/jina-clip-v1")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

jinaai/jina-clip-implementation You can inspect the repository content at https://hf.co/jinaai/jina-clip-v1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


configuration_clip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- configuration_clip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


The repository jinaai/jina-clip-v1 references custom code contained in jinaai/jina-clip-implementation which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/jinaai/jina-clip-implementation .
 You can inspect the repository content at https://hf.co/jinaai/jina-clip-v1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling_clip.py: 0.00B [00:00, ?B/s]

rope_embeddings.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- rope_embeddings.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


hf_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- hf_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


eva_model.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- eva_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


transform.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- transform.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- modeling_clip.py
- rope_embeddings.py
- hf_model.py
- eva_model.py
- transform.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/891M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_bert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py: 0.00B [00:00, ?B/s]

mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


bert_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- bert_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py: 0.00B [00:00, ?B/s]

mha.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- block.py
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-flash-implementation:
- modeling_bert.py
- mlp.py
- embedding.py
- bert_padding.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

jinaai/jina-clip-implementation You can inspect the repository content at https://hf.co/jinaai/jina-clip-v1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


processing_clip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-clip-implementation:
- processing_clip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [19]:
import json
import re
import numpy as np
import torch
def safe_json_loads(text, file_name=None):
    """Try to safely parse JSON-like text produced by LLMs."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Attempt common fixes
        fixed = text.strip()

        # Replace single quotes with double quotes (careful with numbers)
        fixed = fixed.replace("'", '"')

        # Remove trailing commas before } or ]
        fixed = re.sub(r",(\s*[}\]])", r"\1", fixed)

        # Ensure it starts/ends correctly
        if not fixed.startswith("{"):
            fixed = "{" + fixed
        if not fixed.endswith("}"):
            fixed = fixed + "}"

        try:
            return json.loads(fixed)
        except Exception as e:
            print(f"⚠️ Failed to parse JSON for {file_name or 'unknown file'}: {e}")
            return None
def embed_text(ALL_Text_Extraction):
    context, hashtag = [], []
    embedded_context, embedded_hashtag = [], []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on device: {device}")

    for file_name, text in ALL_Text_Extraction.items():
        print(f"Working with {file_name}")
        json_objects = safe_json_loads(text, file_name=file_name)
        if json_objects is None:
            continue

        ctx = json_objects["Disease_summary"]
        tag = json_objects["Keyword"]

        context.append(ctx)
        hashtag.append(tag)

        # ---- context embedding ----
        inputs = tokenizer(
            ctx,
            return_tensors="pt",
            truncation=True,
            max_length=12000,
        ).to(device)

        with torch.no_grad():
            feat = model.get_text_features(**inputs)

        feat = feat[0]  # ✅ remove batch dimension
        feat = feat / feat.norm(p=2)
        embedded_context.append(feat.cpu().tolist())

        # ---- hashtag embedding ----
        inputs = tokenizer(
            tag,
            return_tensors="pt",
            truncation=True,
            max_length=12000,
        ).to(device)

        with torch.no_grad():
            feat = model.get_text_features(**inputs)

        feat = feat[0]  # ✅ remove batch dimension
        feat = feat / feat.norm(p=2)
        embedded_hashtag.append(feat.cpu().tolist())

    return context, embedded_context, hashtag, embedded_hashtag
def embed_plain_text(text_string):
    """
    Embed a plain list of text strings using Jina CLIP v1.
    """
    # Prefer GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Tokenize text
    inputs = tokenizer(
        text_string,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=3000  # typical safe limit for plain text
    ).to(device)

    # Compute embeddings
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)

    # Normalize embeddings (cosine similarity compatible)
    text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

    # Convert to numpy
    return text_features.cpu().numpy()


In [20]:
context3, embedded_context3, hashtag3, embedded_hashtag3 = embed_text(batch3_summary)

Running on device: cpu
Working with extracted_data/53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical/53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical.md
Working with extracted_data/54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-/54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-.md
Working with extracted_data/55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop/55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop.md
Working with extracted_data/56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica/56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica.md
Working with extracted_data/57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-/57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-.md
Working with extracted_data/58---A-25-Year-Old-Woman

In [22]:
context4, embedded_context4, hashtag4, embedded_hashtag4 = embed_text(batch4_summary)

Running on device: cpu
Working with extracted_data/74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop/74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop.md
Working with extracted_data/75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica/75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica.md
Working with extracted_data/76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic/76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic.md
Working with extracted_data/77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i/77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i.md
Working with extracted_data/78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi/78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi.md
Working with extracted_data/79---A-34-Year-Old-Male-

In [23]:
%pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.16.2-py3-none-any.whl (377 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m377.2/377.2 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.16.2


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
import numpy as np
import uuid
QDRANT_URL = "QDRANT_URL_ENDPOINT"
QDRANT_API_KEY = "QDRANT_API_KEY"
client = QdrantClient(url=QDRANT_URL,api_key=QDRANT_API_KEY)

In [25]:
VECTOR_SIZE = 768  # vector size for jina clip embedding

client.create_collection(
    collection_name="combine_multimodal_embeddings",
    vectors_config=models.VectorParams(
        size=VECTOR_SIZE,
        distance=models.Distance.COSINE
    )
)

True

In [26]:
import uuid
def store_text(context, embedded_context, hashtag, embedded_hashtag):
    for each_context, each_context_emb, each_hashtag, each_hashtag_emb in zip(
        context, embedded_context, hashtag, embedded_hashtag
    ):
        client.upsert(
            collection_name="combine_multimodal_embeddings",
            points=[
                models.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=each_context_emb,  # ✅ FIXED
                    payload={
                        "type": "disease_overview",
                        "content": each_context,
                        "disease_name": each_hashtag,
                    },
                )
            ],
        )

        client.upsert(
            collection_name="combine_multimodal_embeddings",
            points=[
                models.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=each_hashtag_emb,  # ✅ FIXED
                    payload={
                        "type": "disease_name",
                        "content": each_hashtag,
                    },
                )
            ],
        )

In [27]:
store_text(context3, embedded_context3, hashtag3, embedded_hashtag3)

In [28]:
store_text(context4, embedded_context4, hashtag4, embedded_hashtag4)

In [30]:
from qdrant_client.models import Filter, FieldCondition, MatchValue
from qdrant_client.http import models as rest
def retrieve_text_contexts(query_text, keyword = None, top_k=3, string_formatted_result = True):
  # Ensure the payload indexes are created for the correct collection (can be called multiple times safely)
  client.create_payload_index(
      collection_name="combine_multimodal_embeddings",
      field_name="type",
      field_schema=rest.PayloadSchemaType.KEYWORD
  )
  client.create_payload_index(
      collection_name="combine_multimodal_embeddings",
      field_name="disease_name",
      field_schema=rest.PayloadSchemaType.KEYWORD
  )
  if keyword:
    must_conditions = [
        FieldCondition(key="type", match=MatchValue(value="disease_name"))
    ]
    results = client.query_points(
      collection_name="combine_multimodal_embeddings",
      query=embed_plain_text(keyword)[0].tolist(),
      limit=top_k,
      query_filter=Filter(must=must_conditions),
      with_payload=True,
      with_vectors=False,
      score_threshold=None
    )
    try:
      retrieved_disease_name_tag = results.points[0].payload.get("content","")
    except Exception as e:
      print(f"Warning: Could not retrieve disease name for keyword '{keyword}'. Error: {e}")
      return [] # Return empty list if no disease name found for keyword
    must_conditions = [
        FieldCondition(key="type", match=MatchValue(value="disease_overview"))
    ]
    must_conditions.append(FieldCondition(key="disease_name", match=MatchValue(value=keyword)))
    overview_results = client.query_points(
      collection_name="combine_multimodal_embeddings",
      query=embed_plain_text(query_text)[0].tolist(),
      limit=top_k,
      query_filter=Filter(must=must_conditions),
      with_payload=True,
      with_vectors=False,
      score_threshold=None
    )
    if string_formatted_result == True:
      formatted_result = ""
      for each_result in overview_results.points:
        formatted_result += f"{each_result.payload.get('content','')}\n"
      return formatted_result
    else:
      return overview_results.points
  else:
    must_conditions = [
        FieldCondition(key="type", match=MatchValue(value="disease_overview"))
    ]
    results = client.query_points(
      collection_name="combine_multimodal_embeddings",
      query=embed_plain_text(query_text)[0].tolist(), # Fixed: Use query_text when keyword is None
      limit=top_k,
      query_filter=Filter(must=must_conditions),
      with_payload=True,
      with_vectors=False,
      score_threshold=None
    )
    if string_formatted_result:
      formatted_result = ""
      for each_result in results.points: # Fixed: Iterate over 'results.points'
        formatted_result += f"{each_result.payload.get('content','')}\n"
      return formatted_result
    else:
      return results.points

In [32]:
retrieve_text_contexts("Patient was a 33-year-old physician who had been working in Liberia since October 2013, during which time he had remained healthy while taking daily combination therapy with atovaquone and proguanil as prophylaxis against malaria. In April 2014, he and his team established an EVD care unit in Monrovia, and patients with confirmed EVD began arriving at this facility on June 11, 2014. On July 23, 2014, he awoke feeling febrile and fatigued; his oral temperature was 37.8°C. He reported his symptoms to colleagues and remained at home. Results on two rapid diagnostic tests for malaria (Standard Diagnostics) were negative. He started empirical malaria treatment with artemether and lumefantrine. Later that day, his oral temperature was 38.6°C, and nausea developed. He was tested for malaria by means of a rapid diagnostic test and for yellow fever, Lassa fever, and EBOV by means of semiquantitative real-time reverse-transcriptase–polymerase-chain-reaction (RT-PCR) assays, all of which were performed at the Liberian National Reference Laboratory. The results for all the tests were negative.", top_k =5)

"A 24-year-old woman from Uganda presents with a febrile illness, sore throat, abdominal pain, diarrhea, and bilateral conjunctivitis, with a history of her husband's recent death from a similar illness. The differential diagnoses include malaria, typhoid fever, invasive meningococcal disease, adenovirus, influenza, measles, Zika virus, and viral hemorrhagic fever (VHF), with Marburg virus disease being a key consideration given the husband's occupation in a gold mine. Testing should be done with caution, and samples should be sent to a specialized laboratory for PCR. The patient is treated with supportive care and resuscitation, and strict infection control measures should be followed.\nThis patient, a 35-year-old Peruvian logger, presents with acute febrile illness that produces liver failure, jaundice, bleeding, encephalopathy, and seizures. The diagnosis is confirmed by IgM capture ELISA positivity for yellow fever virus, which is a flavivirus transmitted by Aedes and Haemagogus mo