In [None]:
# Mount Google Drive at /content/drive. Later cells read the source PDFs and
# merged_data.json from, and write the embeddings cache to, paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install PyPDF2
!pip install langchain
!pip install langchain-community
!pip install langchain-experimental
!pip install pypdf
!pip install sentence-transformers
!pip install spellchecker



In [None]:
from transformers import AutoTokenizer
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker



# Folder on Google Drive that holds every source PDF.
PDF_DIR = '/content/drive/MyDrive/UNH_Hackathon/UNH_Hackathon_Data_PDFs'

# Define data paths (one per source document).
DATA_PATH_1 = PDF_DIR + '/A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'
DATA_PATH_2 = PDF_DIR + '/Hospital_ships_adrift_Part2_The_role_of_US_Navy_hospital_ship.pdf'
DATA_PATH_3 = PDF_DIR + '/Sea_Power_The_US_Navy_and_Foreign_Policy_Council_on_Foreign_Relations.pdf'
DATA_PATH_4 = PDF_DIR + '/US_Mercy.pdf'
# Fixed: this used to repeat DATA_PATH_3, so the Sea_Power PDF was loaded twice
# and its second copy was labelled as the disaster-response paper.
# NOTE(review): the file name is taken from the source label below — confirm it
# matches the file on Drive.
DATA_PATH_5 = PDF_DIR + '/US_Navy_Ship_Based_Disaster_Response_Lessons_Learned_PMC.pdf'

# Create a list of [path, source label] pairs. The label is what ends up in each
# chunk's metadata under 'source'.
# The second label intentionally keeps its original spelling (no '.pdf'): the
# manually written LaTeX-table metadata later in the notebook uses the same value.
data_paths = [
    [DATA_PATH_1, 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'],
    [DATA_PATH_2, 'Hospital_ships_adrift_Part2_The_role_of_US_Navy_hospital_ship'],
    [DATA_PATH_3, 'Sea_Power_The_US_Navy_and_Foreign_Policy_Council_on_Foreign_Relations.pdf'],
    [DATA_PATH_4, 'US_Mercy.pdf'],
    [DATA_PATH_5, 'US_Navy_Ship_Based_Disaster_Response_Lessons_Learned_PMC.pdf']
]


# Parallel output lists filled by the chunk-combining loop: entry i of
# metadata_list ({'source', 'page'}) describes entry i of combined_chunk_list.
combined_chunk_list = []  # List to hold combined chunks
metadata_list = []  # List to hold metadata for combined chunks

# Threshold for max chunk length after combination.
# Compared against a token count (len(tokenizer.tokenize(...))), not characters.
MAX_COMBINED_LENGTH = 512

# Initialize the tokenizer for token counting
# NOTE: a later cell rebinds the name `tokenizer` to the BAAI/llm-embedder
# tokenizer; re-run this cell before re-running the chunking loop.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


# Combine related chunks, restarting on new pages
# Running state used by the combining loop.
temp_text = ""  # Temporary variable for merging chunks
previous_page = None  # Track the page number of the last chunk
start_page = None  # Track the starting page of the combined chunk


# Initialize embedding model (passed to SemanticChunker in the loop below)
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Heuristic for table-like text. The loop applies it with .search(), so a single
# match of ANY alternative anywhere in a chunk causes that chunk to be skipped.
# NOTE(review): the first two alternatives also match ordinary prose such as
# "20 (T-AH 20)" or a standalone capital "X" — confirm this is not discarding
# narrative chunks.
table_pattern = re.compile(
    r'(\d+\s*\(.*?\))|'             # Numbers with parentheses, e.g., '10 (some text)'
    r'(\bX\b)|'                     # Standalone "X", e.g., ' X '
    r'((?:\d+\s+){3,})|'            # Three or more consecutive numbers, e.g., '10 20 30'
    r'(.*?\d+%\s*){3,}'             # Three or more percentages, e.g., '20% 30% 40%'
)



# Load each PDF, split it into semantic chunks, drop table-like chunks, and merge
# the remaining chunks into passages that stay on a single page and within
# MAX_COMBINED_LENGTH tokens. Results go to combined_chunk_list / metadata_list,
# which are kept index-aligned.
#
# NOTE: changing the chunk list invalidates any embeddings cached from a previous
# run (see the pickle cache used by the embedding cell); regenerate that cache.
for doc in data_paths:

    # Reset the merge state for every document. Previously this state leaked
    # across PDFs: the text flushed at the end of one PDF was still in temp_text
    # when the next PDF started, so it was appended a second time (or merged with
    # the next PDF's first chunk) under the next PDF's source label.
    temp_text = ""
    previous_page = None
    start_page = None

    # Uses langchain's document loader
    loader = PyPDFLoader(doc[0])
    data = loader.load()

    # Initial split so chunk size is more consistent
    size = 512
    overlap = 50
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    documents = splitter.split_documents(data)

    # Recreate chunks with semantic chunker
    semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")
    semantic_chunks = semantic_chunker.split_documents(documents)

    for chunk in semantic_chunks:
        # Table-like chunks are skipped entirely; they do not affect the page tracking.
        if table_pattern.search(chunk.page_content):
            continue

        chunk_text = chunk.page_content.strip()
        current_page = chunk.metadata['page']  # Get the current chunk's page number

        # Token count the passage would have if this chunk were merged into it
        combined_text = temp_text + " " + chunk_text
        combined_tokens = len(tokenizer.tokenize(combined_text))

        # Check if we're on a new page or if adding the chunk exceeds max length
        if previous_page is not None and (current_page != previous_page or combined_tokens > MAX_COMBINED_LENGTH):
            # Save the current combined chunk and its metadata.
            # Whitespace-only passages are dropped: they carry nothing to embed.
            if temp_text.strip():
                combined_chunk_list.append(temp_text.strip())
                metadata_list.append({
                    'source': doc[1],
                    'page': start_page,
                })

            # Start a new chunk
            temp_text = chunk_text
            start_page = current_page  # Set the starting page for the new chunk
        else:
            # Concatenate the current chunk to temp_text
            temp_text += " " + chunk_text
            if start_page is None:
                start_page = current_page  # Initialize start_page if starting new combined chunk

        previous_page = current_page  # Update the previous_page to the current page

    # Append any remaining text and metadata as the final chunk of this document
    if temp_text.strip():
        combined_chunk_list.append(temp_text.strip())
        metadata_list.append({
            'source': doc[1],
            'page': start_page,
        })




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


In [None]:
from pprint import pprint

# Show the size and a short preview instead of dumping every chunk: the full list
# floods the notebook output and bloats the saved file.
CHUNK_PREVIEW_COUNT = 3
print(f"{len(combined_chunk_list)} combined chunks; showing the first {CHUNK_PREVIEW_COUNT}:")
pprint(combined_chunk_list[:CHUNK_PREVIEW_COUNT])

['MILITARY MEDICINE, 188, 7/8:e1802, 2023\n'
 'A Decade of Surgery Aboard the U.S. Naval Ship\n'
 'COMFORT (T-AH 20)\n'
 'CDR T amara J. Worlton, MD, FACS*,†; CPT Rathnayaka MKD Gunasingha, MD\n'
 ' †; \n'
 'LCDR Rex Atwood, MD†; CDR Mark Johnson, MD, FACS‡; CDR Ian C. Uber, MD§\n'
 ' \n'
 'ABSTRACT \n'
 'Introduction:\n'
 'The U.S. Naval Ship COMFORT has performed six humanitarian assistance and '
 'disaster relief mission since 2007. This \n'
 'paper describes the surgical volume per surgical specialty for five missions '
 'spanning 19 countries. Materials and Methods:\n'
 'Raw surgical case logs were analyzed for total case volume, total operating '
 'days, unanticipated return to operating room, \n'
 'and percentage of pediatric cases (<18 years old) for each country visited. '
 'Results:\n'
 'Total surgical volume for the five missions was 5,142. The countries most '
 'frequently visited were Columbia and Haiti \n'
 'with seven and five visits, respectively. General surgery, ophth

In [None]:
from pprint import pprint

# Show the size and a short preview instead of dumping every metadata entry.
METADATA_PREVIEW_COUNT = 10
print(f"{len(metadata_list)} metadata entries; showing the first {METADATA_PREVIEW_COUNT}:")
pprint(metadata_list[:METADATA_PREVIEW_COUNT])

[{'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 2, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 3, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 3, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 5, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 5, 'source': 'A_Decade_of_Surgery_Aboard_

Below, the LaTeX versions of the graphs are manually inserted as chunks into the chunk list, together with their corresponding metadata.

In [None]:
import json

# Load the JSON data
file_path = '/content/drive/MyDrive/UNH_Hackathon/merged_data.json'  # Update with your file path
# JSON is UTF-8 by specification; be explicit instead of relying on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract LaTeX content from category_id 5 items whose "timeout" flag is missing or false.
# Items without a "latex" key contribute the placeholder "No LaTeX Content".
latex_content = [
    item.get("latex", "No LaTeX Content")
    for entry in data
    for item in entry.get("layout_dets", [])
    if item.get("category_id") == 5 and not item.get("timeout", False)
]

# Human-readable titles, one per extracted item, in extraction order.
prefixes = [
    "Total Volume per Specialty per Year : ",
    "Number of Surgeons by Specialty on Each Mission Year (Estimated) : ",
    "Number of Cases per Country : ",
    "Partnership Themes Listed by Frequency Within and Between Units of Analysis : ",
    "Documentary Partnership Themes Matching Case Study : ",
    "Recent missions of the USNS Mercy (3) : "
]

# Source/page metadata for each titled item (same order as `prefixes`).
prefixes_metadata = [
    {'page': 2, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
    {'page': 2, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
    {'page': 3, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
    {'page': 6, 'source': 'Hospital_ships_adrift_Part2_The_role_of_US_Navy_hospital_ship'},
    {'page': 9, 'source': 'Hospital_ships_adrift_Part2_The_role_of_US_Navy_hospital_ship'},
    {'page': 2, 'source': 'US_Mercy.pdf'}
]
assert len(prefixes) == len(prefixes_metadata), "every prefix needs a metadata entry"

# The titles and metadata are matched to the extracted items purely by position,
# so the counts must line up. Fail loudly instead of raising a bare IndexError.
if len(latex_content) < len(prefixes):
    raise ValueError(
        f"Expected at least {len(prefixes)} LaTeX items in {file_path}, "
        f"found {len(latex_content)}"
    )
if len(latex_content) > len(prefixes):
    print(
        f"Warning: {len(latex_content) - len(prefixes)} extra LaTeX item(s) have no "
        "prefix/metadata and will not be added to the chunk list."
    )

# Prepend each prefix to the corresponding latex_content string
for i, prefix in enumerate(prefixes):
    latex_content[i] = prefix + " " + latex_content[i]

# Print the updated latex_content to see the result
for content in latex_content:
    print(content)

# Append each titled LaTeX chunk together with its metadata so that
# combined_chunk_list and metadata_list stay index-aligned. Chunks that are
# already present are skipped, which makes re-running this cell harmless.
for latex_chunk, latex_metadata in zip(latex_content[:len(prefixes)], prefixes_metadata):
    if latex_chunk in combined_chunk_list:
        continue
    combined_chunk_list.append(latex_chunk)
    metadata_list.append(latex_metadata)

Total Volume per Specialty per Year :  \begin{tabular}{|>{\hspace{0.5cm}}m{6cm} >{\hspace{0.5cm}}m{2.5cm} >{\hspace{0.5cm}}m{2.5cm} >{\hspace{0.5cm}}m{2.5cm} >{\hspace{0.5cm}}m{1.5cm} >{\hspace{0.5cm}}m{1.5cm}|}\noalign{\smallskip}\noalign{\smallskip}\hline \hline \noalign{\smallskip}\multicolumn{6}{|c|}{} \\[-0.5ex] & 2009 & 2011 & 2015 & 2018 & 2019 \\[0.5ex]\hline \noalign{\smallskip}\textit{Operating days} & 47 & 71 & 84 & 29 & 69 \\\textit{Total mission time Speciallies} & 4\,months & S\,months & 6\,months & 3\,months & S\,months \\General surgery & 506 & 259 & 435 & 265 & 462 \\Gynecology & 115 & 63 & 122 & 0 & 0 \\Ophthalmology & 199 & 312 & 201 & 211 & 412 \\O偏ryngology & SS & 98 & 78 & 0 & 0 \\Oral maxillofacial surgery & 49 & 68 & 75 & 38 & 12 \\Orthopedic surgery & 81 & 94 & 98 & 29 & 22 \\Plastic surgery & 74 & 117 & 129 & 54 & 143 \\Urology & 59 & 100 & 56 & 0 & 51 \\Total & 1,138 & 1,111 & 1,194 & 597 & 1,102 \\\textit{Return to operating room rate} & 0.33\% & 0.18\% & 0.

Setting up a clean Vector Database and Hybrid search

In [None]:
# SECURITY: the Zilliz Cloud password and API key must not be stored in this
# notebook (neither in code nor in saved cell output).
#
# Provide them at runtime instead, for example:
#   - environment variables (e.g. ZILLIZ_TOKEN), or
#   - Colab secrets / an interactive getpass.getpass() prompt.
#
# The credentials that were previously pasted into this cell have been exposed
# to everyone with access to the notebook and should be rotated.

'[output cleared: this cell echoed the database password and API key]'

In [None]:
!pip3 install --upgrade pip
# Install pymilvus compatible with Milvus v2.3.x
!pip install pymilvus==2.3.7

# Install pymilvus compatible with Milvus v2.4.x
!pip install pymilvus==2.4.4

# Update PyMilvus to the newest version
!pip install --upgrade pymilvus

# Verify installation success
!pip list | grep pymilvus

#for embeddings
!pip install FlagEmbedding
!pip install peft


Collecting pymilvus==2.3.7
  Using cached pymilvus-2.3.7-py3-none-any.whl.metadata (4.4 kB)
Using cached pymilvus-2.3.7-py3-none-any.whl (179 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing installation: pymilvus 2.4.9
    Uninstalling pymilvus-2.4.9:
      Successfully uninstalled pymilvus-2.4.9
Successfully installed pymilvus-2.3.7
Collecting pymilvus==2.4.4
  Using cached pymilvus-2.4.4-py3-none-any.whl.metadata (5.4 kB)
Using cached pymilvus-2.4.4-py3-none-any.whl (196 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing installation: pymilvus 2.3.7
    Uninstalling pymilvus-2.3.7:
      Successfully uninstalled pymilvus-2.3.7
Successfully installed pymilvus-2.4.4
Collecting pymilvus
  Using cached pymilvus-2.4.9-py3-none-any.whl.metadata (5.6 kB)
Using cached pymilvus-2.4.9-py3-none-any.whl (201 kB)
Installing collected packages: pymilvus
  Attempting uninstall: pymilvus
    Found existing i

In [None]:
import time
# Fixed 5-second pause before installing/importing pymilvus.
# NOTE(review): the install commands are not run in the background, so this
# pause looks unnecessary — confirm before removing.
time.sleep(5)
# Installs pymilvus again; the previous cell already installs and upgrades it.
!pip install pymilvus
import pymilvus
# MilvusClient is used by the embedding cell to connect to the cluster.
from pymilvus import MilvusClient, AnnSearchRequest, WeightedRanker



In [None]:
"""
RUN THIS CODE AGAIN IF IT FAILS THE FIRST TIME!!!
RUN THIS CODE AGAIN IF IT FAILS THE FIRST TIME!!!
RUN THIS CODE AGAIN IF IT FAILS THE FIRST TIME!!!
RUN THIS CODE AGAIN IF IT FAILS THE FIRST TIME!!!
RUN THIS CODE AGAIN IF IT FAILS THE FIRST TIME!!!


Also takes about 7 minutes to run if the embeddings are not already stored on your Google Drive; if they are, it takes about 30 seconds.
"""





import getpass
import random
import numpy as np
import os
import pickle

from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer from Hugging Face Hub.
# NOTE: this rebinds `tokenizer`, which the chunking cell uses (with the MiniLM
# tokenizer) for token counting — re-run the chunking setup before re-chunking.
tokenizer = AutoTokenizer.from_pretrained('BAAI/llm-embedder')
model = AutoModel.from_pretrained('BAAI/llm-embedder')
model.eval()


# Step 1: Connection settings.
# The API token is a secret and is never hard-coded: it is read from the
# ZILLIZ_TOKEN environment variable, or prompted for once and remembered for the
# rest of this kernel session.
CLUSTER_ENDPOINT = os.environ.get(
    "ZILLIZ_CLUSTER_ENDPOINT",
    "https://in03-cf607103ea8262d.serverless.gcp-us-west1.cloud.zilliz.com",
)
TOKEN = os.environ.get("ZILLIZ_TOKEN") or getpass.getpass("Zilliz Cloud API token: ")
os.environ["ZILLIZ_TOKEN"] = TOKEN

# Step 2: Connect to your Milvus cluster hosted on Zilliz Cloud
client = MilvusClient(
    uri=CLUSTER_ENDPOINT,
    token=TOKEN
)


# EMBEDDING MODEL, turning CHUNKS INTO EMBEDDINGS

keys = combined_chunk_list
if not keys:
    raise ValueError("combined_chunk_list is empty; run the chunking cells first.")

# Number of chunks embedded per forward pass; bounds peak memory.
EMBED_BATCH_SIZE = 16

# Define the save path in your Google Drive
save_path = '/content/drive/MyDrive/Oct_31_embeddings_data.pkl'

# Reuse the cached embeddings only if they were computed from exactly the chunks
# and metadata currently in memory. Loading a stale cache would pair embeddings
# with the wrong chunk text when they are inserted into the collection.
cached = None
if os.path.exists(save_path):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself.
    with open(save_path, 'rb') as f:
        cached = pickle.load(f)
    if cached.get('keys') != keys or cached.get('metadata_list') != metadata_list:
        print("Cached embeddings do not match the current chunks/metadata; regenerating.")
        cached = None

if cached is not None:
    sentence_embeddings = cached['embeddings']
    print("Loaded embeddings, keys, and metadata from Google Drive.")
else:
    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(keys), EMBED_BATCH_SIZE):
            batch = keys[start:start + EMBED_BATCH_SIZE]

            # Tokenize sentences
            encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

            # Compute embeddings
            model_output = model(**encoded_input)
            # Use CLS token embedding as sentence embedding
            cls_embeddings = model_output[0][:, 0]
            # Normalize embeddings for cosine similarity
            batch_embeddings.append(torch.nn.functional.normalize(cls_embeddings, p=2, dim=1))

    sentence_embeddings = torch.cat(batch_embeddings, dim=0)
    print("Generated embeddings.")

    # Save embeddings, keys, and metadata_list to Google Drive
    with open(save_path, 'wb') as f:
        pickle.dump({'embeddings': sentence_embeddings, 'keys': keys, 'metadata_list': metadata_list}, f)
    print("Saved embeddings, keys, and metadata to Google Drive.")


DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 1f323cdbb22d4dc88b6c82aa964d04bb


Loaded embeddings, keys, and metadata from Google Drive.


In [None]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, AnnSearchRequest, WeightedRanker
import getpass
import os
import random
import time

# Connect to your Zilliz Cloud cluster.
# The API token is a secret and is never hard-coded: it is read from the
# ZILLIZ_TOKEN environment variable, or prompted for once and remembered for the
# rest of this kernel session.
CLUSTER_ENDPOINT = os.environ.get(
    "ZILLIZ_CLUSTER_ENDPOINT",
    "https://in03-cf607103ea8262d.serverless.gcp-us-west1.cloud.zilliz.com",
)
TOKEN = os.environ.get("ZILLIZ_TOKEN") or getpass.getpass("Zilliz Cloud API token: ")
os.environ["ZILLIZ_TOKEN"] = TOKEN

# Step 1: Connect to the cluster
connections.connect(uri=CLUSTER_ENDPOINT, token=TOKEN)

# Step 2: Create the collection schema with 'metadata' field
def create_collection(name="Oct_31", dim=768):
    """Build the chunk-store schema and create a collection with it.

    Args:
        name: Collection name. Defaults to "Oct_31".
        dim: Dimension of the embedding vectors stored in "posterVector".
            Defaults to 768, the dimension reported when the embeddings are inserted.

    Returns:
        The pymilvus ``Collection`` object.

    NOTE(review): if a collection called ``name`` already exists, pymilvus is
    expected to reuse it rather than start empty — confirm, because re-running
    the insert step would then add the same ids again.
    """
    # Define field schemas
    fields = [
        # Caller-supplied integer primary key (auto_id is off).
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
        # Embedding of the chunk text.
        FieldSchema(name="posterVector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        # Per-chunk metadata dict, stored as JSON.
        FieldSchema(name="metadata", dtype=DataType.JSON),
        # The chunk's text (one string per entity), stored as JSON.
        FieldSchema(name="chunks", dtype=DataType.JSON),
    ]

    # Create the collection schema; only the four declared fields are accepted.
    schema = CollectionSchema(fields=fields, enable_dynamic_field=False)

    # Create the collection
    collection = Collection(name=name, schema=schema)
    print(f"Collection '{name}' created successfully.")
    return collection


# Step 4: Create index for vector fields
def create_index(collection):
    """Create an AUTOINDEX with the L2 metric on the 'posterVector' field.

    Args:
        collection: Object exposing ``create_index(field_name, index_params)``.

    Returns:
        None. A confirmation line is printed once the call returns.
    """
    collection.create_index(
        "posterVector",
        dict(
            metric_type="L2",  # L2 distance for similarity search
            index_type="AUTOINDEX",
        ),
    )
    print("Index created for 'posterVector'.")



def _as_float_list(vector):
    """Return one embedding row as a plain list (tensor/array rows via .tolist())."""
    if hasattr(vector, "tolist"):
        return vector.tolist()
    return list(vector)


# Step 5: Insert vector embeddings into the collection (with 'metadata' field)
def insert_embeddings(collection, k_embeddings, m_data_list, c_list):
    """Insert one entity per embedding into ``collection``.

    Entity ``i`` gets ``id=i``, the i-th embedding as "posterVector", the i-th
    metadata entry as "metadata" and the i-th chunk text as "chunks".

    Args:
        collection: Object exposing ``insert(rows)``.
        k_embeddings: Sequence of embedding rows (lists, numpy arrays or tensors).
        m_data_list: Metadata entries, index-aligned with ``k_embeddings``.
        c_list: Chunk texts, index-aligned with ``k_embeddings``.

    Raises:
        ValueError: If the three inputs do not have the same length. Inserting
            anyway would attach chunks/metadata to the wrong embeddings.
    """
    total = len(k_embeddings)
    if len(m_data_list) != total or len(c_list) != total:
        raise ValueError(
            "Embeddings, metadata and chunks must have the same length, got "
            f"{total}, {len(m_data_list)} and {len(c_list)}"
        )
    if total == 0:
        # Nothing to send; avoids indexing into an empty sequence below.
        print("No embeddings to insert into the collection 'Oct_31'")
        return

    # Check the dimension of the embeddings being inserted
    print(f"Inserting embeddings with dimensions: {len(k_embeddings[0])}")

    data = [
        {"id": i, "posterVector": _as_float_list(vector), "metadata": metadata, "chunks": chunk_text}
        for i, (vector, metadata, chunk_text) in enumerate(zip(k_embeddings, m_data_list, c_list))
    ]

    # Insert the data into the collection
    collection.insert(data)
    print(f"Inserted {len(data)} entities into the collection 'Oct_31'")


# Step 7: Execute the full workflow
# Uses `sentence_embeddings`, `metadata_list` and `combined_chunk_list` produced
# by earlier cells; the three are consumed index-by-index, so they must line up.
# NOTE(review): the fixed 5-second pauses are a guess at how long the service
# needs — they do not guarantee the index/data are ready.
collection = create_collection()  # Create the collection with schema
create_index(collection)  # Create indexes for vector fields
time.sleep(5)  # Allow time for index creation
insert_embeddings(collection,sentence_embeddings, metadata_list, combined_chunk_list)  # Insert embeddings into the collection
time.sleep(5)  # Allow time for data to be indexed



Collection 'Oct_31' created successfully.
Index created for 'posterVector'.
Inserting embeddings with dimensions: 768
Inserted 87 entities into the collection 'Oct_31'
