In [None]:
# Mount Google Drive into the Colab runtime; the PDF paths used later in this
# notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install PyPDF2
!pip install langchain
!pip install langchain-community
!pip install langchain-experimental
!pip install pypdf
!pip install sentence-transformers

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m215.0/232.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain)
  Downloading langchain_core-0.3.13-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  D

In [None]:
from transformers import AutoTokenizer
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker



# Define data paths
# All source PDFs live in one Drive folder; keeping the folder in a single
# constant means it only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/UNH_Hackathon/UNH_Hackathon_Data_PDFs'

DATA_PATH_1 = f'{DATA_DIR}/A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'
DATA_PATH_2 = f'{DATA_DIR}/Hospital_ships_adrift_Part2_The_role_of_US_Navy_hospital_ship.pdf'
DATA_PATH_3 = f'{DATA_DIR}/Sea_Power_The_US_Navy_and_Foreign_Policy_Council_on_Foreign_Relations.pdf'
DATA_PATH_4 = f'{DATA_DIR}/US_Mercy.pdf'
DATA_PATH_5 = f'{DATA_DIR}/US_Navy_Ship_Based_Disaster_Response_Lessons_Learned_PMC.pdf'

# Create a list of [path, filename] pairs.
# The filename is derived from the path (text after the last '/') so the label
# stored as chunk 'source' metadata always matches the file on disk, including
# its '.pdf' extension.
data_paths = [
    [pdf_path, pdf_path.rsplit('/', 1)[-1]]
    for pdf_path in (DATA_PATH_1, DATA_PATH_2, DATA_PATH_3, DATA_PATH_4, DATA_PATH_5)
]


# Largest size, in tokens, that a merged chunk is allowed to grow to
MAX_COMBINED_LENGTH = 512

# Output accumulators: merged chunk texts and, in parallel, one metadata dict
# per merged chunk
combined_chunk_list = []
metadata_list = []

# Running merge state
temp_text = ""         # text accumulated for the chunk currently being built
previous_page = None   # page number of the most recently processed chunk
start_page = None      # page number on which the accumulated text started

# The same sentence-transformers model provides both the tokenizer (used for
# token counting) and the embeddings
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embed_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Heuristic for table-like text: the alternatives below are OR-ed together
# into a single pattern
table_pattern = re.compile("|".join([
    r'(\d+\s*\(.*?\))',     # digits followed by a parenthesised group, e.g. '10 (some text)'
    r'(\bX\b)',             # a standalone capital "X"
    r'((?:\d+\s+){3,})',    # three or more whitespace-terminated numbers, e.g. '10 20 30 '
    r'(.*?\d+%\s*){3,}',    # three or more percentage values, e.g. '20% 30% 40%'
]))



# Build the splitters once: neither depends on the document being processed.
# Initial split so chunk size is more consistent
size = 512
overlap = 50
splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
# Semantic chunker used to re-chunk the fixed-size pieces
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")


def _save_combined_chunk(text, source, page):
    """Append a merged chunk and its metadata to the output lists.

    Args:
        text: Accumulated chunk text; surrounding whitespace is stripped.
        source: Label stored under the 'source' metadata key.
        page: Page number stored under the 'page' metadata key.

    Text that is empty after stripping is skipped, so the two output lists
    never receive blank entries and always stay the same length.
    """
    text = text.strip()
    if text:
        combined_chunk_list.append(text)
        metadata_list.append({
            'source': source,
            'page': page,
        })


# Extract text from each document and merge related chunks page by page
for doc in data_paths:

    # Reset the merge state for every document. Without this, the text left
    # over from the previous document (already saved by its final flush) would
    # be saved a second time under this document's source label.
    temp_text = ""        # text accumulated for the chunk being built
    previous_page = None  # page of the last accepted chunk in this document
    start_page = None     # page on which the accumulated text started

    # Uses langchain's document loader
    loader = PyPDFLoader(doc[0])
    data = loader.load()

    # Fixed-size split first, then semantic re-chunking
    documents = splitter.split_documents(data)
    semantic_chunks = semantic_chunker.split_documents(documents)

    for chunk in semantic_chunks:
        # Drop chunks that the table heuristic flags
        if table_pattern.search(chunk.page_content):
            continue

        chunk_text = chunk.page_content.strip()
        if not chunk_text:
            continue  # nothing to merge from a whitespace-only chunk

        current_page = chunk.metadata['page']  # Get the current chunk's page number

        # Token count the merged text would have if this chunk were added.
        # The tokenizer is used for counting only.
        combined_text = temp_text + " " + chunk_text
        combined_tokens = len(tokenizer.tokenize(combined_text))

        # Close the current merged chunk when the page changes or when adding
        # this chunk would exceed the token budget
        if previous_page is not None and (current_page != previous_page or combined_tokens > MAX_COMBINED_LENGTH):
            _save_combined_chunk(temp_text, doc[1], start_page)

            # Start a new merged chunk with the current text
            temp_text = chunk_text
            start_page = current_page
        else:
            # Keep accumulating into the current merged chunk
            temp_text += " " + chunk_text
            if start_page is None:
                start_page = current_page  # first accepted chunk of this document

        previous_page = current_page  # Update the previous_page to the current page

    # Save whatever is still accumulated as this document's final chunk
    _save_combined_chunk(temp_text, doc[1], start_page)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


In [None]:
from pprint import pprint

# Show how many chunks were produced and preview only the first few; printing
# the whole list floods the notebook output.
CHUNK_PREVIEW_COUNT = 3
print(f"{len(combined_chunk_list)} combined chunks")
pprint(combined_chunk_list[:CHUNK_PREVIEW_COUNT])

['MILITARY MEDICINE, 188, 7/8:e1802, 2023\n'
 'A Decade of Surgery Aboard the U.S. Naval Ship\n'
 'COMFORT (T-AH 20)\n'
 'CDR T amara J. Worlton, MD, FACS*,†; CPT Rathnayaka MKD Gunasingha, MD\n'
 ' †; \n'
 'LCDR Rex Atwood, MD†; CDR Mark Johnson, MD, FACS‡; CDR Ian C. Uber, MD§\n'
 ' \n'
 'ABSTRACT \n'
 'Introduction:\n'
 'The U.S. Naval Ship COMFORT has performed six humanitarian assistance and '
 'disaster relief mission since 2007. This \n'
 'paper describes the surgical volume per surgical specialty for five missions '
 'spanning 19 countries. Materials and Methods:\n'
 'Raw surgical case logs were analyzed for total case volume, total operating '
 'days, unanticipated return to operating room, \n'
 'and percentage of pediatric cases (<18 years old) for each country visited. '
 'Results:\n'
 'Total surgical volume for the five missions was 5,142. The countries most '
 'frequently visited were Columbia and Haiti \n'
 'with seven and five visits, respectively. General surgery, ophth

In [None]:
from collections import Counter
from pprint import pprint

# The two lists are built in parallel, so they must have the same length.
assert len(metadata_list) == len(combined_chunk_list), "chunk and metadata lists are out of sync"

# Summarise chunks per source and preview the first few entries instead of
# printing the whole list.
METADATA_PREVIEW_COUNT = 10
pprint(Counter(entry['source'] for entry in metadata_list))
pprint(metadata_list[:METADATA_PREVIEW_COUNT])

[{'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 0, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 1, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 2, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 3, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 3, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 4, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 5, 'source': 'A_Decade_of_Surgery_Aboard_the_US_COMFORT.pdf'},
 {'page': 5, 'source': 'A_Decade_of_Surgery_Aboard_