In [1]:
# install necessary packages
!pip install pdfplumber faiss-cpu transformers sentence-transformers nltk numpy

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m581.0 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━

In [2]:
# import necessary packages
import pdfplumber
import nltk
import faiss 
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration
import re
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CPU times: user 14.6 s, sys: 2.4 s, total: 17 s
Wall time: 23.9 s


In [3]:
def extract_text_by_column(pdf_path): 
    # pdf_path: filepath of the textbook pdf document
    
    columns = [] # list to store text in, by column

    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # define boundaries of first and second columns
            x0, y0, x1, y1 = page.bbox  # page boundaries
            page_width = x1 - x0
            page_height = y1 - y0

            mid_x = x0 + (page_width / 2) # mid point of page

            left_bbox = (x0, y0, mid_x, y1) # first column boundaries
            right_bbox = (mid_x, y0, x1, y1) # second column boundaries

            # extract text from each column
            left_text = page.within_bbox(left_bbox).extract_text(x_tolerance=2, y_tolerance=2)
            right_text = page.within_bbox(right_bbox).extract_text(x_tolerance=2, y_tolerance=2)

            # add columns with text to list, with page number
            for col_text in [left_text, right_text]:
                if col_text and col_text.strip():
                    columns.append({"text": col_text.strip(), "page": i + 1})
    # return list of columns of text
    return columns 

def chunk_text(columns, max_tokens=150, overlap_tokens=40): 
    # columns: list of columns from extract_text_by_column
    # max_tokens: maxmium amount of tokens in a chunk
    # overlap_tokens: number of tokens to overlap by between chunks

    # create tokenizer 
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    chunks = [] # list to store chunks of text as dictionaries with keys 'text' and 'page'

    # divide each column of text into chunks
    for col_chunk in columns:
        # remove newline symbols from text
        text = col_chunk['text'].replace('-\n', '').replace('\n', ' ') 
        page = col_chunk['page']
        # tokenize by sentence
        sentences = nltk.tokenize.sent_tokenize(text)

        current_chunk = [] # current chunk to add sentences to 
        current_tokens = 0 # number of tokens in current chunk
        overlap_chunk = []  # overlap tokens from previous chunk

        for sentence in sentences:
            # encode the sentence
            sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
            
            # check if adding the sentence will make the chunk too long
            if current_tokens + len(sentence_tokens) <= max_tokens:
                # if it isn't too long, add the sentence to the current chunk
                current_chunk.append(sentence)
                # update the chunk token count
                current_tokens += len(sentence_tokens)
                
            else:
                # if it is too long, add the current chunk to the chunks list and move on to the next chunk
                chunks.append({'text': ' '.join(current_chunk), 'page': page})
                
                # tokenize the current chunk
                tokenized_chunk = tokenizer.encode(' '.join(current_chunk), add_special_tokens=False)
                if len(tokenized_chunk) > overlap_tokens:
                    # go backwards through the chunk to get overlap tokens
                    overlap_text = ''
                    overlap_token_count = 0
                    for s in reversed(current_chunk):
                        # add sentences to next chunk, making sure to stay under the overlap amount
                        s_tokens = tokenizer.encode(s, add_special_tokens=False)
                        if overlap_token_count + len(s_tokens) > overlap_tokens:
                            break
                        overlap_text = s + ' ' + overlap_text
                        overlap_token_count += len(s_tokens)
                    # start next chunk with overlap tokens
                    current_chunk = [overlap_text.strip()]
                    current_tokens = overlap_token_count
                else:
                    # if the previous chunk is too small, don't overlap
                    current_chunk = []
                    current_tokens = 0

                # add the sentence to the current chunk
                current_chunk.append(sentence)
                current_tokens += len(sentence_tokens)
            
        if current_chunk:
            # if there is a final chunk left, add it to the chunks list
            chunks.append({'text': ' '.join(current_chunk), 'page': page})

    # return list of chunks
    return chunks

def get_faiss_index(chunks, embed_model="all-MiniLM-L6-v2"):
    # chunks: list of chunks from chunk_text
    # embed_model: model to use for vector embedding

    # create embedding model
    model = SentenceTransformer(embed_model)
    
    texts = [chunk['text'] for chunk in chunks] # text from each chunk in chunks
    page_numbers =  [chunk['page'] for chunk in chunks] # page from each chunk in chucks
    # use model to embed chunk texts
    embeddings = model.encode(texts)
    dim = embeddings.shape[1] # dimension of embeddings

    # create FAISS index
    faiss_index = faiss.IndexFlatL2(dim)
    # add embeddings to index
    faiss_index.add(np.array(embeddings))

    # return index with embeddings and list of chunks
    return faiss_index, chunks

def retrieve_context(faiss_index, chunks, query, embed_model="all-MiniLM-L6-v2", threshold=0.5,  k=2):
    # faiss_index: FAISS index of embeddings from get_faiss_index
    # chunks: list of chunks from get_faiss_index
    # embed_model: model to use for vector embedding
    # threshold: minimum similarity between query and relevant chunk
    # k: number of chunks to retrieve
    
    # set device to cuda GPU if available
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    # create embedding model
    model = SentenceTransformer(embed_model).to(device)
    # use model to embed query
    query_embedding = model.encode([query])[0].reshape(1, -1)

    # get the distances and ids of nearest k chunks to query
    distances, retrieved_ids = faiss_index.search(query_embedding, k)
    # convert distances to similarities to normalize
    similarities = 1/(1+distances[0])

    retrieved_chunks = [] # list of chunks to return
    # check if each chunk has a similarity above the threshold and is longer than five words
    for i, idx in enumerate(retrieved_ids[0]):
        chunk_text = chunks[idx]
        similarity = similarities[i]
        if similarity>threshold and len(chunk_text['text'].split())>=5:
            # add chunks that meet the criteria
            retrieved_chunks.append((chunk_text, similarity))

    # return list of retrieved chunks
    return retrieved_chunks


def answer_query(query, context, qa_model="google/flan-t5-large"):
    # query: user input question
    # context: retrieved_chunks from retrieve_context
    # qa_model: LLM to use for question answering

    # set device to cuda GPU if available
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    # create tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(qa_model)
    model = T5ForConditionalGeneration.from_pretrained(qa_model).to(device) 

    context_text = [chunk[0]['text'] for chunk in context][0] # text from most relevant chunk in context

    # combine prompt, query, and relevant context
    input_text = f"Answer the question. Do not simply return a word or fragment, but one or more complete and detailed sentences, including all necessary information and all relevant context.: \n \n{query}\n\nContext:\n{context_text}"

    # tokenize input text for LLM 
    inputs = tokenizer(input_text, return_tensors="pt", padding='max_length',max_length=1000, truncation=True).to(device) 
    # set random seed
    torch.manual_seed(42)
    if device == "cuda":
        torch.cuda.manual_seed_all(42)
    # pass inputs to LLM to generate response
    outputs = model.generate(**inputs, max_new_tokens=1500, do_sample=True, 
                           temperature=0.7, top_p=0.9, early_stopping=True, num_beams=3)
    # decode output to get answer in text
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # return text answer
    return answer

In [None]:
filepath = 'ClinicalDermatology.pdf' # filepath of textbook pdf
# read in textbook data and divide into "chunks"
chunks = chunk_text(extract_text_by_column(filepath))

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 57.8 s, sys: 2.09 s, total: 59.9 s
Wall time: 1min 1s


In [5]:
# represent chunks of text as vector embeddings
faiss_index, chunks = get_faiss_index(chunks)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/67 [00:00<?, ?it/s]

CPU times: user 4.52 s, sys: 684 ms, total: 5.2 s
Wall time: 16 s


In [6]:
# manually create test set by reading through textbook and coming up with questions:
test_qa = [
    ("What is dermatology?", "Dermatology is the study of skin and its associated structures, including the hair and the nails and their diseases."),
    ("Name the five Ds of dermatological disease.", "The five Ds are Disfigurement, Disablement, Discomfort, Death and Depression"),
    ("How many layers does skin have?", "Skin has two layers, the outer epidermis and the underlying dermis."),
    ("Explain the difference between a nodule and a tumor.", "A nodule is a solid mass in the skin greater than 0.5cm in diameter. A tumor is less clearly defined, but is typically more than 1cm in diameter. Tumors can also be called 'large nodules'."),
    ("How is a potential fungal infection tested?", "Scales or plucked hairs can be dissolved into an aqueous solution of 20% potassium hydroxide (KOH) containing 40% dimethyl sulphoxide (DMSO). The scale from the edge of a scaling lesion is vigorously scraped on to a glass slide with a No. 15 scalpel blade or the edge of a second glass slide. A drop or two of the KOH solution is run under the cover slip. After 5-10 min the mount is examined under a microscope with the condenser lens lowered to increase contrast."),
    ("What kind of diagnosis is Tanzck smear used in?", "Tanzck smear, or cytology, can aid in diagnosis of viral infections such as herpes simplex and zoster, and of bullous diseases such as pemphigus."),
    ("When should you use incisional vs. excisional biopsies?", "Excisional biopsy is preferable for most small lesions (up to 0.5 cm diameter) but incisional biopsy is chosen when the partial removal of a larger lesion is adequate for diagnosis, and complete removal might leave an unnecessary and unsightly scar."),
    ("How does psoriasis affect the scalp?", "Areas of scaling are interspersed with normal skin; their lumpiness is more easily felt than seen.  Frequently, the psoriasis overflows just beyond the scalp margin. Significant hair loss is rare."),
    ("What causes Lyell's Disease?", "Toxic epidermal necrolysis (Lyell's disease) is usually a drug reaction, most commonly to sulphonamides, barbituates, carbamazepine, or allopurinol, but can also be a manifestation of graft-vs.-host disease."),
    ("What other conditions can systemic lupus erythematosus be confused with?", "SLE is a great imitator. Its malar rash can be confused with sunburn, polymorphic light eruption, and rosacea. The discoid lesions are distinctive, but are also seen in discoid LE and in subacute cutaneous LE. Occasionally they look like psoriasis or lichen planus. The hair fall suggests telogen effluvium."),
    ("Describe the symptoms of Acrocyanosis.", "The hands, feet, nose, ears, and cheeks become blue-red and cold. The palms are often cold and clammy."),
    ("How can Raynaud's disease be treated?", "The main treatment is to protect the vulnerable digits from cold. Warm clothing reduces the need for peripheral vasoconstriction to conserve heat. Smoking should be abandoned. Calcium channel blockers (e.g. nifedipine 10-30 mg three time daily) are the most effective in parients with primary Raynaud's disease."),
    ("How is Doxycycline different from Minocycline?", "It is a cheaper alternative to minocycline, but more frequently associated with phototoxic skin reactions."),
    ("Are Tetracyclines safe for children?", "No, Tetracyclines should not be taken by children under 12 years as they are deposited in growing bone and developing teeth, causing stained teeth and dental hypoplasia."),
    ("Where in the body are sweat glands?", "There are 2-3 million sweat glands distributed all over the body surface, but they are most numerous on the palms, soles, and axillae."),
    ("Describe the phases of the hair cycle.", "There are three phases of follicular activity. 1 Anagen. The active phase of hair production. 2 Catagen. A short phase of conversion from active growth to the resting phase. Growth stops, and the end of the hair becomes club-shaped. 3 Telogen. A resting phase at the end of which the club hair is shed."),
    ("What condition results from a cat bite?", "The infective agent is the baccilus Rochalimaea henselae. A few days after a cat bite or scratch, a reddish granulomatous papule appears at the site of inoculation."),
    ("How is syphilis diagnosed?", "The diagnosis of syphilis in its infectious (primary and secondary) stages can be confirmed using dark field microscopy to show up spirochaetes in smears from chancres, oral lesions, or moist areas in a secondary eruption. Serological tests for syphilis become positive only some 5–6 weeks after infection (usually a week or two after the appearance of the chancre). The traditional tests [Wasswemann reaction (WR) and Venereal Disease Research Laboratory (VDRL)] have now been replaced by more specific ones [e.g. the rapid plasma reagin (RPR) test and the fluorescent treponemal antibody/absorption (FTA/ABS) test]. These more sensitive tests do not become negative after treatment if an infection has been present for more than a few months."),
    ("Is dapsone an effective treatment for leprosy?", "The emergence of resistant strains of M. leprae means that it is no longer wise to treat leprosy with dapsone alone. It should now be used in combination, usually with clofazimine for lepromatous leprosy."),
    ("What causes hair to turn grey early?", "Early greying of the hair is seen in the rare premature aging syndromes, such as Werner's syndrome, and in autoimmune conditions such as pernicious anaemia, thyroid disorders, and Addison's disease."),
    ("Explain dermatological non-disease.", "This is a form of dysmorphophobia. The clinician can find no abnormality, but the distress felt by the patient leads to anxiety, depression, or even sucicide. Such patients are not uncommon. They expect dermatological solutions for complaints such as hair loss, or burning, itching, and redness of the face or genitals. The dermatologist, who can see nothing wrong, cannot solve matters and no treatment seems to help."),
    ("Which drugs can cause toxic reactive erythema?", "Culprits include antibiotics (especially ampicillin), sulphonamides and related compounds (diuretics and hypoglycaemics), barbituates, phenylbutazone, and para-aminosalicylate (PAS)."),
    ("What conditions does cryotherapy treat?", "It is effective for viral warts, seborrhoeic keratoses, actinic keratoses, and some superficial skin tumors (e.g. intraepidermal carcinoma and lentigo maligna."),
]

In [7]:
eval_dicts = [] # list of dictionaries to contain query, truth, context, and answer for test set data
for i in range(len(test_qa)):
    # retreive context for query i
    context = retrieve_context(faiss_index, chunks, test_qa[i][0], k=3, threshold=0.4)
    # add current query, truth, context, and answer to eval_dicts
    eval_dicts.append({
        'query': test_qa[i][0],
        'truth': test_qa[i][1],
        'context':  context,
        'answer': answer_query(test_qa[i][0], context) # generate answer 
        
    })

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# create evaluation metrics:

def context_precision(truth, context): # input the ground truth and the retreived documents
    text = ''.join(context[0][0]['text']) # text of context used
    # create Tfidf vectorizer and fit to both truth and context
    vectorizer = TfidfVectorizer().fit([context[0][0]['text'], truth])
    # vectorize both truth and context
    vectors = vectorizer.transform([context[0][0]['text'], truth])
    # get cosine similarity between truth and context vectors
    sim = cosine_similarity(vectors[0], vectors[1])[0][0]
    return sim

def faithfulness(answer, context): # input the generated answer and the retrieved documents
    text = ''.join(context[0][0]['text']) # text of context used
    # create Tfidf vectorizer and fit to both answer and context
    vectorizer = TfidfVectorizer().fit([answer, context[0][0]['text']])
    # vectorize both answer and context
    vectors = vectorizer.transform([answer, context[0][0]['text']])
    # get cosine similarity between answer and context vectors
    sim = cosine_similarity(vectors[0], vectors[1])[0][0]
    return sim

In [9]:
cp = [] # list of context precisions
f = [] # list of faithfulnesses

for x in eval_dicts:
    # calculate and metrics for each item in eval_dicts and add metrics to lists
    cp.append(context_precision(x['truth'], x['context']))
    f.append(faithfulness(x['answer'], x['context']))
    
# calculate means of each list
print('Context Precision: ', np.mean(cp), 'Faithfulness: ', np.mean(f))

Context Precision:  0.4149936306095072 Faithfulness:  0.5114133579400361


In [10]:
# example query:
query = "what is psoriasis?"
# search for documents relevant to query
context1 = retrieve_context(faiss_index, chunks, query, k=3, threshold=0.5)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 140 ms, sys: 18.1 ms, total: 158 ms
Wall time: 870 ms


In [11]:
# provide relevant documents to question answering LLM to get answer
answer_query(query, context1)

CPU times: user 3.95 s, sys: 395 ms, total: 4.34 s
Wall time: 4.74 s


'It is a chronic non-infectious inflammatory skin disorder, characterized by well-defined erythematous plaques bearing large adherent silvery scales.'