# 1. Install Packages

In [None]:
%%capture
!pip install -q openai
!pip install -q langchain

In [None]:
%%capture
!pip install pypdf
!pip install -U -q PyPDF2
!pip install -q pdfplumber
!pip install poppler-utils
!pip install Spire.PDF

In [None]:
%%capture
!pip install langchain_community[all]
!pip install fitz
!pip install pymupdf
!pip install pillow

# 2. Load Libraries

In [None]:
import numpy as np
import pandas as pd
import base64
from dotenv import load_dotenv
import os
from tqdm import tqdm
from spire.pdf.common import *
from spire.pdf import *
import poppler
import matplotlib.pyplot as plt
from pandas.plotting import table
from IPython.display import Image as IImage
from PIL import Image
from openai import OpenAI
from tensorflow import keras
import textwrap
import pdfplumber
import fitz
from PyPDF2 import PdfFileReader, PdfReader, PdfWriter
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader

from IPython.display import display, HTML

# Set CSS for Jupyter Notebook
display(HTML("<style>.container { width:80% !important; }</style>"))

# 3. Data Extraction

## 3.1. Load Data

In [None]:
# Mount the Drive
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
# Change to the file directory
import os
%cd /content/drive/MyDrive/RAG_Project/RAG_System/

/content/drive/MyDrive/RAG_Project/RAG_System


In [None]:
os.getcwd()

'/content/drive/MyDrive/RAG_Project/RAG_System'

In [None]:
# Every project directory is resolved relative to the current working directory
path = os.getcwd()

# Input data, extracted-image output and index output locations
dataDir = f'{path}/Data/'
imgSaveDir = f'{path}/Extracted_Images/'
indexSaveDir = f'{path}/indexDf'

# Make sure both output directories exist (no-op when they already do)
os.makedirs(f'{path}/Extracted_Images', exist_ok=True)
os.makedirs(f'{path}/indexDf', exist_ok=True)

# Show what is available in the data directory
dataFiles = os.listdir(dataDir)
dataFiles

['A_Clinical_Guide_to_the_Treatment_of_the_Human_Stress_Response.pdf',
 'Acquired_Brain_Injury.pdf',
 'Applied_Bioinformatics.pdf',
 'Bioinformatics_for_Evolutionary_Biologists.pdf',
 'Biomedical_Informatics.pdf',
 'Breast_Cancer.pdf',
 'Cardiovascular_Biomechanics.pdf',
 'Child_Neuropsychology.pdf',
 'Clinical_Neuroanatomy.pdf',
 'Clinical_Methods_in_Medical_Family_Therapy.pdf',
 'ENZYMES_Catalysis_Kinetics_and_Mechanisms.pdf',
 'Epidemiological_Research_Terms_and_Concepts.pdf',
 'Essentials_of_Cerebellum_and_Cerebellar_Disorders.pdf',
 'Foundations_of_Behavioral_Health.pdf',
 'Handbook_of_Cardiac_Anatomy_Physiology_and_Devices.pdf',
 'Human_Chromosomes.pdf',
 'Integrated_Neuroscience.pdf',
 'Metabolism_of_Human_Diseases.pdf',
 'Pharmaceutical_Biotechnology.pdf',
 'Spine_Surgery.pdf']

In [None]:
# Number of pages per document
def count_pdf_pages(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``.

    The document is opened with ``fitz.open`` and explicitly closed before
    returning, so calling this in a loop over many files does not leave one
    open document per call behind.
    """
    doc = fitz.open(pdf_path)
    try:
        return doc.page_count
    finally:
        doc.close()

# Report the page count of every file listed in the data directory
for filename in dataFiles:
    page_count = count_pdf_pages(os.path.join(dataDir, filename))
    print(f"There are {page_count} pages in the file {filename}.")

There are 489 pages in the file A_Clinical_Guide_to_the_Treatment_of_the_Human_Stress_Response.pdf.
There are 306 pages in the file Acquired_Brain_Injury.pdf.
There are 197 pages in the file Applied_Bioinformatics.pdf.
There are 323 pages in the file Bioinformatics_for_Evolutionary_Biologists.pdf.
There are 970 pages in the file Biomedical_Informatics.pdf.
There are 892 pages in the file Breast_Cancer.pdf.
There are 462 pages in the file Cardiovascular_Biomechanics.pdf.
There are 496 pages in the file Child_Neuropsychology.pdf.
There are 704 pages in the file Clinical_Neuroanatomy.pdf.
There are 630 pages in the file Clinical_Methods_in_Medical_Family_Therapy.pdf.
There are 560 pages in the file ENZYMES_Catalysis_Kinetics_and_Mechanisms.pdf.
There are 181 pages in the file Epidemiological_Research_Terms_and_Concepts.pdf.
There are 604 pages in the file Essentials_of_Cerebellum_and_Cerebellar_Disorders.pdf.
There are 398 pages in the file Foundations_of_Behavioral_Health.pdf.
There are 

## 3.2. Data Extraction

In [None]:
os.getcwd()

'/content/drive/MyDrive/RAG_Project/RAG_System'

In [None]:
# Load environment variables from .env file
# (the path is relative, so it is resolved against the current working directory)
load_dotenv('my_secrets.env')

# Access the variable
# os.getenv returns None when OPENAI_API_KEY is not set
openai_key = os.getenv('OPENAI_API_KEY')

# Create OpenAI client
# `client` is a module-level global read by get_embedding() and caption_image()
client = OpenAI(api_key = openai_key)

### 3.2.1 Useful Functions

#### 3.2.1.1 Useful Functions for Extraction

In [None]:
def is_image_large_enough(image, min_size):
    """Tell whether an encoded image meets a minimum pixel size.

    Args:
        image: Raw encoded image bytes; they are wrapped in a ``BytesIO`` and
            opened with PIL to read the dimensions.
        min_size: Indexable pair ``(min_width, min_height)``.

    Returns:
        True when the width is at least ``min_size[0]`` and the height is at
        least ``min_size[1]``; False otherwise.
    """
    from io import BytesIO

    decoded = Image.open(BytesIO(image))
    img_width, img_height = decoded.size
    # Guard clause: a too-narrow image is rejected before the height is checked
    if img_width < min_size[0]:
        return False
    return img_height >= min_size[1]

In [None]:
def extract_text_and_images(pdf_path, img_save_dir, min_image_size = (100, 100)):
    """Extract per-page text from a PDF and save its sufficiently large images.

    Images are written to ``<img_save_dir>/<pdf file name without extension>/``
    as ``image_pg<page>_<index>.png``. Only images accepted by
    ``is_image_large_enough`` are saved and recorded; smaller ones are skipped
    entirely.

    Args:
        pdf_path: Path of the PDF to read.
        img_save_dir: Directory under which the per-document image folder is
            created.
        min_image_size: ``(min_width, min_height)`` forwarded to
            ``is_image_large_enough``.

    Returns:
        A list with one dict per page, holding ``document_name``,
        ``page_number`` (1-based), ``text`` and ``images`` (a list of
        ``{'image_name', 'page_number'}`` dicts for the images saved from
        that page).
    """
    doc = fitz.open(pdf_path)
    try:
        pdf_name = os.path.basename(pdf_path)
        # Create a directory with the same name as the PDF file (extension removed)
        save_path = os.path.join(img_save_dir, os.path.splitext(pdf_name)[0])
        os.makedirs(save_path, exist_ok=True)

        extracted_data = []
        for page_num in range(doc.page_count):
            page = doc[page_num]
            page_number = page_num + 1

            page_data = {
                'document_name': pdf_name,
                'page_number': page_number,
                'text': page.get_text("text"),  # Extract text from the page
                'images': []
            }

            # Extract and save images with metadata
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                image_bytes = doc.extract_image(xref)["image"]

                # Skip small images completely. Previously the save/append ran
                # even when the size check failed, which either raised
                # NameError or re-saved the previous image under a stale name.
                if not is_image_large_enough(image_bytes, min_image_size):
                    continue

                # NOTE: the name always ends in ".png" while the bytes are
                # written as extracted; kept as-is so existing file names and
                # downstream name parsing stay unchanged.
                img_name = f"image_pg{page_number}_{img_index}.png"

                # Save image locally
                with open(os.path.join(save_path, img_name), "wb") as img_file:
                    img_file.write(image_bytes)

                page_data['images'].append({
                    'image_name': img_name,
                    'page_number': page_number
                })

            extracted_data.append(page_data)
    finally:
        # Release the document even if extraction fails part-way through
        doc.close()

    return extracted_data

#### 3.2.1.2. Useful Functions for Processing

In [None]:
def chunk_text_with_recursive_splitter(texts, chunk_size=500, overlap_size=100):
    """Split a document's page texts into chunks that can run across page breaks.

    Each page is split with ``RecursiveCharacterTextSplitter``. The last chunk
    of a page is held back and prepended to the next page's text so that
    content continuing over a page break ends up in one chunk. A held-back
    chunk is emitted immediately (instead of carried) when it is the last page
    or when its length equals ``chunk_size``.

    Fixes compared with the previous version:
      * an empty ``texts`` list returns ``[]`` instead of raising IndexError;
      * the held-back last chunk is no longer also emitted on the same page,
        which used to index the same text twice;
      * an empty trailing chunk (last page without text) is no longer emitted.

    Args:
        texts: List of page dicts with ``document_name``, ``page_number`` and
            ``text`` keys. The document name is taken from the first entry.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap_size: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A list of dicts with ``document_name``, ``pages`` (``"Page N"`` or
        ``"Page N-1-N"`` for chunks that start with carried-over text) and
        ``chunk``.
    """
    # Nothing to chunk, and no first page to read the document name from
    if not texts:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap_size,
        separators=["\n\n", "\n", " ", ""]
    )

    file_chunks = []
    last_chunk = ""  # Held-back tail of the previous page, if any
    document_name = texts[0]['document_name']

    for i, page in enumerate(texts):
        page_number = page['page_number']
        page_text = page['text']

        # Is there a tail from the previous page to merge in front of this page?
        carried = bool(last_chunk)
        # A carried tail longer than (chunk_size - overlap_size) is treated as
        # also reaching into the second chunk of the merged text
        long_carry = carried and len(last_chunk) > chunk_size - overlap_size
        combined_text = last_chunk + page_text if carried else page_text

        # Split the combined text into chunks
        page_chunks = text_splitter.split_text(combined_text)

        def pages_label(position):
            """Label for the chunk at ``position`` within ``page_chunks``."""
            if carried and (position == 0 or (position == 1 and long_carry)):
                return f"Page {page_number-1}-{page_number}"
            return f"Page {page_number}"

        # Emit every chunk except the last one, which may continue on the next page
        for position, chunk in enumerate(page_chunks[:-1]):
            file_chunks.append({
                'document_name': document_name,
                'pages': pages_label(position),
                'chunk': chunk
            })

        # Hold the last chunk back so it can be merged with the next page
        last_chunk = page_chunks[-1] if page_chunks else ""

        # Flush on the last page, or when the tail already has exactly chunk_size characters
        if i == len(texts) - 1 or len(last_chunk) == chunk_size:
            if last_chunk:  # never emit an empty chunk
                file_chunks.append({
                    'document_name': document_name,
                    'pages': pages_label(len(page_chunks) - 1),
                    'chunk': last_chunk
                })
            last_chunk = ""

    return file_chunks

In [None]:
def generate_ids(number, size):
    """Generate ``number`` distinct random ids of ``size`` ASCII letters each.

    The previous implementation tried to retry on a collision with ``i -= 1``
    inside a ``for ... in range(...)`` loop, which has no effect in Python: the
    colliding id was dropped and the function returned fewer ids than
    requested. Callers index the result by position, so a short list caused an
    IndexError. This version keeps drawing until enough unique ids exist.

    Args:
        number: How many ids to produce. Zero or a negative value yields ``[]``.
        size: Number of characters per id.

    Returns:
        A list of ``number`` unique strings, in generation order.

    Raises:
        ValueError: If ``number`` exceeds the count of distinct ids that can be
            formed with ``size`` letters (the loop could never finish).
    """
    import string, random

    alphabet = string.ascii_letters
    if number > len(alphabet) ** size:
        raise ValueError(
            f"Cannot generate {number} unique ids of length {size} "
            f"from {len(alphabet)} characters"
        )

    ids = []
    seen = set()  # O(1) membership test instead of rebuilding set(ids) each time
    while len(ids) < number:
        candidate = ''.join(random.choices(alphabet, k=size))
        if candidate in seen:
            continue  # collision: draw again
        seen.add(candidate)
        ids.append(candidate)

    return ids

In [None]:
def get_embedding(text, model = "text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced by single spaces, the result is sent to
    the module-level ``client``'s embeddings endpoint with the given ``model``,
    and the embedding of the first returned item is handed back.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=single_line, model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
def load_chunks(doc_chunks, df):
    """Embed every chunk and append one row per chunk to ``df`` in place.

    Args:
        doc_chunks: Sequence of dicts with ``document_name``, ``pages`` and
            ``chunk`` keys.
        df: DataFrame with three columns (id, embedding, metadata); rows are
            added at labels ``len(df)``, ``len(df) + 1``, ... as measured
            before the first insertion.

    Returns:
        None. ``df`` is modified in place.
    """
    chunk_ids = generate_ids(len(doc_chunks), 10)
    first_label = len(df)
    for offset, chunk in enumerate(doc_chunks):
        chunk_id = chunk_ids[offset]
        embedding = get_embedding(chunk['chunk'])
        metadata = {'document_name':chunk['document_name'], 'pages': chunk['pages'], 'text': chunk['chunk']}
        df.loc[first_label + offset] = [chunk_id, embedding, metadata]


In [None]:
def caption_image(image_path):
    """Ask the vision chat model for a caption of the image at ``image_path``.

    The file is read, base64-encoded and embedded in a ``data:image/jpeg``
    URL that is sent together with a system prompt and a user instruction to
    the module-level ``client``. The reply is limited to 300 tokens.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.
    """
    # Read and encode the image
    with open(image_path, "rb") as image:
        base64_encoded_data = base64.b64encode(image.read()).decode('utf-8')

    system_message = {
        "role": "system",
        "content": """You are an agent specialized in captioning images related to the healthcare.
                          You will be provided images and your goal is to make a comprehensive caption of them.
                          Pay specific attention to images related to humain body, and make sure to report anomalies
                          and defects when applicable. Remember, the caption you produce is going to be used for search,
                          so accurate description is of paramount importance.

                          Note: Start the description right away without any introduction. So you would carefully avoid
                          introductory sentences like 'This image shows', or 'This image is', or 'On this image,..', ...
                       """
    }

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{base64_encoded_data}"
        }
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Give a comprehensive caption a the image using no more than 280 words."},
            image_part,
        ],
    }

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[system_message, user_message],
        max_tokens=300,
    )
    first_choice = response.choices[0]
    return first_choice.message.content.strip()

### 3.2.2. Extract and Process Text



In [None]:
def extract_data(data_dir, img_save_dir):
    """Extract and chunk the text of every PDF found in ``data_dir``.

    Entries whose name does not end in ``.pdf`` (case-insensitive) are
    skipped, matching the filter used by the indexing loops in this notebook.
    Without it, stray entries such as checkpoint folders or hidden files were
    passed to the PDF extractor.

    Args:
        data_dir: Directory containing the PDF files.
        img_save_dir: Directory forwarded to ``extract_text_and_images`` for
            saving extracted images.

    Returns:
        A list with one dict per PDF, holding ``document_name`` (the file
        name) and ``document_content`` (the list of chunks produced by
        ``chunk_text_with_recursive_splitter``).
    """
    pdf_files = [name for name in os.listdir(data_dir) if name.lower().endswith('.pdf')]

    data = []
    for file in tqdm(pdf_files):
        pdf_path = os.path.join(data_dir, file)
        file_content = chunk_text_with_recursive_splitter(extract_text_and_images(pdf_path, img_save_dir))
        data.append({
            'document_name': file,
            'document_content': file_content,
        })
    return data

In [None]:
# Extract the text and images, chunk text and save images
full_data = extract_data(dataDir, img_save_dir=imgSaveDir)

In [None]:
# Create an empty index DataFrame with the three columns used by every later append
index_df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])

# Write it as a CSV so the file starts with just the header row.
# NOTE: this call uses the default write mode, so it replaces any existing index_df.csv.
index_df.to_csv(f'{indexSaveDir}/index_df.csv', index=False)

In [None]:
# Embed the text chunks of every PDF in the data directory and append them to the index CSV
for file in tqdm(os.listdir(dataDir)):
    # Anything that is not a PDF is ignored
    if not file.endswith('.pdf'):
        continue

    # Fresh per-document frame so only this document's rows are appended
    temp_df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])
    pdf_path = os.path.join(dataDir, file)
    file_content = chunk_text_with_recursive_splitter(
        extract_text_and_images(pdf_path, img_save_dir=imgSaveDir)
    )
    load_chunks(file_content, temp_df)

    # Append without a header: the CSV already has its header row
    temp_df.to_csv(f'{indexSaveDir}/index_df.csv', mode = 'a', header = False, index=False)

In [None]:
index_df = pd.read_csv(f'{indexSaveDir}/index_df.csv')
index_df.shape

(84840, 3)

In [None]:
index_df.iloc[68984]['metadata']

"{'document_name': 'Human_Chromosomes.pdf', 'pages': 'Page 508', 'text': 'for identifying origins of replication , 34\\nZ-DNA, banding produced by antibodies\\nto, 87-88\\nZygotene, 127-129\\nsynaptonemal complex role during,\\n142-145\\nIndex\\n501'}"

In [None]:
remainingFiles = ['Integrated_Neuroscience.pdf',
          'Metabolism_of_Human_Diseases.pdf',
          'Pharmaceutical_Biotechnology.pdf',
          'Spine_Surgery.pdf']

In [None]:
import json
import pandas as pd
import ast

# Convert the 'metadata' column from string to dictionary safely, once.
# ast.literal_eval can handle single quotes for keys and values.
# (This used to be repeated inside the loop below; values that are already
# dicts are left untouched, so doing it a single time gives the same result.)
index_df['metadata'] = index_df['metadata'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Print each "remaining" file that already has at least one record in the index
for filename in remainingFiles:
    filtered_df = index_df[index_df['metadata'].apply(lambda x: x.get('document_name') == filename)]

    if len(filtered_df) > 0:
        print(filename)

Integrated_Neuroscience.pdf
Metabolism_of_Human_Diseases.pdf
Pharmaceutical_Biotechnology.pdf


In [None]:
type(index_df.iloc[0]['metadata'])

dict

In [None]:
# Embed the text chunks of the files listed in `remainingFiles` and append them to the index CSV
for file in tqdm(remainingFiles):
    # Anything that is not a PDF is ignored
    if not file.endswith('.pdf'):
        continue

    # Fresh per-document frame so only this document's rows are appended
    temp_df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])
    pdf_path = os.path.join(dataDir, file)
    file_content = chunk_text_with_recursive_splitter(
        extract_text_and_images(pdf_path, img_save_dir=imgSaveDir)
    )
    load_chunks(file_content, temp_df)

    # Append without a header: the CSV already has its header row
    temp_df.to_csv(f'{indexSaveDir}/index_df.csv', mode = 'a', header = False, index=False)

In [None]:
# Embed the text chunks of the one listed document and append them to the index CSV
for file in tqdm(['Spine_Surgery.pdf']):
    # Anything that is not a PDF is ignored
    if not file.endswith('.pdf'):
        continue

    # Fresh per-document frame so only this document's rows are appended
    temp_df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])
    pdf_path = os.path.join(dataDir, file)
    file_content = chunk_text_with_recursive_splitter(
        extract_text_and_images(pdf_path, img_save_dir=imgSaveDir)
    )
    load_chunks(file_content, temp_df)

    # Append without a header: the CSV already has its header row
    temp_df.to_csv(f'{indexSaveDir}/index_df.csv', mode = 'a', header = False, index=False)

100%|██████████| 1/1 [32:29<00:00, 1949.77s/it]


In [None]:
len(file_content)

4070

### 3.2.3. Process the Images

In [None]:
from PIL import Image
import io
# This is for avoiding irrelevant images being processed
def is_Relevant_Image(image_path, min_size):
    """Check whether the image at ``image_path`` meets a minimum width and height.

    The image is opened in a ``with`` block so the underlying file is closed
    as soon as its size has been read; this function is called once per
    extracted image, and the previous version never closed what it opened.

    Args:
        image_path: Path of the image file to inspect.
        min_size: Indexable pair ``(min_width, min_height)``.

    Returns:
        True when both dimensions are at least the given minimums. False when
        the image is too small or when opening/reading it raises; in that case
        the error is printed.
    """
    try:
        with Image.open(image_path) as image:
            width, height = image.size
        return width >= min_size[0] and height >= min_size[1]
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as irrelevant
        print(f"Error processing image {image_path}: {e}")
        return False

In [None]:
# The function reads the images, send them to a vision model for captioning,
# embed the caption and introduce the record to the index dataframe
def process_images(img_save_dir):
    """Caption and embed the images of every sub-directory, appending them to the index CSV.

    For each sub-directory of ``img_save_dir``, every image accepted by
    ``is_Relevant_Image`` (minimum 100x100) is captioned with
    ``caption_image``, the caption is embedded with ``get_embedding``, and one
    row per image is collected. The rows of a sub-directory are appended
    (without header) to ``index_df.csv`` under the module-level
    ``indexSaveDir``.

    Entries of ``img_save_dir`` that are not directories are skipped before
    they are listed; previously the listing happened first and raised for a
    stray file.

    Args:
        img_save_dir: Directory that contains one image folder per document.

    Returns:
        None. Progress and save errors are printed.
    """
    for subdir in tqdm(os.listdir(img_save_dir)):
        img_dir = os.path.join(img_save_dir, subdir)
        # Guard first: os.listdir() on a regular file raises NotADirectoryError
        if not os.path.isdir(img_dir):
            continue

        # List once so ids and the loop below agree on the same entries
        image_names = os.listdir(img_dir)
        ids = generate_ids(len(image_names), 10)
        df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])

        for i, img in enumerate(image_names):
            img_path = os.path.join(img_dir, img)
            if not is_Relevant_Image(img_path, (100, 100)):
                continue
            img_caption = caption_image(img_path)
            img_embedding = get_embedding(img_caption)
            # The page is parsed from the file name: the second '_'-separated
            # field with its first two characters removed
            df.loc[len(df)] = [ids[i], img_embedding, {'image_name': img, 'image_caption': img_caption, 'document_name': subdir, 'page': f"Page {img.split('_')[1][2:]}"}]
        try:
            df.to_csv(f'{indexSaveDir}/index_df.csv', mode = 'a', header = False, index=False)
            print(f"{len(df)} images captioned and saved from file {subdir}")
        except Exception as e:
            print(f"Error saving images from file {subdir}: {e}")

In [None]:
#Process images and load in index file
process_images(imgSaveDir)

  5%|▌         | 1/20 [02:31<47:52, 151.20s/it]

45 images captioned and saved from file A_Clinical_Guide_to_the_Treatment_of_the_Human_Stress_Response


 10%|█         | 2/20 [03:02<24:10, 80.59s/it] 

54 images captioned and saved from file Acquired_Brain_Injury


 15%|█▌        | 3/20 [06:07<36:23, 128.45s/it]

103 images captioned and saved from file Applied_Bioinformatics


 20%|██        | 4/20 [06:30<23:08, 86.75s/it] 

110 images captioned and saved from file Bioinformatics_for_Evolutionary_Biologists


 25%|██▌       | 5/20 [20:29<1:29:28, 357.93s/it]

328 images captioned and saved from file Biomedical_Informatics


 30%|███       | 6/20 [44:50<2:51:01, 732.96s/it]

789 images captioned and saved from file Breast_Cancer


 35%|███▌      | 7/20 [51:27<2:15:02, 623.23s/it]

908 images captioned and saved from file Cardiovascular_Biomechanics


 40%|████      | 8/20 [54:21<1:36:01, 480.09s/it]

932 images captioned and saved from file Child_Neuropsychology


 45%|████▌     | 9/20 [1:14:06<2:08:25, 700.46s/it]

1215 images captioned and saved from file Clinical_Neuroanatomy


 50%|█████     | 10/20 [1:14:58<1:23:24, 500.40s/it]

1231 images captioned and saved from file Clinical_Methods_in_Medical_Family_Therapy


 55%|█████▌    | 11/20 [1:16:38<56:41, 377.95s/it]  

1252 images captioned and saved from file ENZYMES_Catalysis_Kinetics_and_Mechanisms


 60%|██████    | 12/20 [1:16:45<35:20, 265.03s/it]

1253 images captioned and saved from file Epidemiological_Research_Terms_and_Concepts


 65%|██████▌   | 13/20 [1:25:03<39:07, 335.42s/it]

1370 images captioned and saved from file Essentials_of_Cerebellum_and_Cerebellar_Disorders


 70%|███████   | 14/20 [1:25:39<24:30, 245.04s/it]

1379 images captioned and saved from file Foundations_of_Behavioral_Health


 75%|███████▌  | 15/20 [2:03:40<1:11:34, 858.91s/it]

1961 images captioned and saved from file Handbook_of_Cardiac_Anatomy_Physiology_and_Devices


 80%|████████  | 16/20 [3:07:47<1:57:13, 1758.35s/it]

3057 images captioned and saved from file Human_Chromosomes


 80%|████████  | 16/20 [3:29:12<52:18, 784.56s/it]   


KeyboardInterrupt: 

In [None]:
# The above function ran and the runtime stopped, so I interrupted it. However, images from 16 docs were already processed.
# Processing the remaining ones.
for subdir in tqdm(['Integrated_Neuroscience','Metabolism_of_Human_Diseases','Pharmaceutical_Biotechnology','Spine_Surgery']):
    img_dir = os.path.join(imgSaveDir, subdir)
    # Guard first: os.listdir() raises if the folder is missing or is not a directory
    if not os.path.isdir(img_dir):
        continue

    # List once so ids and the loop below agree on the same entries
    image_names = os.listdir(img_dir)
    ids = generate_ids(len(image_names), 10)
    df = pd.DataFrame(columns= ['ids','embeddings', 'metadata'])

    for i, img in enumerate(image_names):
        img_path = os.path.join(img_dir, img)
        if is_Relevant_Image(img_path, (100, 100)):
            img_caption = caption_image(img_path)
            img_embedding = get_embedding(img_caption)
            df.loc[len(df)] = [ids[i], img_embedding, {'image_name': img, 'image_caption': img_caption, 'document_name': subdir, 'page': f"Page {img.split('_')[1][2:]}"}]
    try:
        # Append without a header: the CSV already has its header row
        df.to_csv(f'{indexSaveDir}/index_df.csv', mode = 'a', header = False, index=False)
        print(f"{len(df)} images captioned and saved from file {subdir}")
    except Exception as e:
        print(f"Error saving images from file {subdir}: {e}")

 25%|██▌       | 1/4 [1:52:43<5:38:09, 6763.29s/it]

1259 images captioned and saved from file Integrated_Neuroscience


 50%|█████     | 2/4 [1:55:28<1:36:04, 2882.11s/it]

48 images captioned and saved from file Metabolism_of_Human_Diseases


 75%|███████▌  | 3/4 [2:02:16<29:12, 1752.47s/it]  

99 images captioned and saved from file Pharmaceutical_Biotechnology


100%|██████████| 4/4 [2:50:17<00:00, 2554.39s/it]

698 images captioned and saved from file Spine_Surgery





In [None]:
import ast

# Reload the index that the previous cells appended to
index_df = pd.read_csv(f'{indexSaveDir}/index_df.csv')
print(f'Data shape before cleaning duplicates {index_df.shape}')

# Turn the 'metadata' strings back into dictionaries; values that are not
# strings are passed through unchanged. ast.literal_eval accepts the
# single-quoted keys and values found in the stored text.
index_df['metadata'] = index_df['metadata'].map(
    lambda meta: ast.literal_eval(meta) if isinstance(meta, str) else meta
)

# The runtime stopped and some records were written twice on the rerun:
# keep only the first row for each distinct 'embeddings' value
index_df = index_df.drop_duplicates(subset='embeddings')
print(f'Data shape after cleaning duplicates {index_df.shape}')

Data shape before cleaning duplicates (107001, 3)
Data shape after cleaning duplicates (94071, 3)


In [None]:
def update_metadata(row):
    """Return the row's metadata as a string, with ``image_caption`` renamed to ``text``.

    ``row['metadata']`` is used directly when it is a dict; otherwise it is
    parsed with ``ast.literal_eval``. If parsing raises ``SyntaxError`` or
    ``ValueError`` the raw value is wrapped as ``{'text': <raw value>}``.
    When the metadata has an ``image_caption`` key, that entry is moved to the
    ``text`` key (a dict passed in through the row is modified in place).

    Returns:
        ``str()`` of the resulting metadata.
    """
    raw = row['metadata']

    if isinstance(raw, dict):
        # Already a dictionary: use it as it is
        metadata = raw
    else:
        try:
            metadata = ast.literal_eval(raw)
        except (SyntaxError, ValueError):
            # Not a Python literal: treat the value itself as the text
            metadata = {'text': raw}

    if 'image_caption' in metadata:
        caption = metadata.pop('image_caption')
        metadata['text'] = caption

    return str(metadata)
# Apply update_metadata to every row (axis=1); the column then holds strings again
index_df['metadata'] = index_df.apply(update_metadata, axis=1)

In [None]:
# Last save of index data
# Default write mode: this replaces index_df.csv with the cleaned frame, header included
index_df.to_csv(f'{indexSaveDir}/index_df.csv', index=False)