In [1]:
import os
import re

import camelot
import fitz  # PyMuPDF
from langchain.document_loaders import PyPDFDirectoryLoader

def extract_text_from_directory(directory_path):
    """Collect the text of every document LangChain loads from a PDF directory.

    Args:
        directory_path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        list[str]: The ``page_content`` of each loaded document, in the order
        the loader returned them. (LangChain's PyPDF loaders normally emit one
        document per page; confirm for the installed version.)
    """
    page_texts = []
    for loaded_document in PyPDFDirectoryLoader(directory_path).load():
        page_texts.append(loaded_document.page_content)
    return page_texts

def extract_images_from_directory(directory_path):
    """Extract the embedded images of every ``.pdf`` file in a directory.

    Args:
        directory_path: Directory to scan (not recursive). Only names ending
            in ``.pdf`` are opened.

    Returns:
        list[tuple[str, int, bytes]]: One ``(pdf_file, page_num, image_bytes)``
        tuple per image reference found on a page. ``pdf_file`` is the bare
        file name, ``page_num`` is the zero-based page index, and
        ``image_bytes`` is the ``"image"`` entry of PyMuPDF's
        ``extract_image`` result.

    Files are processed in sorted name order so that the position of an image
    in the returned list is reproducible between runs.
    """
    images = []
    for pdf_file in sorted(os.listdir(directory_path)):
        if not pdf_file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory_path, pdf_file)
        # Open the document as a context manager so its file handle is
        # released even if extraction fails part-way through.
        with fitz.open(pdf_path) as pdf_document:
            for page_num in range(len(pdf_document)):
                page = pdf_document.load_page(page_num)
                for img in page.get_images(full=True):
                    xref = img[0]  # first field of the image entry is its xref
                    base_image = pdf_document.extract_image(xref)
                    images.append((pdf_file, page_num, base_image["image"]))
    return images

def extract_tables_from_directory(directory_path):
    """Run Camelot over every ``.pdf`` file in a directory and pool the tables.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        list: Every table object Camelot yields with ``pages='all'``, across
        all PDFs, in directory-listing order.
    """
    collected = []
    for entry in os.listdir(directory_path):
        if not entry.endswith(".pdf"):
            continue
        found = camelot.read_pdf(os.path.join(directory_path, entry), pages='all')
        collected += found
    return collected

def process_directory(directory_path):
    """Run the text, image and table extractors over one directory.

    Returns:
        tuple: ``(text, images, tables)``, produced in that order by
        ``extract_text_from_directory``, ``extract_images_from_directory`` and
        ``extract_tables_from_directory``.
    """
    return (
        extract_text_from_directory(directory_path),
        extract_images_from_directory(directory_path),
        extract_tables_from_directory(directory_path),
    )

# Example usage: run all three extractors over the PDFs under `data/`.
# `text`, `images` and `tables` stay in the notebook namespace and are passed
# to the save_* helpers in a later cell.
directory_path = 'data/'
text, images, tables = process_directory(directory_path)


In [5]:
import json
from PIL import Image
import io
import pandas as pd

def save_text(text, file_path):
    """Serialize ``text`` as JSON and write it to ``file_path``.

    Args:
        text: Any JSON-serializable value (the notebook passes a list of
            page strings).
        file_path: Destination file; overwritten if it exists.

    Raises:
        TypeError: If ``text`` is not JSON-serializable. The destination file
            is left untouched in that case.
    """
    # Serialize before opening: opening with 'w' truncates immediately, so a
    # failed dump would otherwise destroy a previously saved file.
    payload = json.dumps(text)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

def load_text(file_path):
    """Read a JSON file and return the decoded value.

    The file is decoded as UTF-8 (the standard JSON encoding) rather than the
    platform default, so files containing non-ASCII characters load the same
    way on every operating system. ASCII-only files written by ``save_text``
    are unaffected.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_images(images, directory_path):
    """Write extracted images to ``directory_path`` as ``image_<n>.png``.

    Args:
        images: Iterable of ``(pdf_file, page_num, image_bytes)`` tuples. Only
            the bytes are used; ``<n>`` is the tuple's position in the
            iterable, so the source file and page are not recorded on disk.
        directory_path: Output directory, created if it does not exist.
    """
    os.makedirs(directory_path, exist_ok=True)
    for position, (_source_pdf, _page_number, raw_bytes) in enumerate(images):
        decoded = Image.open(io.BytesIO(raw_bytes))
        # CMYK images are converted to RGB before being written as PNG.
        png_ready = decoded.convert('RGB') if decoded.mode == 'CMYK' else decoded
        png_ready.save(os.path.join(directory_path, f'image_{position}.png'))

        
def load_images(directory_path):
    """Read every ``.png`` file in ``directory_path`` and return its raw bytes.

    Files are visited in natural (numeric-aware) file-name order, so
    ``image_2.png`` comes before ``image_10.png`` and the returned list lines
    up with ``image_<n>.png`` numbering instead of following the arbitrary
    order reported by ``os.listdir``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        list[bytes]: The encoded contents of each ``.png`` file.
    """
    def natural_key(file_name):
        # re.split with a capture group alternates text / digit-run pieces,
        # so pieces at odd positions are always digit runs.
        pieces = re.split(r'([0-9]+)', file_name)
        return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]

    images = []
    for file_name in sorted(os.listdir(directory_path), key=natural_key):
        if file_name.endswith('.png'):
            image_path = os.path.join(directory_path, file_name)
            with open(image_path, 'rb') as f:
                images.append(f.read())
    return images

def save_tables(tables, directory_path):
    """Write each table's DataFrame to ``directory_path`` as ``table_<n>.csv``.

    Args:
        tables: Iterable of objects exposing a ``.df`` DataFrame (Camelot
            tables in this notebook). ``<n>`` is the position in the iterable.
        directory_path: Output directory, created if it does not exist.

    The DataFrame index is not written; the header row is.
    """
    os.makedirs(directory_path, exist_ok=True)
    for position, extracted in enumerate(tables):
        destination = os.path.join(directory_path, f'table_{position}.csv')
        extracted.df.to_csv(destination, index=False)

def load_tables(directory_path):
    """Read every ``.csv`` file in ``directory_path`` into a DataFrame.

    Files are visited in natural (numeric-aware) file-name order, so
    ``table_2.csv`` comes before ``table_10.csv`` and the returned list lines
    up with ``table_<n>.csv`` numbering instead of following the arbitrary
    order reported by ``os.listdir``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        list[pandas.DataFrame]: One frame per ``.csv`` file, parsed with
        ``pd.read_csv`` defaults (so the first row becomes the header and
        column types are inferred).
    """
    def natural_key(file_name):
        # re.split with a capture group alternates text / digit-run pieces,
        # so pieces at odd positions are always digit runs.
        pieces = re.split(r'([0-9]+)', file_name)
        return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]

    tables = []
    for file_name in sorted(os.listdir(directory_path), key=natural_key):
        if file_name.endswith('.csv'):
            csv_path = os.path.join(directory_path, file_name)
            df = pd.read_csv(csv_path)
            tables.append(df)
    return tables



# Output locations, relative to the notebook's working directory.
# These names are reused by the load_* calls in the next cell.
text_file_path = 'extracted_text.json'
images_directory_path = 'extracted_images'
tables_directory_path = 'extracted_tables'

# Persist the results of process_directory() from the first cell.
save_text(text, text_file_path)
save_images(images, images_directory_path)
save_tables(tables, tables_directory_path)


In [6]:

# Load data back from the files written by the save_* calls above.
# Note: loaded_images holds the bytes of the saved PNG files only; the
# (pdf_file, page_num) pairing from extraction is not stored on disk.
loaded_text = load_text(text_file_path)
loaded_images = load_images(images_directory_path)
loaded_tables = load_tables(tables_directory_path)


In [None]:
import numpy as np
import faiss
from transformers import AutoModel, AutoTokenizer
import torch

# Load SciBERT model and tokenizer.
# `tokenizer` and `model` are notebook-level globals read directly by
# generate_sciBERT_embeddings() below.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first
# use and caches it afterwards - confirm for offline runs.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def generate_sciBERT_embeddings(texts):
    """Embed texts with SciBERT using attention-mask-aware mean pooling.

    Uses the notebook-level ``tokenizer`` and ``model``. Inputs are padded to
    the longest item in the batch and truncated to 512 tokens.

    Padding positions are excluded from the average. A plain
    ``last_hidden_state.mean(dim=1)`` would average them in, which makes the
    embedding of a short text depend on how long the other texts in the same
    batch are.

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        numpy.ndarray: One row per input text; the row width is the size of
        the last dimension of the model's ``last_hidden_state``.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    token_states = outputs.last_hidden_state
    # attention_mask is 1 for real tokens and 0 for padding; add a trailing
    # axis so it broadcasts across the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / token_counts).numpy()
    return embeddings

# Example texts (placeholder corpus for the indexing demo)
texts = [
    "This is a sample text about human anatomy.",
    "Another text related to biology."
]

# Generate embeddings: one row per text
embeddings = generate_sciBERT_embeddings(texts)

# Initialize FAISS index
dimension = embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity search

# Add embeddings to the index; row i of `embeddings` corresponds to texts[i]
index.add(embeddings)

# Save the FAISS index to a file
faiss.write_index(index, "sciBERT_faiss_index.index")

# Save metadata (if needed). The "id" of each entry is the text's position in
# `texts`, i.e. the row it was added to the index with.
# NOTE(review): `metadata` is a list of dicts, so np.save stores it as a
# pickled object array - that is why the load below needs allow_pickle=True.
metadata = [{"text": text, "id": i} for i, text in enumerate(texts)]
np.save("sciBERT_metadata.npy", metadata)

# To load the FAISS index and metadata later.
# NOTE(review): allow_pickle=True can execute arbitrary code when the file is
# untrusted; only load metadata files this notebook wrote itself (a JSON file
# would avoid pickle entirely).
index = faiss.read_index("sciBERT_faiss_index.index")
metadata = np.load("sciBERT_metadata.npy", allow_pickle=True).tolist()


In [1]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# Set path to the Tesseract executable if it's not in your PATH
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Input PDF and output directory; the output folder is created if missing.
pdf_path = "data/anatomy_vol_3.pdf"
text_output_folder = 'output/text/'
os.makedirs(text_output_folder, exist_ok=True)
# File name without directory or extension; used to name the .txt output.
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

# Convert PDF to images.
# NOTE(review): this rebinds `images` (and `text` below), which the extraction
# cells earlier in the notebook also define with different contents - running
# cells out of order will mix the two up.
images = convert_from_path(pdf_path)

# Extract text from each image using OCR; the results are concatenated with
# a newline after each one.
text = ""
for i, image in enumerate(images):
    text += pytesseract.image_to_string(image) + "\n"  # OCR one image and append its text

# Save text to <text_output_folder>/<pdf_name>.txt as UTF-8
with open(os.path.join(text_output_folder, f"{pdf_name}.txt"), 'w', encoding='utf-8') as text_file:
    text_file.write(text)

# The message shows only the file name; the file itself is written inside
# text_output_folder.
print(f"Text extracted and saved to {pdf_name}.txt")


Text extracted and saved to anatomy_vol_3.txt
