In [7]:
import os
from pathlib import Path
import PyPDF2
from langchain_core.documents import Document

def process_pdfs_in_directory(directory_path, metadata=None):
    """
    Process all PDFs in a directory and convert them to LangChain Document objects.

    Each PDF becomes exactly one Document whose ``page_content`` is the text of
    all of its pages concatenated in page order. Files are processed in sorted
    path order so the returned list is reproducible across runs. A file that
    cannot be opened or parsed is reported on stdout and skipped; it does not
    abort the remaining files.

    Args:
        directory_path (str): Path to directory containing PDFs. Only files
            directly inside it that match ``*.pdf`` are read (no recursion).
        metadata (dict, optional): Additional metadata to include with each
            document. On key collision its values replace the default
            'source' / 'filename' entries. The dict itself is not modified.

    Returns:
        list: List of Document objects, one per successfully processed PDF.
    """
    documents = []

    # Sort so the output order does not depend on filesystem enumeration order.
    pdf_files = sorted(Path(directory_path).glob("*.pdf"))

    for pdf_path in pdf_files:
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                # A page may yield no text (e.g. presumably image-only pages);
                # coerce a missing value to "" so one such page does not raise
                # and cause the whole file to be dropped by the handler below.
                text = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )

            # Default metadata first, then caller-supplied entries on top.
            doc_metadata = {
                'source': str(pdf_path),
                'filename': pdf_path.name
            }
            if metadata:
                doc_metadata.update(metadata)

            documents.append(
                Document(
                    page_content=text,
                    metadata=doc_metadata
                )
            )

        except Exception as e:
            # Best-effort batch load: report the failing file and continue.
            print(f"Error processing {pdf_path}: {e}")

    return documents

# Load every PDF found directly under rag_docs/ (relative to the working directory) into `documents`.
documents = process_pdfs_in_directory("rag_docs/")


In [8]:
# Show the loaded Document list as this cell's output.
documents

[Document(metadata={'source': 'rag_docs/Depression - Google Docs.pdf', 'filename': 'Depression - Google Docs.pdf'}, page_content=' ### **DSM-5-TR Diagnostic Criteria for a Major Depressive Episode** \n #### **Criterion A:** \n Five (or more) of the following symptoms must be present during the same **two-week period**, \n representing a change from previous functioning. **At least one** must be either: \n 1. **Depressed mood** \n - Most of the day, nearly every day, as indicated by: \n - Subjective report (e.g., feels sad, empty, hopeless) \n - Observation by others (e.g., appears tearful) \n - *Note:* In children/adolescents, mood may be **irritable**. \n 2. **Markedly diminished interest or pleasure** in all (or almost all) activities, most of the day, \n nearly every day (by self-report or observation). \n **Additional symptoms (must total ≥5):** \n 3. **Significant weight loss** (without dieting) **or weight gain** (>5% body weight in a month), \n or **appetite changes** nearly eve

In [4]:
# Ad-hoc reader for a single PDF; `pdf_reader` is not used by the other cells visible here.
# NOTE(review): this cell's execution count (In [4]) is out of sequence with its neighbours — confirm it is still needed.
pdf_reader = PyPDF2.PdfReader('rag_docs/Depression - Google Docs.pdf')

In [9]:
# Alias only: no splitting is applied here, so `combined_chunks` is the same list object as `documents`.
combined_chunks = documents

In [10]:
from langchain_chroma import Chroma
from langchain_voyageai import VoyageAIEmbeddings
import dotenv
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

# cell to load database - VERY EXPENSIVE
# Embedding goes through the VoyageAI API, so documents are embedded only when
# the persisted collection is still empty. Re-running this cell then reopens
# the existing index instead of re-embedding and appending the same documents
# to ./chroma_db a second time.

dotenv.load_dotenv()
new_client = chromadb.PersistentClient(path = "./chroma_db", tenant = DEFAULT_TENANT, database = DEFAULT_DATABASE, settings = Settings())

embeddings = VoyageAIEmbeddings(
    voyage_api_key=dotenv.get_key(dotenv_path= ".env", key_to_get = "VOYAGEAI_KEY") , model="voyage-large-2-instruct")

# Open (or create) the persisted collection; nothing is embedded at this point.
vectorstore = Chroma(client=new_client, collection_name="umich_fa2024", embedding_function=embeddings)

# Populate on first run only. To rebuild after the PDFs change, delete the
# collection (or the ./chroma_db directory) and run this cell again.
if combined_chunks and not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(combined_chunks)

  from .autonotebook import tqdm as notebook_tqdm
