In [2]:
import chromadb
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Initialize ChromaDB
client = chromadb.PersistentClient(path="./research_db")
collection = client.get_or_create_collection(
    name="ml_publications",
    metadata={"hnsw:space": "cosine"}
)

# Set up our embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
    )


In [3]:
import os
import json
from datetime import datetime
from langchain_community.document_loaders import PyPDFLoader


def load_pdf_for_json_splitter(documents_path, output_json_path = None):
    """Load PDFs in a format suitable for RecursiveJsonSplitter"""
    
    # Create a dictionary structure instead of list
    documents_dict = {
        "documents": {},
        "metadata": {
            "load_timestamp": datetime.now().isoformat(),
            "total_files": 0
        }
    }
    
    for root, dirs, files in os.walk(documents_path):
        for file in files:
            if file.lower().endswith(".pdf"):
                file_path = os.path.join(root, file)
                try:
                    loader = PyPDFLoader(file_path)
                    loaded_docs = loader.load()
                    
                    # Combine all pages
                    full_content = "\n\n".join([doc.page_content for doc in loaded_docs if doc.page_content.strip()])
                    
                    # Use filename as key
                    doc_key = file.replace('.pdf', '').replace(' ', '_')
                    documents_dict["documents"][doc_key] = {
                        "title": file.replace('.pdf', ''),
                        "filename": file,
                        "file_path": file_path,
                        "content": full_content,
                        "page_count": len(loaded_docs),
                        "metadata": {
                            "file_size": os.path.getsize(file_path),
                            "file_modified": datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat()
                        }
                    }
                    
                    documents_dict["metadata"]["total_files"] += 1
                    print(f"✅ Successfully loaded: {file}")
                    
                except Exception as e:
                    print(f"❌ Error loading {file}: {str(e)}")
    print(f"\n📂 Total PDF files processed: {len(documents_dict['documents'])}")
    
    if output_json_path:
        try:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(documents_dict, f, indent=2, ensure_ascii=False)
            print(f"💾 JSON saved to: {output_json_path}")
        except Exception as e:
            print(f"❌ Error saving JSON: {str(e)}")
    
    return documents_dict


In [4]:
from langchain_text_splitters import RecursiveJsonSplitter

# Usage
publication = load_pdf_for_json_splitter("data/data2", "data/data3")

# Now split with RecursiveJsonSplitter
# json_splitter = RecursiveJsonSplitter(max_chunk_size=1000)
# json_chunks = json_splitter.split_json(documents_dict)

✅ Successfully loaded: Bio 101 (compiled by Counselor).pdf
✅ Successfully loaded: BIO 101. INTRODUCTION TO GENETICS^.pdf
✅ Successfully loaded: BIO 101. INTRODUCTION TO GENETICS^-1.pdf
✅ Successfully loaded: BIO101 (ecology).pdf
✅ Successfully loaded: BIO 102 REVISION TEST QUESTIONS.pdf

📂 Total PDF files processed: 5
💾 JSON saved to: data/data3


5

In [5]:
def chunk_research_paper(paper_content):
    """Break a research paper into searchable chunks"""
    
    json_splitter = RecursiveJsonSplitter(
        max_chunk_size=1000,          # ~200 words per chunk
        # max_chunk_overlap=200,        # Overlap to preserve context
        # separators=[
        #     "object",   # split large objects {...}
        #     "array",    # split large arrays [...]
        #     "item",     # split array items
        #     "value"     # fall back to splitting values
        # ]
    )

    json_chunks = json_splitter.split_json(paper_content)
    

    return json_chunks

In [22]:
json_chunks = chunk_research_paper(publication)
print(len(json_chunks))
json_chunks

5


[{'documents': {'Bio_101_(compiled_by_Counselor)': {'title': 'Bio 101 (compiled by Counselor)',
    'filename': 'Bio 101 (compiled by Counselor).pdf',
    'file_path': 'data/data2/BIO 101/Bio 101 (compiled by Counselor).pdf',
    'content': '',
    'page_count': 46,
    'metadata': {'file_size': 22369639,
     'file_modified': '2025-09-23T17:53:05.989221'}},
   'BIO_101._INTRODUCTION_TO_GENETICS^': {'title': 'BIO 101. INTRODUCTION TO GENETICS^',
    'filename': 'BIO 101. INTRODUCTION TO GENETICS^.pdf',
    'file_path': 'data/data2/BIO 101/BIO 101. INTRODUCTION TO GENETICS^.pdf',
    'content': "BIO 101. \nINTRODUCTION \nTO GENETICS.\n\nAn overview of \nthe outline\n• Definition of Genetics\n• The pioneering scientists of heredity \n• Gregor MENDEL, the father of \nmodern genetics\n• Some basic definitions \n• Mendel’s laws of Inheritance \n• Monohybrid inheritance \n• Dihybrid inheritance\n• Chromosomes\n• Prokaryotic and Eukaryotic \nchromosomes \n• DNA structure \n• Structural organi

In [6]:

import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

def embed_documents(documents: list[str]) -> list[list[float]]:
    """
    Embed documents using a model.
    """
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )

    embeddings = model.embed_documents(documents)
    return embeddings

In [7]:

def insert_publications(collection: chromadb.Collection, publications: list[str]):
    """
    Insert documents into a ChromaDB collection.

    Args:
        collection (chromadb.Collection): The collection to insert documents into
        publications (list[str]): The documents to insert

    Returns:
        None
    """
    next_id = collection.count()

    for publication in publications["documents"]:
        chunked_publication = chunk_research_paper()

        # Step 2: extract only the text content for embeddings
        chunk_texts = [publication_cont_v["contents"] for publication_cont_key, publication_cont_v  in publication.items()]
        
        embeddings = embed_documents(chunk_texts)
        ids = list(range(next_id, next_id + len(chunked_publication)))
        ids = [f"document_{id}" for id in ids]
        collection.add(
            embeddings=embeddings,
            ids=ids,
            documents=chunk_texts,
            metadatas= chunked_publication
        )
        next_id += len(chunked_publication)

In [None]:
my_db = insert_publications(collection, publication)

In [None]:

all_items = collection.get(
    include=['documents', 'metadatas', 'embeddings'] # Specify what to include
)
print(all_items)
type(my_db)

In [8]:

def search_research_db(query, collection, embeddings, top_k=5):
    """Find the most relevant research chunks for a query"""
    
    # Convert question to vector
    query_vector = embeddings.embed_query(query)
    
    # Search for similar content
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )
    
    # Format results
    relevant_chunks = []
    for i, doc in enumerate(results["documents"][0]):
        relevant_chunks.append({
            "content": doc,
            "title": results["metadatas"][0][i]["title"],
            "similarity": 1 - results["distances"][0][i]  # Convert distance to similarity
        })
    
    return relevant_chunks

In [9]:

from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
import os

def answer_research_question(query, collection, embeddings, llm):
    """Generate an answer based on retrieved research"""
    
    # Get relevant research chunks
    relevant_chunks = search_research_db(query, collection, embeddings, top_k=3)
    
    # Build context from research
    context = "\n\n".join([
        f"From {chunk['title']}:\n{chunk['content']}" 
        for chunk in relevant_chunks
    ])
    
    # Create research-focused prompt
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""
Based on the following research findings, answer the students's question:

Research Context:
{context}

Researcher's Question: {question}

Answer: Provide a comprehensive answer based on the findings above.
"""
    )
    
    # Generate answer
    prompt = prompt_template.format(context=context, question=query)
    response = llm.invoke(prompt)
    
    return response.content, relevant_chunks


In [10]:

# Initialize LLM and get answer

llm = ChatGroq(
        model="llama-3.1-8b-instant",
        temperature=0.7,
        api_key=os.getenv("GROQ_API_KEY")
    )
answer, sources = answer_research_question(
    "What what did early scientists contribute to geneticss",
    collection, 
    embeddings, 
    llm
)

print("AI Answer:", answer)
print("\nBased on sources:")
for source in sources:
    print(f"- {source['title']}")

AI Answer: According to the research findings, the early scientists of the Classical era made significant contributions to genetic theories. Specifically, the following scientists are mentioned:

1. **Pythagoras**: Although not much information is provided about his contributions to genetics, it's worth noting that Pythagoras was a Greek philosopher and mathematician who lived in the 6th century BCE. His work on the concept of "likes attract likes" might have laid some groundwork for understanding the idea of genetic similarity or heredity.

2. **Hippocrates**: As the founder of Western medicine, Hippocrates (460-370 BCE) is credited with making significant contributions to the field of genetics. His work on the concept of "like begets like" and the idea that diseases can be inherited through the mother's side suggest that he had an early understanding of heredity.

3. **Aristotle** (384-322 BCE): Aristotle's work on biology, particularly his treatise "Generation of Animals," discusses

In [None]:

# Alternative version if you want to keep page-level granularity but better organized
def load_pdf_research_publications_with_pages(documents_path, output_json_path=None):
    """Load research publications from PDF files with page-level structure"""
    
    documents = []
    
    for root, dirs, files in os.walk(documents_path):
        for file in files:
            if file.lower().endswith(".pdf"):
                file_path = os.path.join(root, file)
                try:
                    loader = PyPDFLoader(file_path)
                    loaded_docs = loader.load()
                    
                    # Create document with pages array
                    document_data = {
                        "title": file.replace('.pdf', '').replace('.PDF', ''),
                        "filename": file,
                        "file_path": file_path,
                        "page_count": len(loaded_docs),
                        "load_timestamp": datetime.now().isoformat(),
                        "pages": []
                    }
                    
                    # Add each page as separate entry
                    for i, doc in enumerate(loaded_docs):
                        if doc.page_content.strip():
                            page_data = {
                                "page_number": i + 1,
                                "content": doc.page_content,
                                "metadata": getattr(doc, 'metadata', {})
                            }
                            document_data["pages"].append(page_data)
                    
                    documents.append(document_data)
                    print(f"✅ Successfully loaded: {file} ({len(loaded_docs)} pages)")
                    
                except Exception as e:
                    print(f"❌ Error loading {file}: {str(e)}")
                    documents.append({
                        "title": file.replace('.pdf', '').replace('.PDF', ''),
                        "filename": file,
                        "error": str(e),
                        "pages": []
                    })
    
    print(f"\n📂 Total PDF files processed: {len(documents)}")
    
    if output_json_path:
        try:
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(documents, f, indent=2, ensure_ascii=False)
            print(f"💾 JSON saved to: {output_json_path}")
        except Exception as e:
            print(f"❌ Error saving JSON: {str(e)}")
    
    return documents

# Usage examples:
if __name__ == "__main__":
    # Method 1: Combined content (one JSON object per PDF)
    documents = load_pdf_research_publications(
        documents_path="./your_pdf_folder",
        output_json_path="./pdf_documents_combined.json"
    )
    
    # Method 2: Page-level structure (better for RecursiveJsonSplitter)
    documents_with_pages = load_pdf_research_publications_with_pages(
        documents_path="./your_pdf_folder", 
        output_json_path="./pdf_documents_pages.json"
    )

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
embeddings=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)