In [15]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import chromadb

import torch
from tqdm import tqdm
import os

from dotenv  import load_dotenv

load_dotenv()




True

In [16]:
# Initialize ChromaDB
client = chromadb.PersistentClient(path="../research_db")
collection = client.get_or_create_collection(
    name="studybuddy",
    metadata={"hnsw:space": "cosine"}
)

In [17]:
def load_study_files(documents_path):
    """Loading all the available pdf files in the data directory"""
    
    # List to store all documents
    documents = []
    
    # Load each .txt file in the documents folder
    for file in os.listdir(documents_path):
        if file.endswith(".pdf"):
            file_path = os.path.join(documents_path, file)
            try:
                loader = PyPDFLoader(file_path)
                loaded_docs = loader.load()
                documents.extend(loaded_docs)
                print(f"Successfully loaded: {file}")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    
    print(f"\nTotal documents loaded: {len(documents)}")
    
    # Extract content as strings and return
    materials = []
    for doc in documents:
        materials.append(doc.page_content)
    
    return materials


materials = load_study_files('../data/')


Successfully loaded: CSC417pl10ch1.pdf
Successfully loaded: CSC417pl10ch2.pdf
Successfully loaded: CSC417pl10ch3.pdf
Successfully loaded: CSC417pl10ch4.pdf

Total documents loaded: 200


In [18]:
def chunk_study_files(paper_content):
    """Break each material into searchable chunks"""
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,          # ~200 words per chunk
        chunk_overlap=200,        # Overlap to preserve context
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    chunks = text_splitter.split_text(paper_content)
    
    # Add metadata to each chunk
    chunk_data = []
    for i, chunk in enumerate(chunks):
        chunk_data.append({
            "content": chunk,
            "chunk_id": f"{i}",
        })
    
    return chunk_data


In [19]:
def embed_documents(documents: list[str]) -> list[list[float]]:
    """
    Embed documents using a model.
    """
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )

    embeddings = model.embed_documents(documents)
    return embeddings

In [20]:
def insert_materials(collection: chromadb.Collection, study_materials: list[str]):
    """
    Insert documents into a ChromaDB collection.
    """
    next_id = collection.count()

    for i, material in enumerate(tqdm(study_materials, desc="Inserting materials")):
        chunked_material = chunk_study_files(material)
        embeddings = embed_documents([chunk["content"] for chunk in chunked_material])
        ids = list(range(next_id, next_id + len(chunked_material)))
        ids = [f"document_{id}" for id in ids]
        collection.add(
            embeddings=embeddings,
            ids=ids,
            documents=[chunk["content"] for chunk in chunked_material],
            metadatas=[{"chunk_id": chunk["chunk_id"]} for chunk in chunked_material]
        )
        next_id += len(chunked_material)


insert_materials(collection, materials)


Inserting materials: 100%|██████████| 200/200 [15:41<00:00,  4.71s/it]


In [26]:
def search_material_db(query, collection, embeddings, top_k=5):
    """Find the most relevant research chunks for a query"""
    
    # Convert question to vector
    query_vector = embeddings.embed_query(query)
    
    # Search for similar content
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )
    # Format results
    relevant_chunks = []
    for i, doc in enumerate(results["documents"][0]):
        relevant_chunks.append({
            "content": doc,
            "similarity": 1 - results["distances"][0][i]  # Convert distance to similarity
        })
    
    return relevant_chunks

In [29]:
def buddyassistant(topic, collection, embeddings, llm):
    """Generate an answer based on retrieved research"""
    
    # Get relevant research chunks
    relevant_chunks = search_material_db(topic, collection, embeddings, top_k=3)
    
    # Build context from research
    context = "\n\n".join([
        f"{chunk['content']}" 
        for chunk in relevant_chunks
    ])
    
    # Prompt for question generation
    question_template = PromptTemplate(
        input_variables=["topic"],
        template="""
        You are an AI professional assistant helping students evaluate their knowledge gaps.
        
        Your task is as follows:
        Generate 5 closed questions and 5 open questions on the topic {topic}

        Ensure your response follows these rules:
        - Each question should be approximately 70 words.
        - Each question should target the underlying nuances of the topic.

        Your goal is to provide sets of questions that serve as an evaluation benchmark.

        Style guide:
        - Return only the questions without any starting word.
        - Use <endofquestion> as delimiter to separate questions.
        """
    )
    
    # Prompt for answer evaluation
    evaluating_template = PromptTemplate(
        input_variables=["question", "answer", "context"], 
        template="""
        Based on the following research findings, assess how accurate the answer is to the question.

        Research Context:
        {context}

        Question: {question}

        Answer: {answer}

        Your response format:
        - Return an accuracy score in 2 decimal places followed by knowledge gaps.
        - Use bullet points to:
          - Highlight brilliant points (if any)
          - List what should have been included
          - Point out irrelevant or incorrect parts
          - Mention common pitfalls to avoid

        Use clear language over jargon. Response should not exceed 150 words.
        """
    )
    
    # Build chains
    question_chain = question_template | llm
    evaluating_chain = evaluating_template | llm

    # Generate questions
    response = question_chain.invoke({'topic': topic})
    questions = response.content.strip().split('<endofquestion>')
    questions = [q.strip() for q in questions if q.strip()]
    
    final_response = []
    
    for i, question in enumerate(questions, start=1):
        print(f"\nQuestion {i}:\n{question}\n")
        answer = input("Provide your answer for the question above:\n")
        evaluation = evaluating_chain.invoke({
            'question': question,
            'answer': answer,
            'context': context
        })
        final_response.append({
            'question': question,
            'answer': answer,
            'evaluation': evaluation.content.strip()
        })
        
    return final_response, relevant_chunks

In [30]:
llm = ChatGroq(model="llama3-8b-8192")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")




In [31]:
answer, sources = buddyassistant(
    "Programming language design",
    collection, 
    embeddings, 
    llm
)


Question 1:
Here are the 5 closed and 5 open questions on the topic of Programming language design:

What is the primary goal of the type inference system in a statically-typed programming language such as Rust, and how does it impact code readability?


Question 2:
How does the concept of operator overloading in languages like C++ and Python affect the language's syntax and semantics?


Question 3:
What are the key differences between the type systems of Functional Programming languages like Haskell and Imperative Programming languages like C++, and how do these differences impact programming paradigms?


Question 4:
What is the role of abstract syntax trees (ASTs) in programming language design, and how do they enable features like code analysis and optimization?


Question 5:
In what ways do programming languages like Rust and Swift use borrow checker mechanisms to ensure memory safety and prevent common programming errors?


Question 6:
What are some of the key challenges in desig

In [35]:
sources

[{'content': 'Copyright © 2012 Addison-Wesley. All rights reserved. 1-66\nSummary\n• Development, development environment, \nand evaluation of a number of important \nprogramming languages\n• Perspective into current issues in language \ndesign',
  'similarity': 0.7726072072982788},
 {'content': 'Copyright © 2012 Addison-Wesley. All rights reserved. 1-2\nChapter 1 Topics\n• Reasons for Studying Concepts of \nProgramming Languages\n• Programming Domains\n• Language Evaluation Criteria\n• Influences on Language Design\n• Language Categories\n• Language Design Trade-Offs\n• Implementation Methods\n• Programming Environments',
  'similarity': 0.7026222348213196},
 {'content': 'Copyright © 2012 Addison-Wesley. All rights reserved. 1-2\nChapter 1 Topics\n• Reasons for Studying Concepts of \nProgramming Languages\n• Programming Domains\n• Language Evaluation Criteria\n• Influences on Language Design\n• Language Categories\n• Language Design Trade-Offs\n• Implementation Methods\n• Programming 

In [45]:
for ind, each in enumerate(answer):
    print(f"Question {ind + 1}: {each['question']}", end = '\n\n')
    print(f"Your answer: {each['answer']}")
    print(f"AI's Evaluation: {each['evaluation']}")
    print("="* 150, '\n')

Question 1: Here are the 5 closed and 5 open questions on the topic of Programming language design:

What is the primary goal of the type inference system in a statically-typed programming language such as Rust, and how does it impact code readability?

Your answer: I have no idea
AI's Evaluation: Accuracy score: 0.00

Knowledge gaps:

* The answer "I have no idea" is a clear indication that the respondent lacks knowledge about type inference systems in statically-typed programming languages like Rust.
* The respondent has not provided any attempt to answer the question, which suggests a lack of understanding of the topic.

Brilliant points: None

What should have been included:

* A brief explanation of type inference systems and their role in statically-typed programming languages
* An understanding of Rust's type inference system and how it works
* A discussion of the impact of type inference on code readability

Irrelevant or incorrect parts: None

Common pitfalls to avoid:

* Avoi