In [1]:
import chromadb
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Initialize ChromaDB (Persistent Storage)
client = chromadb.PersistentClient(path="chroma_db/")

# Create collections for each course inside a school
schools = {"School_of_technology":["Python","Java","AI","ML"],"School_of_business":["Marketing","Accounts"],"School_of_law":["History_law"],"School_of_design":["Design_history"]}
for school, courses in schools.items():
    for course in courses:
        collection_name = f"{school}_{course}"
        client.get_or_create_collection(name=collection_name)


print("School and Course collections created!")

print(len(client.list_collections()))
print(client.list_collections())



# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "/n"  # Extract text from each page
    return text

# def split_text_into_chunks(text, chunk_size=500, overlap=50):
#     """
#     Splits long text (books) into fixed-size chunks with overlap.

#     :param text: Extracted text from PDF
#     :param chunk_size: Number of characters per chunk
#     :param overlap: Overlapping characters between consecutive chunks (to maintain context)
#     :return: List of text chunks
#     """
#     words = text.split()
#     chunks = []
    
#     for i in range(0, len(words), chunk_size - overlap):
#         chunk = " ".join(words[i:i + chunk_size])
#         chunks.append(chunk)

#     return chunks


def split_text_into_chunks(text, max_tokens=100):
    sentences = sent_tokenize(text)  # Split into sentences
    chunks = []
    current_chunk = ""
    print(len(text))

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_tokens:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence  # Start new chunk

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


def store_pdf_embeddings(school, course, pdf_paths):
    """
    Convert book PDFs to embeddings & store in ChromaDB.

    :param school: School name
    :param course: Course name
    :param pdf_paths: List of PDF file paths
    """
    collection_name = f"{school}_{course}"
    collection = client.get_or_create_collection(name=collection_name)

    for pdf_path in pdf_paths:
        print(f"Processing {pdf_path}...")

        text = extract_text_from_pdf(pdf_path)
        chunks = split_text_into_chunks(text, max_tokens=100)
        embeddings = embedding_model.encode(chunks).tolist()  # Generate vector embeddings

        for i, chunk in enumerate(chunks):
            collection.add(
                ids=[f"{collection_name}_{pdf_path}_{i}"],  # Unique ID
                documents=[chunk],  # Store text chunk
                embeddings=[embeddings[i]]  # Store vector embeddings
            )

        print(f"{pdf_path} embeddings added to {collection_name} in ChromaDB.")

# Example Usage: Storing book PDFs
store_pdf_embeddings(
    "School_of_technology",
    "Python",
    ["R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_1.pdf","R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_2.pdf"]
)

# store_pdf_embeddings(
#     "School_of_business",
#     "Marketing",
#     ["marketing_strategy.pdf", "digital_marketing_book.pdf"]
# )



School and Course collections created!
8
['School_of_technology_ML', 'School_of_design_Design_history', 'School_of_technology_Python', 'School_of_technology_Java', 'School_of_law_History_law', 'School_of_business_Marketing', 'School_of_business_Accounts', 'School_of_technology_AI']
Processing R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_1.pdf...
764468
R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_1.pdf embeddings added to School_of_technology_Python in ChromaDB.
Processing R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_2.pdf...
878791
R:/CourseChabot/Backend/uploads/School_of_technology/Python/Learning_Python_part_2.pdf embeddings added to School_of_technology_Python in ChromaDB.


In [4]:
import chromadb
from sentence_transformers import SentenceTransformer


# Initialize ChromaDB Client
client = chromadb.PersistentClient(path="chroma_db/")

# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def query_course_content(school, course, user_query, top_k=3):
    """
    Queries the ChromaDB vector database to retrieve relevant course material.

    :param school: School name (e.g., "School_of_technology")
    :param course: Course name (e.g., "Python")
    :param user_query: Question asked by the user
    :param top_k: Number of top results to retrieve
    :return: List of retrieved document chunks
    """
    collection_name = f"{school}_{course}"
    collection = client.get_or_create_collection(name=collection_name)

    # Convert user query into vector embeddings
    query_embedding = embedding_model.encode(user_query).tolist()

    # Retrieve the most relevant course content
    results = collection.query(
        query_embeddings=[query_embedding], 
        n_results=top_k  # Retrieve top 3 most relevant results
    )

    if "documents" not in results or not results["documents"][0]:
        return ["No relevant content found in the course materials."]

    return results["documents"][0]

# Example Usage
# user_question = "What are the basics of Python programming?"
# retrieved_text = query_course_content("School_of_technology", "Python", user_question)

# print("Retrieved Content:")
# for i, chunk in enumerate(retrieved_text):
#     print(f"{i+1}. {chunk}\n")


In [5]:
import ollama

def generate_answer_with_ollama(school, course, user_query):
    """
    Uses ChromaDB and Ollama to generate an AI-based response.

    :param school: School name
    :param course: Course name
    :param user_query: User's question
    :return: AI-generated response
    """
    retrieved_content = query_course_content(school, course, user_query, top_k=3)
    print(retrieved_content)

    # Prepare the context for the AI model
    context = "\n\n".join(retrieved_content)
    
    prompt = f"""
    You are an AI assistant for {school}'s {course} course.

    User Question: {user_query}

    Course Reference Material:
    {context}

    Provide a detailed and structured response in the following format:

    1. **Introduction**  
       - Briefly explain the concept related to the user's question.

    2. **Steps to Perform**  
       - List step-by-step instructions, including subpoints if necessary.

    3. **Code Example (if applicable)**  
       - Provide a well-formatted and properly explained code snippet.

    4. **Key Points to Remember**  
       - Summarize important takeaways in bullet points.
    """


    
    # Generate AI response using Ollama
    response = ollama.chat(model="llama2", messages=[{"role": "user", "content": prompt}])

    return response["message"]["content"]

# Example Usage
user_question = "Explain about dataypes in python."
answer = generate_answer_with_ollama("School_of_technology", "Python", user_question)

print("AI Response:\n", answer)


['Somewhat more formally, in Python, data takes the form of objects—either built-in\nobjects that Python provides, or objects we create using Python or external language\ntools such as C extension libraries.', 'In Python, data takes\nthe form of objects\n—either built-in objects that Python provides, or objects we create\nusing Python tools and other languages such as C. In fact, objects are the basis of every\nPython program you will ever write.', 'Byte code is a\nPython-specific representation.']
AI Response:
 
1. Introduction:
Data types in Python are the classifications of data that can be stored or manipulated within a program. Data types determine the type of data that can be stored in a variable, and they play a crucial role in ensuring that data is used correctly and efficiently in a program. In Python, data types are objects, and these objects can be built-in or created using external tools such as C extension libraries.
2. Steps to Perform:

a. Understand the concept of data 