In [8]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from docx import Document


# Input locations, resolved against the directory the notebook kernel was
# started in (os.getcwd()), so the notebook expects a "data" folder there.
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "data")
# Word document that is ingested into the vector store below.
DOC_FILE = os.path.join(BASE_DIR, "data", "Syllabus.docx")


# extract text

def extract_text_from_docx(file_path):
    """Return the paragraph text of a Word document as a single string.

    Every entry of ``Document(file_path).paragraphs`` contributes one line;
    leading and trailing whitespace is stripped from the combined result.

    Args:
        file_path: Location of the ``.docx`` file, passed straight to
            ``Document``.

    Returns:
        The newline-separated paragraph text, or ``""`` when the paragraphs
        hold nothing but whitespace.
    """
    # NOTE(review): only `paragraphs` is read here -- confirm whether content
    # held elsewhere in the document (e.g. tables) also needs to be indexed.
    paragraph_texts = [paragraph.text for paragraph in Document(file_path).paragraphs]
    return "\n".join(paragraph_texts).strip()


text = extract_text_from_docx(DOC_FILE)

# Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
# kernel down and throws away all notebook state, whereas an exception only
# stops this cell and leaves the message in the traceback.
if not text:
    raise ValueError(
        "No text extracted! Check if the Word document contains readable text."
    )

# Split text into chunks for RAG

# Consecutive chunks share up to `chunk_overlap` units of text so a passage
# that straddles a chunk boundary can still be retrieved as a whole.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
texts = text_splitter.split_text(text)

# ChromaDB

client = chromadb.Client()
collection_name = "syllabus_collection"

# get_or_create_collection replaces a bare `except:` fallback, which would
# have hidden any failure of get_collection, not just "collection not found".
collection = client.get_or_create_collection(name=collection_name)

# One id per chunk, derived from the document path and the chunk position.
chunk_ids = [f"{DOC_FILE}_chunk_{i}" for i in range(len(texts))]

# Send all chunks in a single call. upsert (rather than add) lets this cell be
# re-run: chunks already stored under the same ids are refreshed instead of
# colliding. Skipped when there is nothing to store, since the batch would be
# empty.
if texts:
    collection.upsert(
        documents=texts,
        metadatas=[{"source": chunk_id} for chunk_id in chunk_ids],
        ids=chunk_ids
    )

print(f"Added {len(texts)} text chunks from {DOC_FILE} to the Chroma collection.")


Added 2 text chunks from c:\Users\Vedaditya\Desktop\CollegeGPT\RAG_P1\data\Syllabus.docx to the Chroma collection.


In [None]:
import os
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from docx import Document

# Resolve the syllabus relative to the directory the kernel was started in.
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "data")
DOC_FILE = os.path.join(DATA_PATH, "Syllabus.docx")

# Read the document's paragraphs, one line per paragraph. join() builds the
# string in one pass instead of re-copying it on every `+=`.
doc = Document(DOC_FILE)
full_text = "\n".join(para.text for para in doc.paragraphs)
text = full_text.strip()

# Stop early on an empty document: without text there are no chunks to index,
# so the retrieval step further down would have nothing to ground an answer on.
if not text:
    raise ValueError(
        "No text extracted! Check if the Word document contains readable text."
    )

print(f"Extracted {len(text)} characters from Syllabus.docx")

# Consecutive chunks share up to `chunk_overlap` units of text so a passage
# that straddles a chunk boundary can still be retrieved as a whole.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
texts = text_splitter.split_text(text)

# ChromaDB setup
client = chromadb.Client()
collection_name = "syllabus_collection"

# Start from an empty collection on every run. Making sure the collection
# exists before deleting it means the delete cannot fail on a first run, so no
# blanket `except: pass` is needed and genuine errors are no longer swallowed.
client.get_or_create_collection(name=collection_name)
client.delete_collection(name=collection_name)

collection = client.create_collection(name=collection_name)

# Insert all chunks with a single call instead of one call per chunk.
# Skipped when there is nothing to store, since the batch would be empty.
if texts:
    collection.add(
        documents=texts,
        metadatas=[{"source": f"syllabus_chunk_{i}"} for i in range(len(texts))],
        ids=[f"chunk_{i}" for i in range(len(texts))]
    )

print(f"Added {len(texts)} chunks to ChromaDB")

# Ask the user for a question
user_query = input("\nWhat do you want to know about the syllabus? ")

# Retrieve from ChromaDB
# Do not request more neighbours than the collection holds: a short syllabus
# can produce fewer than 3 chunks.
n_results = max(1, min(3, collection.count()))
results = collection.query(
    query_texts=[user_query],
    n_results=n_results
)

print("Retrieved relevant chunks from syllabus")

# results['documents'] holds one list of chunks per query text. Join the chunks
# of our single query into plain text; interpolating the list itself would put
# its Python repr (brackets, quotes, escaped "\n") into the prompt.
context = "\n\n".join(results['documents'][0])

# Ollama
prompt = f"""You are a helpful assistant. Answer questions about the Computer Science Engineering Department syllabus.
Answer only based on the syllabus content below. If you don't know, say "I don't know".

Syllabus content:
{context}

Question: {user_query}

Answer:"""

# Ollama API
# The timeout keeps the cell from hanging forever when the local Ollama server
# is unreachable or stuck; it is generous because generation can be slow.
response = requests.post('http://localhost:11434/api/generate',
                        json={
                            'model': 'llama3.2',
                            'prompt': prompt,
                            'stream': False
                        },
                        timeout=300)
# Surface HTTP errors (e.g. the model is not available) as such, rather than as
# a KeyError on the missing 'response' field below.
response.raise_for_status()

print("Answer:")
print("---------------------")
print(response.json()['response'])

Extracted 1345 characters from Syllabus.docx
Added 2 chunks to ChromaDB
Retrieved relevant chunks from syllabus
Answer:
---------------------
Here are the subjects mentioned in the syllabus:

For 5th Semester (Third Year):

1. Theory of Computation
2. Operating Systems
3. Artificial Intelligence and Machine Learning
4. Computational Intelligence
5. Data Mining and Data Warehousing
6. Microprocessors and Microcontrollers
7. Distributed Systems
8. Cryptographic Foundation and Network Security
9. Object-Oriented Analysis and Design
10. Web Technology

For 6th Semester (Third Year):

1. Compiler Design
2. Software Engineering
3. Evolutionary Computing
4. Pattern Recognition
5. Embedded Systems
6. Cloud and Edge Computing
