# Load .env file

In [9]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message: assigning None into os.environ (which is
# what happens when the variable is missing) raises an opaque
# "TypeError: str expected, not NoneType".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it in the environment."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Remaining settings are read as-is; each is None when the variable is absent.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")


# Preprocessing

In [10]:
import os
import glob
import PyPDF2
from pptx import Presentation
from tqdm import tqdm
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
import openai

from dotenv import load_dotenv

load_dotenv()

# --- Configuration ---
# API clients are configured from environment variables.
openai.api_key = os.getenv("OPENAI_API_KEY")
pc = Pinecone(
        api_key=os.getenv("PINECONE_API_KEY")
    )

# Target the index named by PINECONE_INDEX_NAME -- the same variable the
# retrieval code connects with -- so ingestion and querying cannot drift onto
# different indexes. Falls back to the previously hard-coded name when unset.
index_name = os.getenv("PINECONE_INDEX_NAME") or 'syllabus'

index = pc.Index(index_name)

# Slides are read from BASE_DIR/<module>/; per-file text is written under
# TEXT_DIR and per-module merged text under MODULE_TEXT_DIR.
BASE_DIR = "ctse_lecture_slides"
TEXT_DIR = "ctse_extracted_txt_files"
MODULE_TEXT_DIR = "ctse_merged_module_texts"

os.makedirs(TEXT_DIR, exist_ok=True)
os.makedirs(MODULE_TEXT_DIR, exist_ok=True)


# --- Extractor Class ---
class SlideExtractor:
    """Extract text from lecture slides (PDF / PPTX) grouped in per-module folders.

    Every immediate sub-directory of ``base_dir`` is treated as one module.
    Each ``*.pdf`` / ``*.pptx`` file directly inside a module folder is
    converted to text; the text is written to ``output_dir`` (mirroring the
    module folder) and the texts of all files of a module are concatenated
    into ``<merged_dir>/<module>.txt``.
    """

    def __init__(self, base_dir: str, output_dir: str, merged_dir: str):
        """Store the input directory and the two output directories."""
        self.base_dir = base_dir
        self.output_dir = output_dir
        self.merged_dir = merged_dir

    def extract_all(self):
        """Extract every slide deck and write per-file and per-module text files.

        Files that yield no text (including files whose extraction failed) are
        skipped. Modules with no extracted text produce no merged file.
        """
        for module_dir in sorted(os.listdir(self.base_dir)):
            module_path = os.path.join(self.base_dir, module_dir)
            if not os.path.isdir(module_path):
                continue

            module_texts = []

            # Process files in a stable, case-insensitive name order so the
            # merged file is reproducible between runs.
            lecture_files = sorted(
                glob.glob(os.path.join(module_path, "*.pdf")) + glob.glob(os.path.join(module_path, "*.pptx")),
                key=lambda x: os.path.basename(x).lower()
            )

            for file_path in lecture_files:
                # Compare the extension case-insensitively: on case-insensitive
                # file systems glob("*.pdf") also returns "X.PDF", which must
                # not be handed to the PPTX parser.
                if file_path.lower().endswith(".pdf"):
                    text = self._extract_pdf(file_path)
                else:
                    text = self._extract_pptx(file_path)

                if text:
                    self._save_individual_text(file_path, text)
                    module_texts.append(f"[{os.path.basename(file_path)}]\n{text}\n")

            # Save merged module-level file
            if module_texts:
                os.makedirs(self.merged_dir, exist_ok=True)
                merged_path = os.path.join(self.merged_dir, f"{module_dir}.txt")
                with open(merged_path, "w", encoding="utf-8") as f:
                    f.write("\n".join(module_texts))

    def _extract_pdf(self, path: str) -> str:
        """Return the text of all pages joined by newlines, or "" on any error.

        Pages for which no text can be extracted contribute an empty string.
        Errors are printed, not raised, so one broken file does not stop the run.
        """
        try:
            with open(path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"[PDF ERROR] {path} - {e}")
            return ""

    def _extract_pptx(self, path: str) -> str:
        """Return the text of every shape exposing ``.text``, joined by newlines.

        Shapes are visited slide by slide. Errors are printed and "" is returned.
        """
        try:
            prs = Presentation(path)
            return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
        except Exception as e:
            print(f"[PPTX ERROR] {path} - {e}")
            return ""

    def _save_individual_text(self, original_path: str, content: str):
        """Write ``content`` to ``output_dir`` at the slide's relative path with a .txt extension.

        Only the final extension is swapped; a plain string replace of ".pdf" /
        ".pptx" would also rewrite those substrings inside directory names or
        in the middle of a file name.
        """
        rel_path = os.path.relpath(original_path, self.base_dir)
        stem, _ = os.path.splitext(rel_path)
        txt_path = os.path.join(self.output_dir, stem + ".txt")
        os.makedirs(os.path.dirname(txt_path), exist_ok=True)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(content)


# --- Chunking & Embedding ---
class Vectorizer:
    """Chunk merged module text files, embed the chunks and upsert them into Pinecone.

    Each ``*.txt`` file in ``merged_dir`` is split into overlapping chunks.
    Chunks are stored in the module-level Pinecone ``index`` under a namespace
    equal to the file's base name, with ids of the form ``<module>-<chunk_index>``
    and the chunk text kept in the ``text`` metadata field.
    """

    def __init__(self, merged_dir: str, chunk_size: int = 1000, chunk_overlap: int = 100, batch_size: int = 50):
        """
        Args:
            merged_dir: Directory containing one merged ``.txt`` file per module.
            chunk_size: Maximum chunk length passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
            batch_size: Number of chunks embedded and upserted per request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.merged_dir = merged_dir
        self.batch_size = batch_size
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.embedder = OpenAIEmbeddings(model="text-embedding-3-small")

    def process(self):
        """Embed and upsert every chunk of every merged module file.

        Chunks are sent in batches to avoid one embedding request and one
        upsert per chunk. If a batch fails, its chunks are retried one by one
        so a single bad chunk does not discard its neighbours; chunks that
        still fail are reported and skipped.
        """
        txt_files = sorted(glob.glob(os.path.join(self.merged_dir, "*.txt")))
        for path in tqdm(txt_files, desc="Vectorizing merged module files"):
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()

            module_name = os.path.splitext(os.path.basename(path))[0]
            chunks = self.text_splitter.split_text(content)

            for start in range(0, len(chunks), self.batch_size):
                batch = chunks[start:start + self.batch_size]
                try:
                    self._upsert_chunks(module_name, start, batch)
                    continue
                except Exception as e:
                    if len(batch) == 1:
                        # Nothing to isolate: report the failure directly.
                        print(f"[ERROR] Failed to process chunk {start} of {module_name}: {e}")
                        continue

                # The batch failed as a whole: retry chunk by chunk to find
                # (and report) only the chunks that actually cannot be stored.
                for i, text in enumerate(batch, start=start):
                    try:
                        self._upsert_chunks(module_name, i, [text])
                    except Exception as e:
                        print(f"[ERROR] Failed to process chunk {i} of {module_name}: {e}")

    def _upsert_chunks(self, module_name: str, first_index: int, texts):
        """Embed ``texts`` in one request and upsert them in one request.

        ``first_index`` is the position of ``texts[0]`` within the module's
        chunk list; it keeps vector ids and ``chunk_index`` metadata identical
        to what a chunk-by-chunk upload would produce.
        """
        vectors = self.embedder.embed_documents(texts)
        records = [
            (
                f"{module_name}-{i}",
                vector,
                {
                    "module": module_name,
                    "source": f"{module_name}.txt",
                    "chunk_index": i,
                    "text": text,
                },
            )
            for i, (text, vector) in enumerate(zip(texts, vectors), start=first_index)
        ]
        # `index` is the module-level Pinecone index handle, looked up at call time.
        index.upsert(vectors=records, namespace=module_name)


# Extract the text

In [11]:
# Convert every slide deck under BASE_DIR to text: per-file output goes to
# TEXT_DIR, per-module merged output to MODULE_TEXT_DIR.
print("🔍 Extracting text from slides...")
extractor = SlideExtractor(
    base_dir=BASE_DIR,
    output_dir=TEXT_DIR,
    merged_dir=MODULE_TEXT_DIR,
)
extractor.extract_all()

🔍 Extracting text from slides...


# Vectorize the chunks

In [12]:
# Chunk the merged module texts, embed them and push the vectors to Pinecone.
print("🧠 Vectorizing and storing into Pinecone...")
vectorizer = Vectorizer(merged_dir=MODULE_TEXT_DIR)
vectorizer.process()

print("✅ All done!")

🧠 Vectorizing and storing into Pinecone...
✅ All done!


# Retrieve the context

In [13]:
from pinecone import Pinecone
from langchain.vectorstores import Pinecone as Pcone
from langchain.embeddings import OpenAIEmbeddings

# Initialize the Pinecone client used for querying.
# NOTE: this rebinds the module-level `pc` created in the preprocessing cell.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the index named by PINECONE_INDEX_NAME.
# NOTE: this rebinds the module-level `index` that Vectorizer.process() also
# uses, so re-running the vectorizing cell after this one writes to this index.
index = pc.Index(PINECONE_INDEX_NAME)

# Query embeddings use the same model name as the ingestion step.
# NOTE(review): `OpenAIEmbeddings` here is the class imported from
# `langchain.embeddings` in this cell, shadowing the earlier `langchain_openai`
# import -- confirm this is intended.
embedder = OpenAIEmbeddings(model="text-embedding-3-small")

def retrieve_similar_questions(query, top_k=3, namespace='ctse'):
    """Return the stored text of the chunks most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and searched in
    the module-level Pinecone ``index``.

    Args:
        query: Natural-language question or search string.
        top_k: Maximum number of chunks to return.
        namespace: Pinecone namespace to search. Defaults to ``'ctse'``, the
            value that used to be hard-coded.

    Returns:
        A list of at most ``top_k`` strings taken from each match's ``text``
        metadata field, in the order returned by the index. Matches that
        carry no ``text`` metadata are skipped.
    """
    vector = embedder.embed_query(query)
    results = index.query(
        vector=vector,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,
    )

    texts = []
    for match in results["matches"]:
        try:
            texts.append(match["metadata"]["text"])
        except (KeyError, TypeError, AttributeError):
            # Vector stored without metadata or without a "text" field:
            # nothing usable as context, so skip it rather than abort.
            continue

    return texts[:top_k]

# Smoke test: print the chunks retrieved for a sample question.
print(retrieve_similar_questions("how to train a neural net"))


['SE4010 | Current Trends in SE| Introduction to Artificial Neural Networks| Jeewaka PereraBuilding Complete Neural Networks\n•Stacking multiple Nerons to for a layer\n•Organizing multiple Layers to form the network\nSE4010 | Current Trends in SE| Introduction to Artificial Neural Networks| Jeewaka PereraTraining a neural net\n1.Randomly initialize weights\n2.Implement forward propagation to get the output at each \nneuron \n3.Compute the error at the output layer Etotal\n4.Implement backpropagation to compute partial \nderivatives   𝜕𝐸𝑡𝑜𝑡𝑎𝑙\n𝜕𝑤𝑙\n𝑗𝑘\n5.Use Gradient descent or any other optimization technique \nto update the weights to minimize Etotal\n6.    Repeat this process over multiple iterations (epochs)  \nuntil the error converges \nSE4010 | Current Trends in SE| Introduction to Artificial Neural Networks| Jeewaka Perera', 'SE4010 | Current Trends in SE| Introduction to Artificial Neural Networks| Jeewaka PereraWhat is a Neural Network\n•A Collection of Perceptron\n•A Layer ca

# Define the LLM and prompt

In [14]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Groq-hosted chat model; temperature 0 to keep answers as repeatable as possible.
llm = ChatGroq(model="mistral-saba-24b", temperature=0)

# System message: fixed instruction, no template variables.
system = """
    You are an expert in answering user question with provide context.
"""

# Human message: filled with the `context` and `question` template variables.
human = """
    Answer the question according to context.
    Context: {context}
    Question: {question}
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)
parser = StrOutputParser()

# Pipeline: render prompt -> call the model -> return the reply as a plain string.
chain = prompt | llm | parser



# Chatbot

In [17]:

question = "explain transformers?"

# Retrieve the most relevant slide chunks for the question (a list of strings).
context = retrieve_similar_questions(question)

# Join the chunks into one plain-text block before templating: passing the
# list itself would render it into the prompt as its Python repr (brackets,
# quotes and escaped "\n" sequences).
answer = chain.invoke({"context": "\n\n".join(context), "question": question})
print(f"\nAnswer: {answer}\n")



Answer: Transformers are a type of neural network architecture introduced in 2017, primarily for natural language processing (NLP) tasks. They are based on a self-attention mechanism, which allows the model to focus on different parts of the input sequence when processing each token. This mechanism enables the model to capture long-range dependencies and relationships within the data more effectively than traditional recurrent neural networks (RNNs).

Key features of Transformers include:

1. **Self-Attention Mechanism**: This allows the model to weigh the importance of different parts of the input sequence when making predictions, enabling it to handle long-range dependencies.
2. **Parallel Processing**: Unlike RNNs, which process sequences step-by-step, Transformers can process the entire input sequence in parallel. This makes them much faster and more efficient, especially for long sequences.
3. **Versatility**: Transformers have been successfully applied to a variety of tasks, inc