In [None]:
# Install basic packages 
%pip install requests sentence-transformers
%pip install PyPDF2
%pip install streamlit

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [14]:
import os
import requests
import glob
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import PyPDF2
import re

# Set  API key
os.environ["GOOGLE_API_KEY"] = "AIzaSyBzS_RgPK9r-ZAWFndoDkm6TunuIpRRSlA"

# Step 1: Load documents from the 'data' folder - UPDATED to handle PDFs
def load_documents(directory='./data'):
    documents = []
    
    # Process text files
    for file_path in glob.glob(f"{directory}/*.txt"):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append({
                "content": content,
                "source": file_path
            })
    
    # Process PDF files
    for file_path in glob.glob(f"{directory}/*.pdf"):
        try:
            content = extract_text_from_pdf(file_path)
            documents.append({
                "content": content,
                "source": file_path
            })
        except Exception as e:
            print(f"Error extracting text from {file_path}: {str(e)}")
    
    print(f"Loaded {len(documents)} documents")
    return documents

# New helper function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n\n"
    
    # Clean up the text (remove extra whitespace, etc.)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Step 2: Split documents into chunks (simple version)
def split_into_chunks(documents, chunk_size=800, overlap=150):
    chunks = []
    for doc in documents:
        content = doc["content"]
        source = doc["source"]
        
        # Simple sliding window approach
        for i in range(0, len(content), chunk_size - overlap):
            chunk_text = content[i:i + chunk_size]
            if len(chunk_text) < 100: 
                continue
            chunks.append({
                "content": chunk_text,
                "source": source
            })
    
    print(f"Split into {len(chunks)} chunks")
    return chunks

# Step 3: Create embeddings
def create_embeddings(chunks):
    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    print("Creating embeddings for chunks...")
    embeddings = []
    for chunk in chunks:
        embedding = model.encode(chunk["content"])
        embeddings.append(embedding)
    
    return model, embeddings

# Step 4: Simple retrieval function
def retrieve_relevant_chunks(query, model, chunks, embeddings, k=3):
    # Get query embedding
    query_embedding = model.encode(query)
    
    # Calculate similarity
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    
    # Get top k chunks
    top_indices = np.argsort(similarities)[-k:][::-1]
    
    return [chunks[i] for i in top_indices]

# Step 5: Function to call Google Gemini API
def ask_gemini(prompt):
    api_key = os.environ.get("GOOGLE_API_KEY", "")
    
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
    
    headers = {"Content-Type": "application/json"}
    data = {
        "contents": [{
            "parts":[{"text": prompt}]
        }]
    }
    
    try:
        response = requests.post(url, headers=headers, json=data)
        
        if response.status_code == 200:
            response_json = response.json()
            if "candidates" in response_json and len(response_json["candidates"]) > 0:
                return response_json["candidates"][0]["content"]["parts"][0]["text"]
            else:
                return "No valid response found in API response"
        else:
            return f"Error: {response.status_code} - {response.text}"
    except Exception as e:
        return f"Error calling API: {str(e)}"

# Step 6: Main QA function
def answer_question(question, model, chunks, embeddings):
    print(f"\nQuestion: {question}")
    print("-" * 50)
    
    # Get relevant chunks
    relevant_chunks = retrieve_relevant_chunks(question, model, chunks, embeddings)
    
    # Create prompt for Gemini
    context = "\n\n".join([chunk["content"] for chunk in relevant_chunks])
    prompt = f"""Answer the following question based ONLY on the information provided in the context below.
    If the answer is not found in the context, say "I don't have enough information to answer this question."

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:"""
    
    # Get answer from Gemini
    answer = ask_gemini(prompt)
    
    print(f"Answer: {answer}\n")
    print("Sources:")
    for i, chunk in enumerate(relevant_chunks):
        source_file = os.path.basename(chunk["source"])
        print(f"Source {i+1} from {source_file}:\n{chunk['content'][:150]}...\n")
    
    return answer

# Step 7: Improved interactive chat interface
def chat(model, chunks, embeddings):
    print("\n" + "=" * 60)
    print("  CTSE Lecture Notes Chatbot")
    print("  Type 'exit' to quit or 'new' for a new question")
    print("=" * 60)
    
    while True:
        question = input("\nAsk a question: ")
        
        if question.lower() in ['exit', 'quit', 'bye']:
            print("\nThank you for using the CTSE Lecture Notes Chatbot. Goodbye!")
            break
            
        answer = answer_question(question, model, chunks, embeddings)
        
        # Ask if user wants to continue
        print("\n" + "-" * 60)
        print("Type your next question, 'exit' to quit, or press Enter to continue")
        print("-" * 60)

# Main execution
if __name__ == "__main__":
    # Load and process documents
    documents = load_documents()
    chunks = split_into_chunks(documents, chunk_size=800)
    model, embeddings = create_embeddings(chunks)
    
    # Start interactive chat directly without sample question
    chat(model, chunks, embeddings)

Loaded 1 documents
Split into 18 chunks
Loading embedding model...
Creating embeddings for chunks...

  CTSE Lecture Notes Chatbot
  Type 'exit' to quit or 'new' for a new question

Question: What's the significance of bias in a neural network, and how does it relate to the threshold value?
--------------------------------------------------
Answer: Bias is used to shift the output value, and it is equal to the negative of the threshold value (Bias = -Threshold value).


Sources:
Source 1 from ML Lec 2 - Part 1.pdf:
 Introduction to Artificial Neural Networks| Jeewaka PereraStructure of an ANN SE4010 | Current Trends in SE| Introduction to Artificial Neural Networ...

Source 2 from ML Lec 2 - Part 1.pdf:
SE4010 | Current Trends in SE| Introduction to Artificial Neural Networks| Jeewaka PereraGain an understanding of the structure and background of ANN ...

Source 3 from ML Lec 2 - Part 1.pdf:
 y=ቊ0,𝑥𝑃⋅𝑤1+𝑥𝑏⋅𝑤2+𝑥𝑟⋅𝑤3<𝑡 1,𝑥𝑃⋅𝑤1+𝑥𝑏⋅𝑤2+𝑥𝑟⋅𝑤3≥𝑡 •Its easier to compute with vectors y=ቊ0,𝑥⋅𝑤<𝑡 

KeyboardInterrupt: Interrupted by user