<a href="https://colab.research.google.com/github/VOX304/SchoolChatbot/blob/main/RAG_aNh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%pip install langchain \
langchain_community \
langchain_core \
langchain_google_genai \
python-dotenv \
pypdf



In [11]:
pip install faiss-cpu



In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from google.colab import userdata

os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
embedding_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
pdf_files = ["/content/sample_data/CSE Module Handbook.pdf", "/content/sample_data/CSE2021_Info Session  Internship, Thesis and Graduation.pdf"]  # Adjust paths

In [13]:
documents = []
for pdf in pdf_files:
    pdf_loader = PyPDFLoader(pdf)
    documents.extend(pdf_loader.load())

text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Ensure embeddings are generated correctly
#embeddings = embedding_model.embed_documents([doc.page_content for doc in chunks])

# Pass embedded vectors to FAISS



In [14]:
vector_db = FAISS.from_documents(chunks, embedding_model)

In [15]:
print(f"✅ Processed {len(chunks)} text chunks into FAISS vector database.")


✅ Processed 360 text chunks into FAISS vector database.


In [16]:
query = "What is the requirement for graduation?"
retrieved_docs = vector_db.similarity_search(query, k = 3)


In [17]:
for i, doc in enumerate(retrieved_docs[:3]):  # Show top 3
    print(f"\n📄 Document {i+1}:\n{doc.page_content}")


📄 Document 1:
GRADUATION
1. General Information
2. Graduation Timeline

📄 Document 2:
Vietnamese-German University Computer Science Program
General Information
1. Prerequisites: 
- Pass all modules (180 ECTS)
- Complete 04 German classes or submit an A2 German Certificate
2. Expected timeline:
- VGU conducts two graduation assessments annually: in April and October 
- Only one Graduation Ceremony: November

📄 Document 3:
General Information
1. Prerequisites: 
- Evidence of the internship registration with a signed training contract (IC)
- Successful completion of all modules of the first 5 semesters (150 ECTS)
2. Grading Policy: Bachelor Thesis (weighting 80%) and Colloquium 
(min. 30 min. and max. 60 min., weighting 20%)
3. Regulation: Thesis final reports submitted late will fail. Bachelor’s 
thesis with colloquium only be repeated once.
Vietnamese-German University
7
Computer Science Program


In [18]:
from langchain_google_genai import ChatGoogleGenerativeAI

chat_model = ChatGoogleGenerativeAI(
    google_api_key=os.environ["GOOGLE_API_KEY"],
    model="gemini-2.0-flash-thinking-exp-01-21",
    temperature=0.7
)
print("✅ Chat model loaded successfully.")

✅ Chat model loaded successfully.


In [22]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)


def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vector_db.similarity_search(query, k=10)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""You are the school assistant: Using the contexts below, answer the query, in a friendly way. Dont make up answers. If you dont know the answer, just say that you dont know, dont try to make up an answer.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt



In [28]:
question = "What are the requirement for graduation"
context = augment_prompt(question)
history = []  # If using history, format as messages

prompt = HumanMessage(
    content=augment_prompt(
        "What are the requirement for graduation"
    )
)

res = chat_model.invoke(
    augment_prompt(
        question
    )
)
print(res.content)



ValueError: Invalid input type <class 'langchain_core.messages.human.HumanMessage'>. Must be a PromptValue, str, or list of BaseMessages.