In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import Chroma
from dotenv import load_dotenv

In [2]:
# Load environment variables from .env and read the OpenAI API key.
# `os` is already imported in the notebook's import cell, so it is not
# re-imported here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message rather than handing `None` to the
# embedding / chat clients created in later cells.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [3]:
# Load every PDF located directly inside the data folder (non-recursive).
data_folder = '../data'
documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and the chunk order derived from it) reproducible across runs.
for file_name in sorted(os.listdir(data_folder)):
    # Compare the extension case-insensitively so e.g. "Lecture.PDF" is kept.
    if not file_name.lower().endswith('.pdf'):
        continue
    loader = PyPDFLoader(os.path.join(data_folder, file_name))
    documents.extend(loader.load())

# Without any loaded pages the later cells have nothing to split, embed or
# retrieve, so stop here with a clear message.
if not documents:
    raise FileNotFoundError(
        f"No PDF content could be loaded from {data_folder!r}."
    )

In [4]:
# Break the loaded documents into smaller, overlapping chunks.
# NOTE(review): CharacterTextSplitter splits on its separator only, so chunks
# may exceed chunk_size when the PDF text has few separators - confirm that
# the resulting chunk sizes are acceptable.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
}
text_splitter = CharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(documents)

In [5]:
# Build the embedding model, then index every chunk in a Chroma vector store.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)

In [6]:
# Define the system prompt.
# This text is used as the "system" message of the chat prompt built below.
# It scopes the assistant to the CTSE lecture notes, asks it to decline
# out-of-scope questions, and tells it not to make up information.
system_prompt = """
You are an intelligent and helpful assistant designed to answer questions based on CTSE (Current Trends in Software Engineering) lecture notes. 
Your primary goal is to provide accurate, concise, and contextually relevant answers to the user's questions. 
The lecture notes cover topics such as software engineering principles, AI/ML trends, software development methodologies, and related concepts. 
If a question is outside the scope of the lecture notes, politely inform the user that the information is not available. 
Always maintain a professional tone and avoid making up information. 
When answering, ensure your responses are clear and easy to understand, and provide examples or explanations when necessary. 
If the user asks for clarification, respond patiently and provide additional details. 
Your responses should be tailored to help students understand the material effectively.
"""

In [7]:
# Template for the "human" message of the chat prompt.
# {input} receives the user's question (the QA chain is invoked with an
# "input" key); {context} is the slot for the retrieved document text that
# the document chain combines into the prompt.
human_template = """Context: 
{context}

Question: {input}
"""

In [8]:
# Assemble the chat prompt: the system instructions first, then the human
# turn, which carries the {context} and {input} placeholders.
prompt_messages = [
    ("system", system_prompt),
    ("human", human_template),
]
chat_prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [9]:
# --- LLM and retrieval-based QA pipeline ---

# Chat model that generates the answers.
llm = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-4")

# Expose the vector store through the retriever interface.
retriever = vectorstore.as_retriever()

# Document chain: combines the retrieved documents using the chat prompt and
# the LLM.
document_chain = create_stuff_documents_chain(llm=llm, prompt=chat_prompt)

# Retrieval chain: runs the retriever, then hands its results to the
# document chain.
qa_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [None]:
# Chatbot interaction: keep answering questions until the user types "exit".
# Imported once here instead of on every failure inside the except handler.
import traceback

while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still match.
        query = input("Ask a question about CTSE lecture notes: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Stdin was closed or the prompt was interrupted: end the session
        # cleanly instead of surfacing a traceback.
        print()
        break

    if not query:
        # Nothing was typed; do not spend an LLM call on an empty question.
        continue
    if query.lower() == "exit":
        break

    try:
        response = qa_chain.invoke({"input": query})
        print(f"Answer: {response['answer']}")
    except Exception as e:
        # Report the failure but keep the chat session alive for the next
        # question.
        print(f"An error occurred: {e}")
        traceback.print_exc()