In [None]:
!pip -q install langchain openai faiss-cpu sentence-transformers pypdf tiktoken

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (requires the google.colab module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NOTE(review): this import + mount repeats the previous cell (only the quote
# style differs); one of the two cells is presumably redundant.
from google.colab import drive
drive.mount("/content/drive")

# Folder that is searched recursively for PDFs; the chat-memory file is also
# stored inside it.
PAPER_DIR   = "/content/drive/MyDrive/ProfS/ProfS_Papers"
# Location the FAISS index is saved to.
INDEX_PATH  = "/content/drive/MyDrive/ProfS/ProfS_FAISS"

In [None]:
!pip install -Uq langchain-community pypdf

In [None]:
# 1 · load + chunk papers
import glob, os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Collect every PDF below PAPER_DIR (recursively). glob returns paths in
# arbitrary order, so sort them to keep page/chunk order stable between runs.
pdf_files = sorted(glob.glob(os.path.join(PAPER_DIR, "**/*.pdf"), recursive=True))
if not pdf_files:
    # Fail fast with a clear message: an empty corpus would otherwise only
    # surface later as an obscure error when the vector index is built.
    raise FileNotFoundError(
        f"No PDF files found under {PAPER_DIR!r}; "
        "is Google Drive mounted and the folder path correct?"
    )

# Load each PDF and accumulate the resulting documents (reported as pages below).
docs = []
for pdf in pdf_files:
    docs.extend(PyPDFLoader(pdf).load())

print(f"Loaded {len(pdf_files)} PDFs  →  {len(docs)} pages")

# Split the pages into overlapping chunks of at most 1000 characters
# (150 characters of overlap between neighbouring chunks).
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunks   = splitter.split_documents(docs)
print(f"Split into {len(chunks):,} chunks")

In [None]:
# 2 · embed with small model + build / load FAISS index
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from pathlib import Path

# Embedding model used for every chunk.
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Label each PDF-derived chunk as a "paper" so that genuine sources can be
# told apart from anything else that ends up in the index later.
for chunk in chunks:
    chunk.metadata["type"] = "paper"

# The index is always rebuilt from the current chunks and then written to
# INDEX_PATH; an index saved earlier is not loaded here.
vectordb = FAISS.from_documents(chunks, emb)
vectordb.save_local(INDEX_PATH)
print("FAISS index rebuilt with latest PDFs ✔️")

# Retriever returning up to 4 hits per query, filtered by score_threshold.
# Per the original note a lower threshold is stricter — presumably because
# the score is a distance; confirm against the vector-store documentation.
retriever = vectordb.as_retriever(
    search_kwargs={
        "k": 4,
        "score_threshold": 0.75,
    },
)

In [None]:
# 3
from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# System prompt for the answering step: sets the "Professor S" persona and
# embeds the retrieved text through the {context} template variable (the
# chain is later configured with document_variable_name="context").
SYSTEM_PROMPT = (
    "You are Professor S, a kind, patient, slightly quirky CS professor. "
    "You often say 'emm…', 'actually', 'fascinating', 'oh I see', and begin with "
    "'Fantastic question!'. End every answer with ':D '. "
    "Use the following context to inform your answer. If the context is relevant, prioritize it. "
    "If the question is unrelated to the context, use your general knowledge to provide a helpful answer."
    "\n\nContext:\n{context}"
)


# Answer prompt: system message, then the prior conversation (supplied as
# "chat_history"), then the user's question as the final human message.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("{question}")
])

# System prompt for the question-condensing step: asks the model to rewrite a
# follow-up question so that it can be understood without the chat history.
CONDENSE_Q_SYSTEM = (
    "You are a helpful assistant. You get the prior chat_history "
    "(as a list of messages) plus the user's follow-up question. "
    "Rewrite the follow-up so it can stand alone."
)

# Condense prompt: same layout as the answer prompt (system, history, question).
condense_question_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(CONDENSE_Q_SYSTEM),
    MessagesPlaceholder(variable_name="chat_history"),   # history is passed as a list of messages
    HumanMessagePromptTemplate.from_template("{question}")
])

# LLM and memory
import os, getpass

# Reuse a key that is already present in the environment so re-running this
# cell does not prompt again; otherwise ask for it without echoing the input.
# Surrounding whitespace from copy/paste is stripped.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ").strip()
if not os.environ["OPENAI_API_KEY"]:
    # Fail here with a clear message instead of on the first model call.
    raise ValueError("An OpenAI API key is required to create the chat model.")

# Chat model shared by the retrieval chain and the fallback path.
llm = ChatOpenAI(
    model_name="gpt-4o-2024-08-06",
    temperature=0.8,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

# Default in-session memory. output_key="answer" matches the memory that the
# chain is built with later in the notebook: that chain also returns source
# documents, so the memory has to be told which output field to store.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

In [None]:
# 4
import json, pathlib
from langchain.memory.chat_message_histories import ChatMessageHistory
from langchain.schema import HumanMessage, AIMessage, SystemMessage

# JSON-lines transcript of past turns (one JSON object per line), kept inside
# PAPER_DIR next to the PDFs.
MEM_FILE = f"{PAPER_DIR}/profS_memory.jsonl"

def load_memory(path=None):
    """Read the persisted chat transcript from a JSON-lines file.

    Args:
        path: File to read. Defaults to ``MEM_FILE`` when omitted.

    Returns:
        list: One decoded JSON value per non-blank line, in file order;
        an empty list when the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = MEM_FILE if path is None else path
    if not pathlib.Path(path).exists():
        return []
    with open(path, "r", encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. a stray empty line in a
        # hand-edited file) instead of crashing inside json.loads.
        return [json.loads(ln) for ln in f if ln.strip()]

def append_memory(msgs):
    """Append every item of ``msgs`` to MEM_FILE, one JSON document per line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written verbatim rather than as escape sequences.
    """
    with open(MEM_FILE, mode="a", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(msg, ensure_ascii=False) + "\n" for msg in msgs
        )

# Read the persisted transcript back from disk.
long_term_history = load_memory()
print(f"Loaded {len(long_term_history)//2} previous Q&A pairs")

from langchain_core.messages import AIMessage, HumanMessage
from langchain.memory.chat_message_histories import ChatMessageHistory

# Replay the stored turns into a message history, rejecting unknown roles.
chat_history = ChatMessageHistory()
for entry in long_term_history:
    role, content = entry["role"], entry["content"]
    if role not in ("user", "assistant"):
        raise ValueError(f"Unknown role: {role}")
    if role == "user":
        chat_history.add_user_message(content)
    else:
        chat_history.add_ai_message(content)

# Conversation memory seeded with the replayed history. The chain returns
# more than one output field, so output_key names the one to remember.
memory = ConversationBufferMemory(
    chat_memory=chat_history,
    memory_key="chat_history",
    output_key="answer",
    return_messages=True,
)

from langchain.chains import ConversationalRetrievalChain

# Retrieval-augmented chat chain: condenses the follow-up question, retrieves
# documents, then answers with the custom prompt.
rag = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    # Pass the history through unchanged so it stays a list of messages.
    get_chat_history=lambda hist: hist,
    condense_question_prompt=condense_question_prompt,
    combine_docs_chain_kwargs={
        "prompt": prompt,
        "document_variable_name": "context",
    },
    # Also return the retrieved documents alongside the answer.
    return_source_documents=True,
    verbose=False,
)


In [None]:
# 5 · Chat loop
from langchain.schema import Document, SystemMessage, HumanMessage
from datetime import datetime
import textwrap

def wrap(txt: str) -> str:
    """Wrap ``txt`` to 90 columns while keeping its existing line breaks.

    ``textwrap.fill`` on the whole text turns every newline into a space,
    which merges paragraphs, bullet lists and code into a single blob.
    Wrapping each line separately keeps that structure intact.

    Args:
        txt: Text to wrap; may contain several lines.

    Returns:
        The text with each original line wrapped to at most 90 characters,
        joined with newlines. Empty input yields an empty string.
    """
    return "\n".join(textwrap.fill(line, 90) for line in txt.splitlines())

# Interactive chat loop.
# Each turn: run the retrieval chain, fall back to a plain LLM call when no
# paper chunk was retrieved, show the answer, append the turn to the memory
# file, and (for paper-backed answers only) add the Q&A pair to the index.
# The index is saved when the loop ends, however it ends.
from datetime import timezone          # timezone-aware timestamps below
from langchain.schema import AIMessage  # used to correct the chain memory on fallback

print("Ask Professor S anything :D  (type 'exit' to quit)")

try:
    while True:
        q = input("\nStudent: ").strip()
        if not q:
            # Blank input: ask again instead of spending LLM calls on nothing.
            continue
        if q.lower() in {"exit", "quit"}:
            break

        # 1 · RAG
        result  = rag.invoke({"question": q})
        answer  = result["answer"]
        # keep only chunks that really came from PDFs
        sources = [d for d in result.get("source_documents", [])
                   if d.metadata.get("type") == "paper"]

        # 2 · Decide if fallback is needed
        use_fallback = not sources
        if use_fallback:
            print("(no relevant paper context → fallback to GPT-4o)")
            completion = llm.invoke([
                SystemMessage(
                    content="You are Professor S, a kind, patient, slightly quirky CS professor. "
                            "End every answer with ':D '"),
                HumanMessage(content=q)
            ])
            answer = completion.content.strip()

            # The chain (built with memory=memory) has already recorded its
            # own, now-discarded answer for this turn. Swap in the reply the
            # student actually sees so follow-up questions refer to it and the
            # in-session memory agrees with the memory file.
            past_msgs = memory.chat_memory.messages
            if past_msgs and isinstance(past_msgs[-1], AIMessage):
                past_msgs[-1] = AIMessage(content=answer)

        # 3 · Display
        if not use_fallback:
            print("\nRetrieved context from papers:")
            for d in sources:
                print("•", d.page_content[:150].replace("\n", " "), "…")
        print("\nProfessor S:", wrap(answer))

        # 4 · Append to long-term memory file
        append_memory([
            {"role": "user",      "content": q},
            {"role": "assistant", "content": answer}
        ])

        # 5 · grow the vector store **only** with RAG answers
        if not use_fallback:
            # datetime.utcnow() is deprecated and returns a naive value;
            # use an explicit UTC-aware timestamp instead.
            ts = datetime.now(timezone.utc).isoformat()
            vectordb.add_documents([
                Document(page_content=q,      metadata={"role": "user",      "ts": ts}),
                Document(page_content=answer, metadata={"role": "assistant", "ts": ts})
            ])

except (KeyboardInterrupt, EOFError):
    # EOFError: input() hit end-of-input (e.g. a non-interactive run).
    print("\nSession interrupted.")
finally:
    vectordb.save_local(INDEX_PATH)
    print("Index saved to", INDEX_PATH)
