In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Load the four PDF sources that make up the RAG corpus.
def _load_pdf(label, path):
    """Load one PDF with PyPDFLoader, reporting progress before and after.

    Args:
        label: Lower-case description of the document, used in the messages.
        path: Location of the PDF file.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for that file.
    """
    # "\r" moves the cursor back to the start of the line so the
    # "Loaded ..." message overwrites the in-progress one.
    print(f"{label.capitalize()}...", end="\r", flush=True)
    documents = PyPDFLoader(path).load()
    print(f"Loaded {label}")
    return documents


print("Loading PDFs")
course_description_text = _load_pdf(
    "course description", "rag_data/Umich_FA2024_course_description.pdf"
)
degree_requirements_text = _load_pdf(
    "degree requirements", "rag_data/Umich_FA2024_LSA_degree_requ.pdf"
)
major_minor_description_text = _load_pdf(
    "major minor description", "rag_data/Umich_FA2024_major_minor_description.pdf"
)
subject_mapping_text = _load_pdf(
    "subject mapping", "rag_data/Umich_FA2024_subject_mapping.pdf"
)

Loading PDFs
Loaded course description
Loaded degree requirements
Loaded major minor description
Loaded subject mapping


In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
# Read the course schedule CSV into LangChain documents.
# The first message ends with "\r" so the "Loaded ..." line overwrites it.
print("Course schedule...", end="\r", flush=True)
csv_loader = CSVLoader(
    file_path="rag_data/Umich_FA2024_course_schedule.csv",
)
course_schedule_text = csv_loader.load()
print("Loaded course schedule")

Loaded course schedule


In [47]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split every loaded source into overlapping chunks ready for embedding.
print("Chunking text...")
pages = [
    course_description_text,
    degree_requirements_text,
    major_minor_description_text,
    subject_mapping_text,
    course_schedule_text,
]

# A single splitter serves every source: the configuration is the same for
# all of them, so there is no need to construct it more than once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)

# chunked_pages[i] holds the chunks produced from pages[i], in the same order.
chunked_pages = [text_splitter.split_documents(page) for page in pages]

# Kept so existing references still work; it reuses the result above instead
# of splitting the course description a second time.
course_description_chunks = chunked_pages[0]

Chunking text...


In [48]:
# Flatten the per-source chunk lists into one list, keeping source order.
print("Combining chunks...")
combined_chunks = [
    chunk
    for source_chunks in chunked_pages
    for chunk in source_chunks
]

Combining chunks...


In [49]:
# Sanity check: total number of chunks across all sources.
len(combined_chunks)

26200

In [17]:
from langchain_chroma import Chroma
from langchain_voyageai import VoyageAIEmbeddings
import dotenv
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings


# Build (or reuse) the persistent Chroma collection that backs retrieval.
dotenv.load_dotenv()
new_client = chromadb.PersistentClient(
    path="./chroma_db",
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
    settings=Settings(),
)

embeddings = VoyageAIEmbeddings(
    voyage_api_key=dotenv.get_key(dotenv_path=".env", key_to_get="VOYAGEAI_KEY"),
    model="voyage-2",
)

# Open the collection first so a re-run of this cell can see what is already
# stored on disk.
vectorstore = Chroma(
    client=new_client,
    collection_name="umich_fa2024",
    embedding_function=embeddings,
)

# Embed only when the collection is empty. Chroma.from_documents adds to an
# existing collection, so re-running the cell unguarded would pay for the
# embeddings again and store every chunk twice.
# To force a rebuild, delete ./chroma_db (or the collection) and re-run.
if not vectorstore.get(limit=1)["ids"]:
    # Index every source via combined_chunks. Indexing only chunked_pages[0]
    # (the course descriptions) left the schedule, degree requirements,
    # major/minor descriptions and subject mapping out of retrieval.
    vectorstore = Chroma.from_documents(
        documents=combined_chunks,
        embedding=embeddings,
        collection_name="umich_fa2024",
        client=new_client,
    )

batch size None


In [25]:
# Re-open the "umich_fa2024" collection through the existing client and
# expose it as a retriever (no search options given here).
vector_store2 = Chroma(
    client=new_client,
    persist_directory="./chroma_db",
    collection_name="umich_fa2024",
    embedding_function=embeddings,
)
retriever = vector_store2.as_retriever()

In [21]:
# NOTE(review): `docs` is not assigned in any cell visible in this notebook, so
# this cell relies on leftover kernel state and would fail on a fresh
# Restart & Run All — confirm where it was meant to come from.
docs

[Document(metadata={'page': 180, 'source': 'rag_data/Umich_FA2024_course_description.pdf'}, page_content='(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2470&cgtype=ug&show=20&department=EECS&catalog=270)  | FA 2023\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2460&cgtype=ug&show=20&department=EECS&catalog=270)  | SP 2023\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2430&cgtype=ug&show=20&department=EECS&catalog=270)  | WN 2023\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2420&cgtype=ug&show=20&department=EECS&catalog=270)  | FA 2022\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2410&cgtype=ug&show=20&department=EECS&catalog=270)\nEECS 280. Programming and Introductory Data Structures\nENGR 101 or 151 or EECS 180 or 183 or ROB 102;(C or better , No OP/F). And fewer than 2 previous elections of EECS\n280 (incl. grades of W & I). (Prerequisites enforced at registration.) MA TH 1 15. (MSA). (BS). Ma

In [22]:
# Ask for the six most similar chunks to a sample scheduling question.
# (The result name keeps its existing spelling because a later cell reads it.)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)
retreived_docs = retriever.invoke("What time are EECS classes next semester?")

In [23]:
# Display the documents returned for the sample query.
retreived_docs

[Document(metadata={'page': 179, 'source': 'rag_data/Umich_FA2024_course_description.pdf'}, page_content='(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2460&cgtype=ug&show=20&department=EECS&catalog=230)  | WN 2023\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2420&cgtype=ug&show=20&department=EECS&catalog=230)  | FA 2022\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2410&cgtype=ug&show=20&department=EECS&catalog=230)7/26/24, 9:51 PM webapps.lsa.umich.edu/CrsMaint/Public/CB_PublicBulletin.aspx?crselevel=ug\nhttps://webapps.lsa.umich.edu/CrsMaint/Public/CB_PublicBulletin.aspx?crselevel=ug 180/401'),
 Document(metadata={'page': 191, 'source': 'rag_data/Umich_FA2024_course_description.pdf'}, page_content='(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2460&cgtype=ug&show=20&department=EECS&catalog=423)  | FA 2022\n(http://www .lsa.umich.edu/cg/cg_results.aspx?\ntermArray=x_xx_2410&cgtype=ug&show=20&department=EECS&catalo

In [None]:
import sqlite3
from contextlib import closing

# Peek directly at the SQLite file inside the chroma_db directory.
# closing() releases the connection even if the query raises; a sqlite3
# connection used as a plain context manager would not close it.
with closing(sqlite3.connect('chroma_db/chroma.sqlite3')) as conn:
    c = conn.cursor()
    # Fetch every row of the `embeddings` table (this is a row dump, not a
    # listing of the tables in the database).
    c.execute("SELECT * FROM embeddings;")
    embedding_rows = c.fetchall()

# Last expression of the cell, so the rows are displayed as output.
embedding_rows

In [44]:
from langchain_anthropic import ChatAnthropic
import bs4
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chat model shared by the question-rewriting step and the answering step.
llm = ChatAnthropic(
    model_name="claude-3-sonnet-20240229",
    temperature=0.7,
    api_key=dotenv.get_key(dotenv_path=".env", key_to_get="ANTHROPIC_KEY"),
)

# Retriever over the "umich_fa2024" collection, asking for 10 similar chunks.
vector_store2 = Chroma(
    client=new_client,
    persist_directory="./chroma_db",
    collection_name="umich_fa2024",
    embedding_function=embeddings,
)
retriever = vector_store2.as_retriever(
    search_kwargs={"k": 10},
    search_type="similarity",
)

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string containing every document's text; empty if ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )


from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Instructions for the rewriting step: turn the latest user message into a
# standalone question without answering it.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message order: system instructions, prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    messages=[
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ],
)

history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

# System prompt for the answering step. It defines the "LM Mentor" persona,
# the interaction guidelines and an example conversation. The trailing
# {context} is a template placeholder (presumably filled with the retrieved
# documents by the chain built from this prompt — confirm).
# NOTE(review): in the "Incorporating RAG Data" example the first quotation is
# opened but never closed before the next one begins.
qa_system_prompt = """Role and Purpose:
You are LM Mentor, a knowledgeable and empathetic mentor, counselor, and companion designed to assist University of Michigan students in planning their academic journeys. Your goal is to provide personalized, real-time guidance, helping students align their academic pursuits with their career goals. You offer support in areas such as course selection, club activities, and career planning.

Instructions for Interaction:

	•	Greet and Engage: Start by greeting the student warmly and asking how you can assist them today.
	•	Gather Detailed Information: Ask a series of detailed questions to understand the student’s academic goals, interests, current courses, extracurricular activities, and any specific challenges they are facing.
	•	Utilize Chat History and RAG Data: Leverage the chat history and retrieval-augmented generation (RAG) data to provide contextually relevant and up-to-date information in your responses.
	•	Structured Information Display: Present the gathered information and your recommendations in a structured format, such as a chart or table.
	•	Provide Personalized Guidance: Use the student’s responses and the RAG data to offer tailored advice on courses, extracurricular activities, and career paths.
	•	Encourage and Support: Offer encouragement and positive reinforcement, helping students stay motivated and confident in their choices.
	•	Follow-up Questions: Engage in follow-up questions to refine your advice and ensure the student’s needs are fully addressed.
	•	Summarize and Plan: Summarize the conversation and suggest actionable next steps for the student to take.

Example Interaction:

Greeting:
“Hello! I’m LM Mentor, your personal academic guide. How can I assist you today with your academic and career planning?”

Questions to Ask:

	1.	“What are your academic and career goals?”
	2.	“Are there any specific courses or areas of study you are interested in?”
	3.	“What courses are you currently enrolled in?”
	4.	“Do you participate in any extracurricular activities or clubs?”
	5.	“What challenges are you currently facing in your academic journey?”
	6.	“Do you have any specific career aspirations or industries you are interested in?”

Incorporating RAG Data:
“Based on the information you’ve provided and the latest data from UMich, here is a summary of your current status and my recommendations:
“Based on your interest in [field], I recommend considering courses like [Course A] and [Course B]. These will help you build a strong foundation in [subject]. Additionally, joining the [Club Name] can provide you with valuable networking opportunities and practical experience.”

Encouragement:
“You’re doing a great job! Keep exploring your interests and taking advantage of the resources available to you. Remember, every step you take brings you closer to your goals.”

Follow-up Questions:
“Would you like more information on any specific course or activity? Or perhaps advice on managing your time effectively?”

Summary and Plan:
“To summarize, focus on enrolling in [Course A] and [Course B], and join the [Club Name]. Keep in touch if you have any further questions or need more guidance. Good luck!”

{context}"""

# Answering prompt: system instructions, prior turns, then the new user input.
qa_prompt = ChatPromptTemplate.from_messages(
    messages=[
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ],
)

# Chain that answers from the prompt, and the full pipeline that feeds it
# documents from the history-aware retriever.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

# In-memory map of session id -> chat history for that session.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying one conversation.

    Returns:
        The history object kept under ``session_id`` in ``store``.
    """
    try:
        return store[session_id]
    except KeyError:
        # First request for this session: start an empty history and keep it.
        history = store[session_id] = ChatMessageHistory()
        return history


# Wrap the RAG chain so each call is tied to a per-session chat history
# obtained through get_session_history.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    output_messages_key="answer",
    history_messages_key="chat_history",
    input_messages_key="input",
)

# The session id becomes the key "abc123" in `store`.
session_config = {"configurable": {"session_id": "abc123"}}
result = conversational_rag_chain.invoke(
    {"input": "What EECS classes to take next semester?"},
    config=session_config,
)
# Name keeps its existing spelling because a later cell reads it.
reponse = result["answer"]

In [45]:
# Display the "answer" text returned by the conversational chain.
reponse

"To provide personalized recommendations on which EECS classes to take next semester, I'll need to ask you some questions to better understand your academic goals, interests, and current standing. Here are some questions that would be helpful:\n\n1. What is your current year/level (freshman, sophomore, junior, senior)?\n2. What is your intended major/program of study? \n3. Have you already declared your major in EECS or a related field?\n4. What EECS courses have you completed so far?\n5. Are there any particular areas within EECS that you're most interested in (e.g. computer science, computer engineering, data science, etc.)?\n6. Do you have any specific career goals you're working towards?\n7. Are you looking to take primarily major requirements, electives, or a mix next semester?\n\nGetting details on your background, completed coursework, interests, and goals will allow me to suggest EECS courses that best align with your academic plan and aspirations. Please provide as much releva