In [1]:
# Import Libraries
import os
import glob
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import MessagesPlaceholder

In [2]:
# Read the OpenAI credential. load_dotenv() first merges variables from a
# local .env file (if any) into the process environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Stop here when the key is missing or empty; the embedding and chat-model
# cells pass api_key explicitly.
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")
print("API key loaded Successfully")

API key loaded Successfully


### Document Collection

In [5]:
# Load every PDF found in the documents/ folder.
# Paths are sorted so the load order (and therefore chunk order) is the same
# on every run; glob itself returns files in arbitrary filesystem order.
pdf_paths = sorted(glob.glob("documents/*.pdf"))

# Fail early with a clear message instead of building an empty vector store.
if not pdf_paths:
    raise FileNotFoundError("No PDF files found in documents/")

# PyPDFLoader.load() returns one Document per PDF page by default, so
# `documents` holds pages, not files.
documents = []
for pdf_path in pdf_paths:
    documents.extend(PyPDFLoader(pdf_path).load())

print(f"Loaded {len(documents)} pages from {len(pdf_paths)} PDF documents.")

Loaded 5 PDF Documents.


In [8]:
# Create the splitter. Separators are tried in the listed order, from the
# coarsest boundary (blank line) down to single characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,      # upper bound on chunk length (characters with the default length function)
    chunk_overlap=70,    # overlap carried between neighbouring chunks
    separators=["\n\n", "\n", ".", " ", ""]
)

# Split documents
chunks = text_splitter.split_documents(documents)

print(f"Split {len(documents)} documents into {len(chunks)} chunks")

# Preview only the first few chunks: printing every chunk floods the cell
# output and stores the full document text in the saved notebook.
PREVIEW_CHUNKS = 3
for i, chunk in enumerate(chunks[:PREVIEW_CHUNKS], start=1):
    print(f"\nChunk {i}: {chunk.page_content}")

Split 5 documents into 15 chunks

Chunk 1: I, Bello Oluwadamilare, am a Data Scientist and AI Engineer with a
specialized focus on software engineering and the development of
intelligent systems that harmonize technical hardware principles with
advanced predictive modeling. My expertise is centered on the
engineering sector, where I leverage data-driven decision-making and
applied machine learning to optimize complex systems and improve
operational outcomes. I hold a Bachelor of Engineering (B.Eng) in

Chunk 2: operational outcomes. I hold a Bachelor of Engineering (B.Eng) in
Electrical and Electronics Engineering from the Federal University of
Agriculture, Abeokuta, and a National Diploma in the same field from the
Federal Polytechnic, Ilaro, providing me with a robust analytical
foundation. Over the past few years, I have spearheaded diverse technical
projects, including a Chicago crime data analysis for identifying
behavioral hotspots, a predictive system for loan eligibility based 

### Embeddings

In [None]:
# Embedding model used both to index the chunks and to embed queries.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=api_key)

# Smoke test: embed one short query and inspect the resulting vector.
test_embedding = embeddings.embed_query("Who is Bello?")
embedding_dim = len(test_embedding)
embedding_head = test_embedding[:5]
print(f"Embedding dimension: {embedding_dim}")
print(f"First 5 values: {embedding_head}")

### Vector Store

In [None]:
# Create vector store from documents.
# The collection is persisted in ./chroma_db, so without explicit ids every
# re-run of this cell would add a second copy of each chunk (random ids) and
# the retriever would start returning duplicates. Position-based ids make the
# write an upsert: re-running overwrites the same entries instead.
# NOTE(review): entries written by earlier runs with random ids, or a stale
# tail left behind when the chunk count shrinks, are not removed — delete
# ./chroma_db once to start clean.
chunk_ids = [f"chunk-{i}" for i in range(len(chunks))]

vectorstore = Chroma.from_documents(
    chunks,
    embeddings,
    ids=chunk_ids,
    collection_name="my_info_collection",
    persist_directory="./chroma_db"
)

In [None]:
# Test retriever: fetch the five chunks closest to a sample question.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

query = "Which School did Bello attend?"
results = retriever.invoke(query)

# Bare last expression so the notebook displays the retrieved documents.
results

### Conversational RAG

In [None]:
# Chat model used to generate answers (temperature 0).
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)

# Similarity retriever over the vector store; returns 4 chunks per query.
# This rebinds `retriever`, replacing the k=5 one from the test cell.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=4))

# Prompt for the single-turn RAG chain. {context} receives the formatted
# retrieved chunks and {question} the user's query.
rag_template = """
You are an AI assistant answering questions about Bello Oluwadamilare T. using the provided documents.

Use ONLY the context below to answer the question.
If the answer is not in the context, say "I don't know."

<context>
{context}
</context>

Question: {question}

Answer in clear sentences.
At the end, list the sources you used as bullet points.
"""

prompt = ChatPromptTemplate.from_template(rag_template)

# format documents
def format_docs(docs):
    """Render retrieved documents as one context string.

    Each document becomes a "Source: <source>" line followed by its text;
    documents are separated by a blank line. A document whose metadata has
    no 'source' key is labelled 'unknown'.

    Args:
        docs: iterable of objects exposing `.metadata` (a dict) and
            `.page_content` (a string).

    Returns:
        The joined context string (empty when `docs` is empty).
    """
    sections = []
    for doc in docs:
        origin = doc.metadata.get('source', 'unknown')
        sections.append(f"Source: {origin}\n{doc.page_content}")
    return "\n\n".join(sections)

# RAG chain using LCEL.
# The incoming query string is fanned out to two branches: one retrieves and
# formats the context, the other passes the query through unchanged. The
# resulting dict fills the prompt, which is sent to the model and parsed to
# a plain string.
rag_inputs = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Test RAG chain: ask one question end-to-end and print the model's answer.
query = "What ML projects has Oluwadamilare worked on?"

response = rag_chain.invoke(query)

print(response)

### Store Conversational RAG History

In [None]:
# In-memory registry of chat histories, keyed by session id.
# Contents are lost when the kernel restarts.
chat_store = {}

def get_session_history(session_id: str):
    """Return the chat history for `session_id`, creating it on first use.

    Args:
        session_id: key identifying one conversation.

    Returns:
        The InMemoryChatMessageHistory stored in `chat_store` for that key.
    """
    try:
        return chat_store[session_id]
    except KeyError:
        # First time this session is seen: register an empty history.
        history = InMemoryChatMessageHistory()
        chat_store[session_id] = history
        return history

# Create conversational prompt.
# Message order: grounding instructions, prior turns of the session
# (injected as `chat_history`), formatting instructions, then the current
# turn with its retrieved context.
# The subject's name matches the single-turn prompt and the indexed
# documents (it previously named a different person).
conv_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an AI assistant answering questions about Bello Oluwadamilare T. using the provided documents. "
        "Use ONLY the context below to answer the question. "
        "If the answer is not in the context, say I don't know.",
    ),
    MessagesPlaceholder(variable_name="chat_history"),
    ("system", "Answer in clear sentences. At the end, list the sources you used as bullet points."),
    ("human", "Context: {context}\n\nQuestion: {question}")
])

# Build base chain.
# The input is a dict carrying "question" (and "chat_history" once the
# history wrapper has injected it). Three branches prepare the prompt
# variables: retrieved context, the question itself, and the prior turns
# (empty list when none were supplied).
conv_inputs = RunnableParallel(
    {
        "context": lambda inputs: format_docs(retriever.invoke(inputs["question"])),
        "question": lambda inputs: inputs["question"],
        "chat_history": lambda inputs: inputs.get("chat_history", []),
    }
)

conv_chain_base = conv_inputs | conv_prompt | llm | StrOutputParser()

# Wrap with message history: the history for the configured session is
# looked up via get_session_history and supplied under "chat_history";
# the "question" field is treated as the incoming user message.
conv_chain = RunnableWithMessageHistory(
    conv_chain_base,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history",
)


### Queries

In [None]:
# Both turns use the same session id, so the follow-up is answered with the
# first exchange available as chat history.
session_config = {"configurable": {"session_id": "user_1"}}

# First question
first_question = {"question": "Which School did Oluwadamilare attend?"}
response = conv_chain.invoke(first_question, config=session_config)
print("Response 1:\n", response)

# Follow-up question ("he" refers back to the previous turn)
follow_up_question = {"question": "Which of the schools did he obtain National Diploma from?"}
response2 = conv_chain.invoke(follow_up_question, config=session_config)
print("\nResponse 2:\n", response2)