# Configurations

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

gemini_key = os.getenv('GOOGLE_API_KEY')

In [2]:
from langchain.chat_models import init_chat_model

model = init_chat_model("gemini-2.0-flash", model_provider="google_genai", api_key = gemini_key)

In [3]:
model.invoke('How are you?')

AIMessage(content="I am doing well, thank you for asking! As a large language model, I don't experience emotions or feelings in the same way humans do, but I am functioning optimally and ready to assist you with your requests. How can I help you today?", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--27cf947c-3fb6-47a9-b14d-dc813d63389f-0', usage_metadata={'input_tokens': 4, 'output_tokens': 53, 'total_tokens': 57, 'input_token_details': {'cache_read': 0}})

# Document Parsing

In [5]:
from typing import List
from langchain_core.documents import Document
import os
import pandas as pd

In [6]:
data = pd.read_csv('../data/Wikipedia Crypto Articles.csv')

In [22]:
data.shape

(218, 2)

In [7]:
data.head()

Unnamed: 0,title,article
0,Cryptocurrency,"A cryptocurrency, crypto-currency, or crypto i..."
1,Bitcoin,Bitcoin (abbreviation: BTC or XBT; sign: ₿) is...
2,Digital currency,"Digital currency (digital money, electronic mo..."
3,Central bank digital currency,A central bank digital currency (CBDC; also ca...
4,Litecoin,Litecoin (Abbreviation: LTC; sign: Ł) is a dec...


In [12]:
data.isna().any(axis=1).sum()

np.int64(9)

In [15]:
# Remove NaNs
data = data.dropna()

In [16]:
data.isna().any(axis=1).sum()

np.int64(0)

In [17]:
documents = [
    Document(
        page_content=row['article'],
        metadata={"title": row['title']}
    )
    for _, row in data.iterrows()
]

In [18]:
len(documents)

218

In [19]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

In [20]:
splits = text_splitter.split_documents(documents)

In [21]:
print(f"Split the documents into {len(splits)} chunks.")

Split the documents into 2894 chunks.


# Create the embeddings for the rag system

In [24]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
document_embeddings = embedding_function.embed_documents([split.page_content for split in splits])
print(document_embeddings[0][:5])

  return forward_call(*args, **kwargs)


[-0.030076615512371063, 0.020071297883987427, -0.11568485200405121, -0.019832484424114227, -0.00924318190664053]


# Setting up the vector store for RAG System

In [25]:
from langchain_chroma import Chroma

collection_name = "articles_collection"
vectorstore = Chroma.from_documents(
    collection_name=collection_name,
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db"
)
print("Vector store created and persisted to './chroma_db'")


  return forward_call(*args, **kwargs)


Vector store created and persisted to './chroma_db'


In [26]:
query = "What is ethereum"
search_results = vectorstore.similarity_search(query, k=2)
print(f"\nTop 2 most relevant chunks for the query: '{query}'\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()

  return forward_call(*args, **kwargs)



Top 2 most relevant chunks for the query: 'What is ethereum'

Result 1:
Source: Unknown
Content: Ethereum is a decentralized blockchain with smart contract functionality. Ether (Abbreviation: ETH;  sign: Ξ) is the native cryptocurrency of the platform. Among cryptocurrencies, ether is second only to bitcoin in market capitalization. It is open-source software.

Result 2:
Source: Unknown
Content: === Ethereum ===



# Building the RAG chain

In [31]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [32]:
# Creating the retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [33]:
template = """Answer the question based only on the following context:
{context}
Question: {question}
Answer: """

In [34]:
prompt = ChatPromptTemplate.from_template(template)

In [35]:
def docs2str(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [36]:
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [50]:
# Testing the rag chain
question = "Tell me something about ethereum"
docs = retriever.invoke(question)
response = rag_chain.invoke(question)
print(f"Question: {question}")
print(f"Answer: {response}")

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Question: Tell me something about ethereum
Answer: Ethereum is a decentralized blockchain with smart contract functionality.


In [42]:
print("\n--- Context used (sources) ---")
for i, doc in enumerate(docs, 1):
    title = doc.metadata.get("title", "Unknown source")
    print(f"\n📄 Document {i} (Title: {title}):\n{doc.page_content[:500]}...\n")


--- Context used (sources) ---

📄 Document 1 (Title: Vitalik Buterin):
=== Ethereum ===...


📄 Document 2 (Title: Ethereum):
Ethereum is a decentralized blockchain with smart contract functionality. Ether (Abbreviation: ETH;  sign: Ξ) is the native cryptocurrency of the platform. Among cryptocurrencies, ether is second only to bitcoin in market capitalization. It is open-source software....



# Handling follow up questions

In [51]:
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain

In [52]:
contextualize_q_system_prompt = """
Given a chat history and the latest user question
which might reference context in the chat history,
formulate a standalone question which can be understood
without the chat history. Do NOT answer the question,
just reformulate it if needed and otherwise return it as is.
"""

In [53]:
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [54]:
contextualize_chain = contextualize_q_prompt | model | StrOutputParser()
print(contextualize_chain.invoke({"input": "What about its fees?", "chat_history": []}))

What are the fees associated with the product or service being discussed?


In [55]:
from langchain.chains.retrieval import create_retrieval_chain

history_aware_retriever = create_history_aware_retriever(
    model, retriever, contextualize_q_prompt
)

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(model, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [56]:
from langchain_core.messages import HumanMessage, AIMessage

chat_history = []
question1 = "What is ethereum"
answer1 = rag_chain.invoke({"input": question1, "chat_history": chat_history})['answer']
chat_history.extend([
    HumanMessage(content=question1),
    AIMessage(content=answer1)
])

print(f"Human: {question1}")
print(f"AI: {answer1}\n")

question2 = "What about its fees?"
answer2 = rag_chain.invoke({"input": question2, "chat_history": chat_history})['answer']
chat_history.extend([
    HumanMessage(content=question2),
    AIMessage(content=answer2)
])

print(f"Human: {question2}")
print(f"AI: {answer2}")


  return forward_call(*args, **kwargs)


Human: What is ethereum
AI: Ethereum is a decentralized blockchain with smart contract functionality. Ether (ETH) is the native cryptocurrency of the platform and is second only to bitcoin in market capitalization. It is open-source software.



  return forward_call(*args, **kwargs)


Human: What about its fees?
AI: For Ethereum, transaction fees differ by computational complexity, bandwidth use, and storage needs. In February 2023, the median transaction fee for Ether corresponded to $2.2845.
