In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp311-cp311-win_amd64.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   ----- ---------------------------------- 1.6/11.3 MB 10.5 MB/s eta 0:00:01
   -------------- ------------------------- 4.2/11.3 MB 11.4 MB/s eta 0:00:01
   ------------------------ --------------- 6.8/11.3 MB 11.7 MB/s eta 0:00:01
   -------------------------------- ------- 9.2/11.3 MB 11.6 MB/s eta 0:00:01
   ---------------------------------------- 11.3/11.3 MB 11.1 MB/s eta 0:00:00
Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas

   ---------------

In [8]:
import pandas as pd

# Read the customer CSV; index_col=False keeps pandas from using the
# first column as the row index.
df = pd.read_csv(
    filepath_or_buffer='RAG_TECHNIQUES/data/customers-100.csv',
    index_col=False,
)

# Preview the first rows of the frame.
df.head()

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


In [9]:
import os
from dotenv import load_dotenv

# LangChain components
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain

# Load environment variables from the .env file into the process environment.
load_dotenv()

# Read the Gemini API key and fail fast with a clear message when it is absent.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("Gemini API Key not found. Please set it in the .env file.")

# Set up the Gemini chat model used for answer generation.
# The key is passed explicitly so the value validated above is the one the
# client actually uses, instead of relying on the library's own lookup.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.5,
    max_tokens=500,
    timeout=None,
    max_retries=2,
)

In [10]:
# 1. Load the CSV file
# CSVLoader turns every row into its own Document; page_content holds the
# row rendered as text.
file_path = 'RAG_TECHNIQUES/data/customers-100.csv'
loader = CSVLoader(file_path=file_path)
docs = loader.load()

# Show how many rows were loaded, then the structure of the first Document.
first_doc = docs[0]
print(f"Loaded {len(docs)} documents (rows) from the CSV.")
print("\n--- Example Document ---")
print(f"Page Content:\n{first_doc.page_content}")
print(f"\nMetadata:\n{first_doc.metadata}")

Loaded 100 documents (rows) from the CSV.

--- Example Document ---
Page Content:
Index: 1
Customer Id: DD37Cf93aecA6Dc
First Name: Sheryl
Last Name: Baxter
Company: Rasmussen Group
City: East Leonard
Country: Chile
Phone 1: 229.077.5154
Phone 2: 397.884.0519x718
Email: zunigavanessa@smith.info
Subscription Date: 2020-08-24
Website: http://www.stephenson.com/

Metadata:
{'source': 'RAG_TECHNIQUES/data/customers-100.csv', 'row': 0}


In [None]:
# 2. Text chunking
# The loader already produces one Document per row, so each row acts as a chunk
# and splitting can be skipped while the rows stay small.
# In a real-world scenario with long text in some columns, splitting would be important.


# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# split_documents = text_splitter.split_documents(docs)

# print(f"Total documents after splitting (if any): {len(split_documents)}")

In [12]:
# 3. Vector Store Creation
# The embedding client receives the API key that was validated when the
# environment was loaded, so it does not depend on a separate implicit lookup.
embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
    google_api_key=api_key,
)

# Build a FAISS vector store from the loaded row Documents.
print("Creating vector store...")
vector_store = FAISS.from_documents(docs, embeddings)
print("Vector store created successfully.")

Creating vector store...
Vector store created successfully.


In [13]:
# Report how many vectors the FAISS index holds.
stored_count = vector_store.index.ntotal
print(f"Vector store contains {stored_count} document chunks.")

Vector store contains 100 document chunks.


In [14]:
# 4. Retriever Setup
# Each query retrieves the top 3 relevant rows from the vector store.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

# 5. RAG Chain Creation
# System instructions for the model; the {context} placeholder is left for
# the documents chain to fill in.
system_prompt = "".join([
    "You are an assistant for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use three sentences maximum and keep the ",
    "answer concise.",
    "\n\n",
    "{context}",
])

# Two-message chat prompt: the system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the LLM and prompt into a documents chain, then put the retriever in front of it.
document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
# Ask the chain a question about a single customer row and show only the answer text.
question = "which company does sheryl Baxter work for?"
answer = retrieval_chain.invoke({"input": question})
answer["answer"]

'Sheryl Baxter works for Rasmussen Group.'

In [17]:
# Ask for several fields of one customer at once and show only the answer text.
question = "What's Joana Bender phone number and email?"
answer = retrieval_chain.invoke({"input": question})
answer["answer"]

"Joanna Bender's phone numbers are 001-234-203-0635x76146 and 001-199-446-3860x3486. Her email address is colinalvarado@miles.net."

In [21]:
# Display the full result of the last query, not just the answer text:
# the output below includes the 'input' question and the retrieved 'context' Documents.
answer

{'input': "What's Joana Bender phone number and email?",
 'context': [Document(id='3edd3b2b-d117-4bcf-9f01-ddfbcec5441c', metadata={'source': 'RAG_TECHNIQUES/data/customers-100.csv', 'row': 4}, page_content='Index: 5\nCustomer Id: 053d585Ab6b3159\nFirst Name: Joanna\nLast Name: Bender\nCompany: Martin, Lang and Andrade\nCity: West Priscilla\nCountry: Slovakia (Slovak Republic)\nPhone 1: 001-234-203-0635x76146\nPhone 2: 001-199-446-3860x3486\nEmail: colinalvarado@miles.net\nSubscription Date: 2021-04-17\nWebsite: https://goodwin-ingram.com/'),
  Document(id='44f5e2ba-6c1a-4cdf-a3c1-1a4bdea0b7c3', metadata={'source': 'RAG_TECHNIQUES/data/customers-100.csv', 'row': 45}, page_content='Index: 46\nCustomer Id: fD780ED8dbEae7B\nFirst Name: Joanne\nLast Name: Montes\nCompany: Price, Sexton and Mcdaniel\nCity: Gwendolynview\nCountry: Palau\nPhone 1: (897)726-7952\nPhone 2: (467)886-9467x5721\nEmail: juan80@henson.net\nSubscription Date: 2020-07-01\nWebsite: http://ochoa.com/'),
  Document(id='b