In [2]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [1]:
from langchain_openai import ChatOpenAI


In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    encode_kwargs={"normalize_embeddings": True},  # for cosine similarity
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
# Test the embeddings to make sure they work
test_text = "sun is bright"
test_embedding = embeddings.embed_query(test_text)
print(len(test_embedding))
print(test_embedding[:5])

768
[-0.005224306136369705, -0.057863906025886536, -0.027607057243585587, -0.014645080082118511, 0.006460424512624741]


In [11]:
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains import RetrievalQA
file = 'sale.csv'

In [12]:
loader = CSVLoader(file_path=file)
docs = loader.load()
#print(docs)

db = DocArrayInMemorySearch.from_documents(docs, embeddings)

In [13]:
print(docs)

[Document(metadata={'source': 'sale.csv', 'row': 0}, page_content='Region: Australia and Oceania\nCountry: Tuvalu\nItem Type: Baby Food\nSales Channel: Offline\nOrder Priority: H\nOrder Date: 5/28/2010\nOrder ID: 669165933\nShip Date: 6/27/2010\nUnits Sold: 9925\nUnit Price: 255.28\nUnit Cost: 159.42\nTotal Revenue: 2533654.00\nTotal Cost: 1582243.50\nTotal Profit: 951410.50'), Document(metadata={'source': 'sale.csv', 'row': 1}, page_content='Region: Central America and the Caribbean\nCountry: Grenada\nItem Type: Cereal\nSales Channel: Online\nOrder Priority: C\nOrder Date: 8/22/2012\nOrder ID: 963881480\nShip Date: 9/15/2012\nUnits Sold: 2804\nUnit Price: 205.70\nUnit Cost: 117.11\nTotal Revenue: 576782.80\nTotal Cost: 328376.44\nTotal Profit: 248406.36'), Document(metadata={'source': 'sale.csv', 'row': 2}, page_content='Region: Europe\nCountry: Russia\nItem Type: Office Supplies\nSales Channel: Offline\nOrder Priority: L\nOrder Date: 5/2/2014\nOrder ID: 341417157\nShip Date: 5/8/2014

In [27]:
llm = ChatOpenAI(
    temperature=0.0,
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
    model="openai/gpt-oss-20b:free",
)

In [32]:
retriever = db.as_retriever(search_kwargs={"k": 35})

In [33]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [36]:
query =  "what are the total number of orders in europe?"

In [37]:
response = qa_stuff.run(query)
print(response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
There are **15 orders** recorded for the Europe region.
