In [16]:
! pip install chromadb


Collecting chromadb
  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/6d/f8/c00068afa88b9d203d874dd9085e239e9cd1d5c843090fb5947a4de85594/chromadb-1.0.0-cp39-abi3-win_amd64.whl.metadata
  Downloading chromadb-1.0.0-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl.metadata
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Obtaining dependency information for chroma-hnswlib==0.7.6 from https://files.pythonhosted.org/packages/3e/79/1bce519cf186112d6d5ce2985392a89528c6e1e9332d680bf752694a4cdf/chroma_hnswlib-0.7.6-cp311-cp311-win_amd64.whl.metadata
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-win_amd64.whl.metadata (262 bytes)
Collecting fastapi==0.115.


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
! pip install tiktoken





[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [143]:
! pip install python-dotenv





[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [144]:
import os
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from dotenv import load_dotenv

In [145]:
load_dotenv()

True

In [146]:
# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = "OPEN_KEY"

In [140]:
# formating Output
def format_sales_content(row):
    return (
        f"On {row.get('sale_date', 'unknown date')}, a {row.get('fuel_variant', 'fuel')} "
        f"{row.get('model', 'model')} {row.get('car_type', row.get('cartype', 'type'))} - "
        f"{row.get('model_variant', 'variant')} was sold in {row.get('city', 'city')} "
        f"in {row.get('region', 'region')} region for ₹{row.get('sale_amount', 'amount')}."
    )

def format_marketing_content(row):
    return (
        f"{row.get('marketing_channel', 'Campaign')} campaign '{row.get('campaign_name', 'Unnamed')}' "
        f"targeting {row.get('target_cartype', 'vehicles')}s for {row.get('target_audience', 'audience')} "
        f"offered a {row.get('discount_percent', '0%')}% discount and had "
        f"{row.get('actual_sales', '0')} actual sales."
    )

def format_customer_content(row):
    return (
        f"Customer {row.get('customer_id', 'Unknown')} from {row.get('city', 'city')} in "
        f"{row.get('region', 'region')} region prefers {row.get('preferred_cartype', 'cars')} "
        f"with {row.get('fuel_preference', 'fuel')} and has a budget of ₹{row.get('budget', 'N/A')}."
    )
#CSV Loader with Format Routing
def load_csvs_to_documents(file_paths):
    all_docs = []
    for path in file_paths:
        df = pd.read_csv(path)
        source_name = os.path.basename(path).replace(".csv", "")
        for _, row in df.iterrows():
            if source_name == "sales":
                content = format_sales_content(row)
            elif source_name == "marketing_campaign":
                content = format_marketing_content(row)
            elif source_name == "customer":
                content = format_customer_content(row)
            else:
                content = "\n".join(f"{col}: {row[col]}" for col in df.columns)
            all_docs.append(Document(page_content=content, metadata={"source": source_name}))
    return all_docs
        
        

In [133]:
#Chunk documents
def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)

In [134]:
#Embed and store in ChromaDB
def create_chroma_index(chunks, persist_dir="chroma_db"):
    embedding_model = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_dir
    )
    vectordb.persist()
    return vectordb

In [128]:
# Query the ChromaDB
def query_chroma(vectordb, query_text, k=5, filters=None):
    results = vectordb.similarity_search(query_text, k=k, filter={"source": "marketing_campaign"})
    for i, result in enumerate(results):
        print(f"\n Result {i+1}:\n{result.page_content}")
        

In [None]:
# --- Main Execution ---
csv_files = [r"C:\Users\Dell\Downloads\sales.csv", r"C:\Users\Dell\Downloads\marketing_campaign.csv", r"C:\Users\Dell\Downloads\customer.csv""]

In [141]:
# Load → Chunk → Store
documents = load_csvs_to_documents(csv_files)
chunks = chunk_documents(documents)
chroma_db = create_chroma_index(chunks)

In [142]:
# Sample query
query_chroma(chroma_db, "SUV campaigns in south region for Summer Sale")


 Result 1:
SMS campaign 'Summer Sale 2025' targeting SUV s for Young Professionals offered a 10%% discount and had 264 actual sales.

 Result 2:
Email campaign 'Summer Sale 2025' targeting SUV s for Budget Buyers offered a 20%% discount and had 350 actual sales.

 Result 3:
Email campaign 'Summer Sale 2025' targeting SUV s for Budget Buyers offered a 30%% discount and had 544 actual sales.

 Result 4:
SMS campaign 'Summer Sale 2025' targeting SUV s for Luxury Car Owners offered a 30%% discount and had 111 actual sales.

 Result 5:
Social Media campaign 'Summer Sale 2025' targeting SUV s for Luxury Car Owners offered a 25%% discount and had 218 actual sales.
