In [None]:
! pip install tiktoken


In [None]:
! pip install -U sentence-transformers langchain chromadb transformers==4.34.1


In [3]:
import os
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from dotenv import load_dotenv
import shutil

In [13]:
def format_sales_content(row):
    content = (
        f"Sale ID: {row['sale_id']}, "
        f"Customer ID: {row['customer_id']}, "
        f"Model: {row['model']} ({row['model_variant']}), "
        f"Car Type: {row['car_type']}, "
        f"Fuel: {row['fuel_variant']}, "
        f"Region: {row['region']}, City: {row['city']}, "
        f"Date: {row['sale_date']}, Amount: {row['sale_amount']}"
    )
    return content

In [14]:
def format_marketing_content(row):
    content = (
        f"Campaign: {row['campaign_name']} (ID: {row['campaign_id']}), "
        f"Dates: {row['start_date']} to {row['end_date']}, "
        f"Channel: {row['marketing_channel']}, "
        f"Target: {row['target_audience']} interested in {row['target_cartype']}, "
        f"Discount: {row['discount_percent']}%, "
        f"Expected Sales: {row['expected_sales']}, Actual Sales: {row['actual_sales']}"
    )
    return content

In [15]:
def format_customer_content(row):
    content = (
        f"Customer ID: {row['customer_id']}, "
        f"Age: {row['age']}, Gender: {row['gender']}, "
        f"Preferred Model: {row['preferred_model']} ({row['preferred_model_variant']}), "
        f"Car Type: {row['preferred_cartype']}, "
        f"Fuel Variant: {row['preferred_fuel_variant']}, "
        f"Purchase Type: {row['purchase_type']}"
    )
    return content

In [16]:


def load_csvs_to_documents(file_paths):
    all_docs = []
    
    for path in file_paths:
        df = pd.read_csv(path)
        source_name = os.path.basename(path).replace(".csv", "")
        
        for _, row in df.iterrows():
            if source_name == "sales":
                content = format_sales_content(row)
                metadata = {
                    "source": source_name,
                    "region": row.get("region", ""),
                    "city": row.get("city", ""),
                    "fuel_variant": row.get("fuel_variant", ""),
                    "car_type": row.get("car_type", "")
                }
            
            elif source_name == "marketing_campaign":
                content = format_marketing_content(row)
                metadata = {
                    "source": source_name,
                    "channel": row.get("marketing_channel", ""),
                    "target_cartype": row.get("target_cartype", ""),
                    "discount": row.get("discount_percent", ""),
                    "start_date": row.get("start_date", ""),
                    "end_date": row.get("end_date", "")
                }

            elif source_name == "customer":
                content = format_customer_content(row)
                metadata = {
                    "source": source_name,
                    "age": row.get("age", ""),
                    "gender": row.get("gender", ""),
                    "preferred_model": row.get("preferred_model", ""),
                    "preferred_fuel_variant": row.get("preferred_fuel_variant", ""),
                    "purchase_type": row.get("purchase_type", "")
                }
            
            else:
                content = "\n".join(f"{col}: {row[col]}" for col in df.columns)
                metadata = {"source": source_name}

            all_docs.append(Document(page_content=content, metadata=metadata))

    return all_docs


In [5]:
#Chunk documents
def chunk_documents(docs, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)

In [6]:
def create_chroma_index(chunks, persist_dir="chroma_db"):
    # Clean old DB if needed
    import shutil
    shutil.rmtree(persist_dir, ignore_errors=True)

    embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")


    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_dir
    )
    vectordb.persist()
    return vectordb


In [7]:
# ----------- Main Execution-----------
csv_files = [
        r"C:\Users\Dell\Downloads\customer.csv",
        r"C:\Users\Dell\Downloads\sales.csv",
        r"C:\Users\Dell\Downloads\marketing_campaign.csv"
    ]



    
    

In [17]:
# Load → Chunk → Store
all_documents = []
all_documents = load_csvs_to_documents(csv_files)
chunks = chunk_documents(all_documents)
chroma_db = create_chroma_index(chunks)

  embedding_model = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  vectordb.persist()


In [35]:
def query_chroma(vectordb, query_text, k=5, source=None, channel=None):
    # Build filter dynamically using $and
    where_filters = []
    if source:
        where_filters.append({"source": source})
    if channel:
        where_filters.append({"channel": channel})
    
    filter_dict = {"$and": where_filters} if where_filters else None

    results = vectordb.similarity_search(query_text, k=k, filter=filter_dict)

    for i, result in enumerate(results):
        print(f"\nResult {i+1} (Source: {result.metadata.get('source')}):\n{result.page_content}")


In [37]:
query_chroma(
    chroma_db,
    query_text="show campaign using marketing_channel socialmedia",
    k=5,
    source="marketing_campaign",
    channel="Social Media"
)



Result 1 (Source: marketing_campaign):
Campaign: Festive Offer (ID: CAMPAIGN043), Dates: 2024-08-03 to 2025-06-27, Channel: Social Media, Target: Young Professionals interested in SUV , Discount: 10%%, Expected Sales: 188, Actual Sales: 53

Result 2 (Source: marketing_campaign):
Campaign: Festive Offer (ID: CAMPAIGN063), Dates: 2025-01-26 to 2025-06-06, Channel: Social Media, Target: Luxury Car Owners interested in SUV , Discount: 25%%, Expected Sales: 349, Actual Sales: 439

Result 3 (Source: marketing_campaign):
Campaign: New Launch Discount (ID: CAMPAIGN036), Dates: 2025-02-05 to 2025-06-10, Channel: Social Media, Target: Budget Buyers interested in SUV , Discount: 20%%, Expected Sales: 127, Actual Sales: 319

Result 4 (Source: marketing_campaign):
Campaign: Summer Sale 2025 (ID: CAMPAIGN074), Dates: 2024-09-07 to 2025-06-20, Channel: Social Media, Target: Luxury Car Owners interested in SUV , Discount: 25%%, Expected Sales: 344, Actual Sales: 218

Result 5 (Source: marketing_campa