In [None]:
from load_file import load_dataframe
from langchain_community.document_loaders.csv_loader import CSVLoader
import os
from dotenv import load_dotenv
load_dotenv()
file_loc = os.getenv("file_location")

In [None]:
loader = CSVLoader(file_path=file_loc)

documents = loader.load()

df = load_dataframe(file_loc)

Data is loaded into dataframe sucessfully
    Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Member    Male   
4  373-73-7910      A     Yangon        Normal    Male   

             Product line  Unit price  Quantity   Tax 5%     Total       Date  \
0       Health and beauty       74.69         7  26.1415  548.9715   1/5/2019   
1  Electronic accessories       15.28         5   3.8200   80.2200   3/8/2019   
2      Home and lifestyle       46.33         7  16.2155  340.5255   3/3/2019   
3       Health and beauty       58.22         8  23.2880  489.0480  1/27/2019   
4       Sports and travel       86.31         7  30.2085  634.3785   2/8/2019   

    Time      Payment    cogs  gross margin percentage  gross income  Rating  
0  13:08      Ewallet  522.83              

In [22]:
type(documents)

list

In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chuck_documents(documents,hunk_size=550, chunk_overlap=50):
    if not documents:
        print("No documents to chunk.")
        return None
    try:
        text_split = RecursiveCharacterTextSplitter(
            chunk_size=hunk_size,
            chunk_overlap=chunk_overlap
        )
        chunked_docs = text_split.split_documents(documents)
        print(f"Created {len(chunked_docs)} chunks from {len(documents)} original documents")
        return chunked_docs
    except Exception as e:
        print(f"Error creating text splitter: {e}")
        return None


chunk_doc = chuck_documents(documents)
print(f"Total chunks created: {len(chunk_doc)}")

Created 1000 chunks from 1000 original documents
Total chunks created: 1000


In [47]:
print(chunk_doc[498])

page_content='Invoice ID: 401-18-8016
Branch: B
City: Mandalay
Customer type: Member
Gender: Female
Product line: Sports and travel
Unit price: 98.13
Quantity: 1
Tax 5%: 4.9065
Total: 103.0365
Date: 1/21/2019
Time: 17:36
Payment: Cash
cogs: 98.13
gross margin percentage: 4.761904762
gross income: 4.9065
Rating: 8.9' metadata={'source': 'D:\\K_AgentiAI\\DASH-AI_Agent\\Data\\CSV\\supermarket_sales.csv', 'row': 498}


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

def vector_db_setup(chunk_doc,collection_name="Supermarket_sales",persist_directory="./chroma_langchain_db"):
    if not chunk_doc :
        print("No documents to add to vector store")
        return None,None
    try:
        print("please wait while model is Initializing.....")
        embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        vector_store = Chroma(
            collection_name=collection_name,
            embedding_function=embed,
            persist_directory=persist_directory,
         )
        print(f"Adding {len(chunk_doc)} document chunks to vector store...")
        vector_store.add_documents(chunk_doc)
        retriver =  vector_store.as_retriever(
            search_kwargs={"k": 4}  # Return top 4 most relevant chunks
        ) 
        print("Vector store setup complete.")
        return vector_store,retriver
    except Exception as e:
        print(f"Error creating vector store: {e}")
        return None, None


In [61]:
# Make sure chunked_docs is already created from previous steps
vector_store, retriever = vector_db_setup(chunk_doc)


please wait while model is Initializing.....
Adding 1000 document chunks to vector store...
Vector store setup complete.


In [62]:
print("Total vectors in store:", vector_store._collection.count())

Total vectors in store: 2000
