In [1]:
from langchain_core.documents import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
import pandas as pd
import os 
# Make sure the data directory exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs("../data/fashion_data",exist_ok=True)

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the raw Zara product CSV into a pandas DataFrame for inspection.
# NOTE(review): `df` is not referenced again in the cells shown here.
df = pd.read_csv("../data/fashion_data/Zara_Product_raw_data_20250628.csv")

In [5]:
# LangChain loader for the same CSV file; it is only configured here, nothing is read yet.
csv_loader = CSVLoader(file_path="../data/fashion_data/Zara_Product_raw_data_20250628.csv")

In [7]:
# Load the CSV into Document objects and display them all as the cell output.
# As the output shows, each row becomes one Document with "column: value" lines
# and its source path / row number in the metadata.
csv_loader.load()

[Document(page_content='product_id: 5584/361/250\nProduct_name: BASIC SLIM FIT T-SHIRT\nProduct_color: WHITE | 5584/361/250\nProduct_price: $ 22.90\nProduct_image_uRL: https://static.zara.net/assets/public/efa3/a952/b15f45aba931/14b9517d1705/01887461250-e1/01887461250-e1.jpg?ts=1739262441992&w=204\nProduct_url: https://www.zara.com/ca/en/basic-slim-fit-t-shirt-p05584361.html\nOutfit_image_uRL: https://static.zara.net/assets/public/05d6/f64c/6e5c478c8a15/65ea2e03eb80/05584261250-p/05584261250-p.jpg?ts=1727277108008&w=570\nProduct_description: Slim fit T-shirt made of cotton fabric with stretch. Round neck and short sleeves.\n\nLighter than our basic Medium Weight model.\nMaterials_care: Composition: 95% cotton, 5% elastane', metadata={'source': '../data/fashion_data/Zara_Product_raw_data_20250628.csv', 'row': 0}),
 Document(page_content='product_id: 4410/010/052\nProduct_name: LINEN PANTS\nProduct_color: LIGHT BEIGE | 4410/010/052\nProduct_price: $ 79.90\nProduct_image_uRL: https://stat

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents, chunk_size=100, chunk_overlap=50):
    """Break documents into overlapping chunks and print a short summary.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Chunk size passed to the splitter (lengths are measured with ``len``).
        chunk_overlap: Overlap between neighbouring chunks, passed to the splitter.

    Returns:
        The chunk documents produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separator candidates: paragraph break, line break, space, then no separator.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunks:
        return chunks

    # Preview the first chunk (content truncated to 200 characters) as a sanity check.
    first = chunks[0]
    print("\nExample chunk:")
    print(f"Content: {first.page_content[:200]}...")
    print(f"Metadata: {first.metadata}")
    return chunks

In [10]:
class EmbeddingGenerator:
    """Thin wrapper around a SentenceTransformer model that turns texts into embeddings."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model right away.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self.model = None  # replaced by the loaded model in _load_model()
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer named by ``self.model_name``."""
        loaded = SentenceTransformer(self.model_name)
        self.model = loaded

    def generate_embeddings(self, text: list[str]) -> np.ndarray:
        """Encode ``text`` with the loaded model.

        Args:
            text: Strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``text``.
        """
        return self.model.encode(text)

In [11]:
# Build the embedding generator with its default model name ("all-MiniLM-L6-v2");
# the constructor loads the model immediately.
csv_embeddings = EmbeddingGenerator()

In [32]:
class ChromaDBGenerator:
    """Wrapper around a persistent ChromaDB collection used as the vector store.

    Attributes:
        collection_name: Name of the collection to get or create.
        persist_directory: Directory in which ChromaDB keeps its data.
        client: The ChromaDB client created in ``_initialize_client``.
        collection: The collection obtained via ``get_or_create_collection``.
    """

    def __init__(self, collection_name: str = "csv_collection", persist_directory: str = "../data/fashion_vector_store"):
        """Store the configuration and connect to the collection.

        Args:
            collection_name: Name of the collection to get or create.
            persist_directory: Directory in which ChromaDB keeps its data.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()

    def _initialize_client(self):
        """Create the ChromaDB client and get-or-create the collection."""
        if hasattr(chromadb, "PersistentClient"):
            # chromadb >= 0.4 rejects the legacy ``chroma_db_impl`` setting and
            # provides PersistentClient for on-disk storage instead.
            self.client = chromadb.PersistentClient(
                path=self.persist_directory,
                settings=Settings(anonymized_telemetry=False),
            )
        else:
            # Legacy (pre-0.4) configuration: embedded DuckDB with parquet persistence.
            self.client = chromadb.Client(Settings(
                persist_directory=self.persist_directory,
                chroma_db_impl="duckdb+parquet",
                anonymized_telemetry=False
            ))
        self.collection = self.client.get_or_create_collection(name=self.collection_name)

    def add_documents(self, documents: List["Document"], embeddings: np.ndarray):
        """Add documents and their precomputed embeddings to the collection.

        Each document gets a fresh id and a copy of its metadata extended with
        ``doc_index`` (position in ``documents``) and ``content_length``.

        Args:
            documents: Documents exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order. Either a
                2-D array or any sequence of vectors.

        Raises:
            ValueError: If the number of embeddings differs from the number of
                documents (``zip`` would otherwise silently drop the surplus).
            Exception: Anything raised by ``collection.add`` is reported and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError(
                f"Got {len(documents)} documents but {len(embeddings)} embeddings"
            )
        if len(documents) == 0:
            # Nothing to store; avoid calling the collection with empty lists.
            print("No documents to add to vector store")
            return

        ids = []
        metadatas = []
        documents_texts = []
        embeddings_list = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random prefix plus position, so repeated calls do not reuse ids.
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not modified.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_texts.append(doc.page_content)
            # asarray lets plain Python lists be passed as well as ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        try:
            self.collection.add(
                ids = ids,
                metadatas = metadatas,
                documents = documents_texts,
                embeddings = embeddings_list
            )
        except Exception as e:
            # The whole batch is sent in one call, so report the error itself
            # rather than a per-document index.
            print(f"Error adding documents to vector store: {e}")
            raise

        print(f"Added {len(documents)} documents to vector store")

        # Legacy duckdb+parquet clients expose persist() to flush data to disk;
        # newer clients have no such method, hence the lookup.
        persist = getattr(self.client, "persist", None)
        if callable(persist):
            persist()

In [33]:
# Open the vector store with the default collection name ("csv_collection")
# and the default persist directory ("../data/fashion_vector_store").
csv_chroma_db = ChromaDBGenerator()

Failed to send telemetry event client_start: capture() takes 1 positional argument but 3 were given
Using embedded DuckDB with persistence: data will be stored in: ../data/fashion_vector_store
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [15]:
# Load the CSV again and split the row-level Documents using split_documents'
# defaults (chunk_size=100, chunk_overlap=50).
chunks = split_documents(csv_loader.load())

Split 80 documents into 1055 chunks

Example chunk:
Content: product_id: 5584/361/250
Product_name: BASIC SLIM FIT T-SHIRT
Product_color: WHITE | 5584/361/250...
Metadata: {'source': '../data/fashion_data/Zara_Product_raw_data_20250628.csv', 'row': 0}


In [17]:
# Pull out the raw text of every chunk, in chunk order, as input for the embedding model.
texts = [doc.page_content for doc in chunks] 

In [19]:
# Embed all chunk texts in one call; generate_embeddings forwards the list to the model's encode().
embeddings = csv_embeddings.generate_embeddings(texts)

In [34]:
# Store the chunks together with their precomputed embeddings in the Chroma collection.
# `chunks` and `embeddings` must be in the same order (both derive from `chunks` above).
csv_chroma_db.add_documents(chunks,embeddings)


Failed to send telemetry event collection_add: capture() takes 1 positional argument but 3 were given


Added 1055 documents to vector store


In [51]:
class RAGFashion:
    """Retriever that embeds a query and looks up the closest stored chunks."""

    def __init__(self, vector_store: ChromaDBGenerator, embeddings_manager: EmbeddingGenerator):
        """Keep references to the vector store and the embedding generator.

        Args:
            vector_store: Store whose ``collection`` is queried.
            embeddings_manager: Generator used to embed the query text.
        """
        self.vector_store = vector_store
        self.embeddings = embeddings_manager

    def retrieve_similar_products(self, query: str, top_k: int = 1) -> List[Dict[str, Any]]:
        """Query the collection with the embedded ``query``.

        Args:
            query: Free-text search string.
            top_k: Number of results requested from the collection.

        Returns:
            A list of dicts with the keys ``"document"`` and ``"metadata"``,
            in the order the collection returned them.
        """
        # The query is embedded as a batch of one; take that single vector.
        vectors = self.embeddings.generate_embeddings([query])
        query_vector = vectors[0].tolist()

        hits = self.vector_store.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k,
        )

        # Results are nested per query embedding; index 0 belongs to our only query.
        docs_for_query = hits['documents'][0]
        metas_for_query = hits['metadatas'][0]
        return [
            {"document": text, "metadata": meta}
            for text, meta in zip(docs_for_query, metas_for_query)
        ]

In [53]:
# Wire the retriever to the populated store and to the same embedding generator
# that produced the stored vectors, then fetch the 3 closest chunks for a sample query.
# NOTE(review): `RAGashion` looks like a typo and is capitalised like a class name;
# consider renaming it (e.g. `rag_fashion`) if no other cell depends on it.
RAGashion = RAGFashion(vector_store=csv_chroma_db, embeddings_manager=csv_embeddings)
RAGashion.retrieve_similar_products("colorful short",top_k=3)

[{'document': 'Product_name: TEXTURED TECHNICAL SHORTS\nProduct_color: OYSTER-WHITE | 0761/451/251',
  'metadata': {'source': '../data/fashion_data/Zara_Product_raw_data_20250628.csv',
   'row': 17,
   'doc_index': 215,
   'content_length': 82}},
 {'document': 'Product_name: TEXTURED TECHNICAL SHORTS\nProduct_color: OYSTER-WHITE | 0761/451/251',
  'metadata': {'source': '../data/fashion_data/Zara_Product_raw_data_20250628.csv',
   'row': 62,
   'doc_index': 818,
   'content_length': 82}},
 {'document': 'Product_description: Regular fit shorts made of linen fabric. Adjustable elastic drawstring',
  'metadata': {'source': '../data/fashion_data/Zara_Product_raw_data_20250628.csv',
   'row': 54,
   'doc_index': 719,
   'content_length': 91}}]