In [2]:
import pandas as pd

from uuid import uuid4
import chromadb

from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma


In [3]:
# Load the source table; later cells read its "category" and "chunk" columns.
df = pd.read_csv("../../data/rag-data.csv")

In [17]:
# Chroma client backed by the on-disk store at ../../chroma_store.
client = chromadb.PersistentClient(path="../../chroma_store")
collection_name="wr-uae"
# Make sure the collection exists before wrapping it below.
collection = client.get_or_create_collection(collection_name)
# Embeddings come from an Ollama-served model.
# NOTE(review): presumably requires a reachable Ollama instance with this model pulled — confirm.
embed_model = OllamaEmbeddings(model="nomic-embed-text:v1.5")
# LangChain wrapper over the same client and collection name; `find_relevant`
# runs its similarity searches through this object.
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embed_model,
    client=client
)

In [18]:
def find_relevant(query: str, filter: dict | None = None, k: int = 5) -> list[dict[str, str]]:
    """Return the stored documents most similar to `query`.

    Args:
        query: Text to search the vector store with.
        filter: Optional metadata filter, forwarded unchanged to
            `vector_store.similarity_search` (e.g. ``{"category": "x"}`` or
            ``{"category": {"$ne": "x"}}``). The name shadows the builtin but
            is kept because callers pass it by keyword.
        k: Number of hits to request from the store.

    Returns:
        One dict per hit, in the order the store returned them, with the keys
        ``"category"`` (read from the hit's metadata) and ``"content"`` (the
        hit's page content).

    Raises:
        KeyError: If a hit's metadata has no ``"category"`` entry.
    """
    hits = vector_store.similarity_search(query, k=k, filter=filter)
    return [{"category": hit.metadata["category"], "content": hit.page_content} for hit in hits]

In [28]:
cat = "wage-protection"

# Spot-check retrieval: query with the first chunk of this category only
# (the loop breaks after one iteration) and print the hits.
# NOTE(review): if no row matches `cat`, `sim` is never assigned here, so the
# next cell would fail or silently reuse a stale value from an earlier run.
for _, row in df[df["category"] == cat].iterrows():
    sim = find_relevant(row['chunk'], filter={"category": cat}, k=13)
    print(sim)
    break



In [29]:
# Print one hit per line, which is easier to scan than the single-list dump.
for s in sim:
    print(s)

{'category': 'wage-protection', 'content': 'This is not an official Translation \nMINISTERIAL RESOLUTION NO.(598) OF 2022 CONCERNING THE WAGES \nPROTECTION SYSTEM \n \nThe Minister of Human Resources & Emiratisation \nHaving perused:   \n\uf0a7 Federal Law No.(1) of 1972 on the competencies of the Ministries and Powers of \nthe Ministers and amendments thereof, \n\uf0a7 Federal Decree-Law No. (33) of 2021 concerning Regulation of Labor Relations \nand its Executive Regulations and its executive regulations issued pursuant to \nCabinet Resolution No. (1) of 2022  \n\uf0a7 Cabinet Resolution No. (21) of 2020 concerning service fees and administrative \nfines at the Ministry of Human Resources and Emiratisation \n\uf0a7 Ministerial Resolution No. (43) of 2022 concerning the Wages Protection System \n\uf0a7 Ministerial Resolution No. (346) of 2022 concerning the amendment of certain \nprovisions of Ministerial Resolution No. (43) of 2022 concerning the Wages \nProtection System \n\uf0a7 Mi

In [None]:
def prep_docs(ret_data):
    """Render a sequence of documents as one text block.

    Args:
        ret_data: Iterable of mappings, each with a ``"category"`` and a
            ``"content"`` key.

    Returns:
        A string in which every document is a bold header line (1-based
        position and category) followed by its content, with consecutive
        documents separated by a blank line. Empty input yields ``""``.
    """
    rendered = []
    for position, entry in enumerate(ret_data, start=1):
        header = f"**Document : {position}, Document Category: `{entry['category']}`**\n"
        rendered.append(header + entry["content"])
    return "\n\n".join(rendered)
    

def get_gen_docs(
    n_samples: int = 5,
    n_neighbours: int = 12,
    stride: int = 4,
    random_state: int = 42,
) -> pd.DataFrame:
    """Build a table of two-document texts pairing sampled chunks with retrieved hits.

    For every category in the module-level `df`, up to `n_samples` chunks are
    sampled. Each sampled chunk (the "anchor") is paired once with a hit from
    its own category and once with a hit from any other category, for each
    rank ``0, stride, 2 * stride, ...`` below `n_neighbours`. Every pair is
    rendered with `prep_docs`, anchor first.

    Args:
        n_samples: Maximum number of chunks sampled per category; categories
            with fewer rows contribute all of their rows.
        n_neighbours: Number of hits considered per anchor on each side
            (same-category and other-category).
        stride: Step between the hit ranks that are paired with the anchor.
        random_state: Seed passed to `DataFrame.sample`.

    Returns:
        DataFrame with one row per rendered pair and the columns ``"id"``
        (a random UUID4 string) and ``"doc"`` (the rendered text).
    """
    records = []

    for category in df["category"].unique().tolist():
        pool = df[df["category"] == category]
        # Cap the sample size so categories with few rows do not make `sample` raise.
        sampled = pool.sample(n=min(n_samples, len(pool)), random_state=random_state)

        # Read the chunks from the sampled rows directly; looking the index
        # labels up positionally is only right for a default RangeIndex.
        for chunk in sampled["chunk"].tolist():
            # Request one extra same-category hit and drop the top one.
            # NOTE(review): presumably the top hit is the anchor chunk itself — confirm
            # that the store was built from the same chunks.
            same_cat_hits = find_relevant(chunk, filter={"category": category}, k=n_neighbours + 1)[1:]
            other_cat_hits = find_relevant(chunk, filter={"category": {"$ne": category}}, k=n_neighbours)
            anchor = {"category": category, "content": chunk}

            for rank in range(0, n_neighbours, stride):
                # Slicing (not indexing) tolerates fewer hits than requested:
                # the pair then degrades to the anchor alone.
                for partner in (same_cat_hits[rank:rank + 1], other_cat_hits[rank:rank + 1]):
                    records.append({
                        "id": str(uuid4()),
                        # Render anchor and partner in one call so they are separated
                        # by a blank line and numbered 1 and 2.
                        "doc": prep_docs([anchor, *partner]),
                    })

    return pd.DataFrame(records)
    

In [11]:
# Build the paired-document table; this queries the vector store twice per sampled chunk.
gen_doc_data = get_gen_docs()


In [13]:
# Persist the table without the index column.
# NOTE(review): this writes under ./data while the input is read from ../../data —
# confirm the target directory is the intended one and that it exists.
gen_doc_data.to_csv("./data/doc_data.csv", index=False)