In [1]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
import pandas as pd
import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory handed to the persistent Chroma client as its storage path.
CHROMA_DATA_PATH = "./chroma_data"
chroma_client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [3]:
# Model identifier passed to the LangChain Hugging Face embeddings wrapper.
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [4]:
class CustomEmbeddingFunction(EmbeddingFunction):
    """Adapter that lets a Chroma collection embed text with a LangChain model.

    The wrapped ``HuggingFaceEmbeddings`` instance does the actual work; this
    class only forwards the texts it is called with to ``embed_documents``.
    """

    def __init__(self, embedding_model: HuggingFaceEmbeddings):
        """Store the embedding model used by later calls.

        Args:
            embedding_model: The ``HuggingFaceEmbeddings`` instance to delegate to.
        """
        self.embedding_model = embedding_model

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed a batch of texts.

        Args:
            texts: The documents to embed.

        Returns:
            Whatever ``embedding_model.embed_documents`` returns for ``texts``.
        """
        embed = self.embedding_model.embed_documents
        vectors = embed(texts)
        return vectors

In [5]:
# Wrap the shared embedding model so the collection can call it, then fetch
# the collection by name (get_or_create: reuse it if present, else create it).
requirements_embedding_fn = CustomEmbeddingFunction(embedding_model=embedding_model)
chroma_collection = chroma_client.get_or_create_collection(
    embedding_function=requirements_embedding_fn,
    name="requirements_collection_qwen3",
)

# Data Preparation and Ingestion

In [13]:
# Training CSV; the cells below read its "Requirement" and "Req/Not Req" columns.
TRAIN_CSV_PATH = "./dataset/PURE_train.csv"
training_df = pd.read_csv(TRAIN_CSV_PATH)

# Preview the first rows as the cell output.
training_df.head()

Unnamed: 0.1,Unnamed: 0,Requirement,Name of Doc,Req/Not Req
0,0,The solution should provide detailed context-s...,cctns.pdf,Req
1,1,The help should be accessible to the users bot...,cctns.pdf,Req
2,2,The solution should provide an interface for t...,cctns.pdf,Req
3,3,"The solution should send alerts (e.g., email, ...",cctns.pdf,Req
4,4,The solution should enable the user to track t...,cctns.pdf,Req


In [20]:
# Class balance: number of rows per label value.
label_column = training_df["Req/Not Req"]
label_column.value_counts()

Req/Not Req
Req        2832
Not_Req    2474
Name: count, dtype: int64

In [28]:
# Build the three parallel lists used for ingestion: document texts,
# per-document metadata, and per-document IDs.
requirements = training_df["Requirement"].tolist()

# Metadata flag: True for rows labelled "Req", False for every other label.
labels_metadata = [
    {"is_req": label == "Req"} for label in training_df["Req/Not Req"].tolist()
]

# Derive each ID deterministically from the row position and text instead of
# drawing a random uuid4. The collection lives in a persistent store, so with
# random IDs every re-run of the ingestion would insert all rows again under
# fresh IDs. With uuid5 the same row always maps to the same ID.
# The position is part of the key so that repeated texts still get distinct IDs.
# NOTE(review): rows already stored under earlier random IDs will not match
# these IDs; clear or recreate the collection once before re-ingesting.
unique_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"PURE_train/{position}/{text}"))
    for position, text in enumerate(requirements)
]

In [30]:
# Store every requirement text together with its label metadata and ID.
# No embeddings are passed here, only the raw documents.
chroma_collection.add(
    ids=unique_ids,
    metadatas=labels_metadata,
    documents=requirements,
)

In [6]:
# Sanity check: show a handful of stored records (10 is the library default).
chroma_collection.peek(limit=10)

{'ids': ['b0a97a75-6a87-4612-ae74-c4584514bcd3',
  '694a8833-3614-4416-a8c7-8e02b656bff2',
  '40112f1b-ed01-46d5-88ae-b881fee28d28',
  'aa267958-38e2-499e-8c9f-882dd8cba429',
  '2e8d8adb-0822-4e4d-b5d7-3edd068c994c',
  '87df8e4a-b6c1-4a7e-8115-9824c4d466f4',
  '9e7887d7-e1fe-442b-b6f1-519937ee4f6d',
  'e850158d-0b3a-4bc0-824a-8d73c6745fa9',
  '6a873980-b849-491a-a36f-3ec4ab84b1f9',
  'e520ee1b-93b8-4652-b35a-20976328a34d'],
 'embeddings': array([[ 0.00036489,  0.01039584, -0.01005721, ..., -0.00334276,
          0.04283836,  0.0261395 ],
        [-0.01276445, -0.00919969, -0.00693607, ..., -0.03069634,
         -0.01462597, -0.00936372],
        [ 0.07851893,  0.00889595, -0.00620396, ...,  0.00966742,
          0.06437241, -0.00990432],
        ...,
        [ 0.0103562 , -0.03702731, -0.00657598, ...,  0.03955981,
          0.01012854, -0.03544005],
        [-0.02820624, -0.04733935, -0.00909049, ...,  0.02690835,
          0.03394987, -0.02174129],
        [ 0.01686939, -0.05201786, 

# Query and Other Experiments

In [8]:
# Look up the two closest stored entries for a sample requirement sentence.
sample_query = "The system shall allow users to reset their passwords."
chroma_collection.query(n_results=2, query_texts=[sample_query])

{'ids': [['8b07bab4-5452-4d96-8f4b-c8c8422f643f',
   '24a03a2b-7199-4af2-ac2c-bc4bfea392e2']],
 'embeddings': None,
 'documents': [['The system gives users the ability to reset their password. Priority 2 0460 Individual passwords can be reset. ',
   'The system only allows users to change their own passwords. ']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'is_req': True}, {'is_req': True}]],
 'distances': [[0.29834192991256714, 0.4580190181732178]]}

In [None]:
# Fetch the stored records, requesting their metadata and embedding vectors
fields_to_fetch = ["metadatas", "embeddings"]
collection_data = chroma_collection.get(include=fields_to_fetch)

{'ids': ['b0a97a75-6a87-4612-ae74-c4584514bcd3',
  '694a8833-3614-4416-a8c7-8e02b656bff2',
  '40112f1b-ed01-46d5-88ae-b881fee28d28',
  'aa267958-38e2-499e-8c9f-882dd8cba429',
  '2e8d8adb-0822-4e4d-b5d7-3edd068c994c',
  '87df8e4a-b6c1-4a7e-8115-9824c4d466f4',
  '9e7887d7-e1fe-442b-b6f1-519937ee4f6d',
  'e850158d-0b3a-4bc0-824a-8d73c6745fa9',
  '6a873980-b849-491a-a36f-3ec4ab84b1f9',
  'e520ee1b-93b8-4652-b35a-20976328a34d',
  'f9d1da57-671d-4177-a3c0-2355c991dc40',
  'f04025ce-42e7-4591-863d-e5e367b075fc',
  '27355e07-1205-4685-bcbb-b842c3a93fab',
  '4c3f7a5b-519f-45c0-87a8-580ff45e3202',
  '829cc61b-f0f2-4f06-a23d-b00c29031ccc',
  'b21acbc0-1077-41e9-9ef0-f13aa821cc30',
  '0f365ee3-3465-4c80-99af-dbe6bb9b2469',
  '2e3d3b0a-7b56-4930-8fb8-f2e414b5f4e8',
  '77ddb690-ec06-4752-9f8f-83179360b8cc',
  '793be7b2-bd53-4ed1-a958-a9e70052b4f0',
  'b6039d5f-3306-4e46-addd-c595ce788ef6',
  '2e8d4275-3aaf-4986-9d32-d1d4f3d11713',
  'e8053cf8-ac7a-4a83-8a1e-3b0bb36b651b',
  'e1f3bc27-1d83-47f7-b8f8-

In [None]:
import umap
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Keep the metadata list as-is and copy the stored vectors into a NumPy array
metadatas = collection_data['metadatas']
embeddings = np.array(collection_data['embeddings'])

# One colour per record: green when is_req is truthy, red otherwise
palette = {True: 'green', False: 'red'}
colors = [palette[bool(meta['is_req'])] for meta in metadatas]

# Report how many vectors were loaded and their width
print(f"Number of embeddings: {len(embeddings)}")
print(f"Embedding dimension: {embeddings.shape[1]}")

In [None]:
# Project the embeddings down to two dimensions with UMAP.
# random_state is fixed so the reducer is seeded the same way on every run.
umap_settings = {"n_components": 2, "random_state": 42}
reducer = umap.UMAP(**umap_settings)
embeddings_2d = reducer.fit_transform(embeddings)

print(f"Reduced embeddings shape: {embeddings_2d.shape}")

In [None]:
# Scatter plot of the 2-D projection, one colour per class
fig, ax = plt.subplots(figsize=(12, 8))

# Row positions of each class within the projected array
req_indices = [row for row, meta in enumerate(metadatas) if meta['is_req']]
non_req_indices = [row for row, meta in enumerate(metadatas) if not meta['is_req']]


def _scatter_class(rows, colour, legend_label):
    """Draw the projected points at ``rows`` in one colour with a legend entry."""
    ax.scatter(
        embeddings_2d[rows, 0],
        embeddings_2d[rows, 1],
        c=colour,
        label=legend_label,
        alpha=0.6,
        s=50,
    )


# Non-requirements are drawn first, requirements second (same order as before)
_scatter_class(non_req_indices, 'red', 'Non-Requirement')
_scatter_class(req_indices, 'green', 'Requirement')

ax.set_title('UMAP Visualization of Requirement Embeddings', fontsize=16)
ax.set_xlabel('UMAP Dimension 1', fontsize=12)
ax.set_ylabel('UMAP Dimension 2', fontsize=12)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()