In [None]:
import uuid

import chromadb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
from matplotlib.patches import Patch
# Apply t-SNE to reduce embeddings to 2D
from sklearn.manifold import TSNE

In [13]:
# Chroma client backed by the local ./chroma_data directory.
# NOTE(review): being a PersistentClient, records written by earlier runs of this
# notebook are presumably still present on the next run — confirm before re-ingesting.
chroma_client = chromadb.PersistentClient(path="./chroma_data")

In [None]:
# Embedding model used for every document stored in the collection below.
# NOTE(review): presumably loads (and on first use downloads) the Qwen3-Embedding-4B
# weights, so this cell may be slow and memory-hungry — confirm on the target machine.
embedding_model = HuggingFaceEmbeddings(model_name="Qwen/Qwen3-Embedding-4B")

In [None]:
class CustomEmbeddingFunction(EmbeddingFunction):
    """Adapter that lets a ``HuggingFaceEmbeddings`` model act as a Chroma embedding function."""

    def __init__(self, embedding_model: HuggingFaceEmbeddings):
        """Keep a reference to the model that will produce the embeddings.

        Args:
            embedding_model: The ``HuggingFaceEmbeddings`` instance to delegate to.
        """
        self.embedding_model = embedding_model

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed ``texts`` by delegating to the wrapped model's ``embed_documents``.

        Args:
            texts: The documents to embed.

        Returns:
            Whatever ``embed_documents`` returns for ``texts``.
        """
        vectors = self.embedding_model.embed_documents(texts)
        return vectors

In [None]:
# Open the requirements collection (creating it on first use) and attach the
# Hugging Face adapter as its embedding function.
chroma_collection = chroma_client.get_or_create_collection(
    embedding_function=CustomEmbeddingFunction(embedding_model),
    name="requirements_collection_qwen3_4b",
)

# Data Preparation and Ingestion

In [None]:
# Load the training CSV into a DataFrame; later cells read its
# "Requirement" and "Req/Not Req" columns.
training_df = pd.read_csv("./dataset/PURE_train.csv")

# Preview the first rows to sanity-check the columns.
training_df.head()

In [None]:
# Class balance: number of rows per distinct value of the "Req/Not Req" label.
training_df["Req/Not Req"].value_counts()

In [None]:
# Documents to store: one requirement text per row of the training frame.
requirements = training_df["Requirement"].tolist()

# Per-document metadata: {"is_req": True} when the label is exactly "Req",
# {"is_req": False} for any other label value.
labels_metadata = [
    {"is_req": label == "Req"} for label in training_df["Req/Not Req"].tolist()
]

# Deterministic IDs derived from each row's position and text. The collection is
# persistent, so random uuid4 IDs made every re-run of the notebook insert a full
# duplicate copy of the dataset under fresh IDs. With stable IDs a re-run targets
# the same records instead. The position is part of the key so that two rows with
# identical text still get distinct IDs.
unique_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{position}:{text}"))
    for position, text in enumerate(requirements)
]

In [None]:
# Ingest only when the persistent collection is still empty. Without this guard,
# every "Restart & Run All" re-embeds the whole dataset and inserts it again into
# the collection that is already stored on disk.
# To rebuild from scratch, delete the collection (or the ./chroma_data directory)
# and re-run this cell.
if chroma_collection.count() == 0:
    chroma_collection.add(
        documents=requirements,
        metadatas=labels_metadata,
        ids=unique_ids,
    )
else:
    print(
        f"Collection already holds {chroma_collection.count()} records; "
        "skipping ingestion."
    )

In [None]:
# Sanity check after ingestion.
# NOTE(review): peek() presumably returns a small sample of stored records — confirm.
chroma_collection.peek()

# Visualize Embeddings with t-SNE

In [None]:
# Get all embeddings and metadata from the collection
collection_data = chroma_collection.get(
    include=["metadatas", "embeddings"]
)

embeddings = collection_data["embeddings"]
metadatas = collection_data["metadatas"]

print(f"Number of embeddings: {len(embeddings)}")
print(f"Embedding dimension: {len(embeddings[0])}")

In [None]:
# Stack the embeddings into a single array for scikit-learn.
embeddings_array = np.array(embeddings)

# Project to 2D with a fixed seed so the layout is reproducible.
# scikit-learn requires perplexity < n_samples and raises ValueError otherwise,
# so cap the usual value of 30 for small collections; larger collections are
# unaffected and still use 30.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30, max(1, len(embeddings_array) - 1)),
)
embeddings_2d = tsne.fit_transform(embeddings_array)

print(f"Reduced embeddings shape: {embeddings_2d.shape}")

In [None]:
# Prepare colors based on metadata
colors = ['green' if meta['is_req'] else 'red' for meta in metadatas]

plt.figure(figsize=(12, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors, alpha=0.6, s=50)
plt.title('t-SNE Visualization of Requirement Embeddings', fontsize=16)
plt.xlabel('t-SNE Component 1', fontsize=12)
plt.ylabel('t-SNE Component 2', fontsize=12)

# Add legend
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='green', label='Requirement (is_req: True)'),
    Patch(facecolor='red', label='Not Requirement (is_req: False)')
]
plt.legend(handles=legend_elements, loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()