In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import glob

In [16]:
import chromadb
from chromadb.config import Settings

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
FOLDER_PATH = r"C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\Synthetic_files"  # Uncomment and set your path
file_pattern = os.path.join(FOLDER_PATH, "*.txt")

In [5]:
file_paths = glob.glob(file_pattern)

In [7]:
texts = []
filenames = []

In [None]:
for file_path in file_paths:
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            if content:  # Only add non-empty files
                texts.append(content)
                filenames.append(os.path.basename(file_path))
                print(f"Loaded: {os.path.basename(file_path)}")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

In [11]:
embeddings = model.encode(texts)

In [12]:
print(f"\n{'='*60}")
print(f"Model: all-MiniLM-L6-v2")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Number of files embedded: {embeddings.shape[0]}")
print(f"{'='*60}\n")


Model: all-MiniLM-L6-v2
Embedding dimension: 384
Number of files embedded: 38



In [None]:
for i, (filename, embedding) in enumerate(zip(filenames, embeddings)):
    print(f"File {i+1}: {filename}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"First 10 values: {embedding[:10]}")
    print(f"Embedding norm: {np.linalg.norm(embedding):.4f}\n")

In [14]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [None]:
print("Similarity Matrix:")
print("-" * 60)
for i in range(len(filenames)):
    for j in range(len(filenames)):
        sim = cosine_similarity(embeddings[i], embeddings[j])
        print(f"{filenames[i]} vs {filenames[j]}: {sim:.4f}")
    print()

In [21]:
COLLECTION_NAME = "document_embeddings"  # ChromaDB collection name
DB_PATH = r"C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\ChromaDB\chroma_db"

In [22]:
chroma_client = chromadb.PersistentClient(path=DB_PATH)

In [23]:
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"description": "Text file embeddings using all-MiniLM-L6-v2"}
)

In [24]:
print("Adding embeddings to ChromaDB...")
ids = [f"doc_{i}" for i in range(len(filenames))]
metadatas = [{"filename": fn, "file_path": fp} for fn, fp in zip(filenames, file_paths)]

Adding embeddings to ChromaDB...


In [25]:
collection.add(
    embeddings=embeddings.tolist(),
    documents=texts,
    metadatas=metadatas,
    ids=ids
)

In [33]:
query = "who is insulin"

In [34]:
results = collection.query(
    query_texts=[query],
    n_results=min(5, len(filenames))
)

In [35]:
print("\nMost similar documents:")
for i, (doc_id, distance, metadata) in enumerate(zip(
    results['ids'][0], 
    results['distances'][0], 
    results['metadatas'][0]
)):
    similarity = 1 - distance  # Convert distance to similarity
    print(f"  {i+1}. {metadata['filename']} (similarity: {similarity:.4f})")


Most similar documents:
  1. file_23.txt (similarity: 0.2574)
  2. file_26.txt (similarity: 0.2344)
  3. file_20.txt (similarity: 0.1864)
  4. file_31.txt (similarity: 0.1614)
  5. file_24.txt (similarity: 0.1581)


In [31]:
print("\n" + "="*60)
print("ChromaDB Collection Info:")
print("="*60)
print(f"Collection name: {COLLECTION_NAME}")
print(f"Total documents: {collection.count()}")
print(f"Database path: {DB_PATH}")
print("\nTo query this collection later, use:")
print(f"  collection = chroma_client.get_collection('{COLLECTION_NAME}')")
print(f"  results = collection.query(query_texts=['your query'], n_results=5)")


ChromaDB Collection Info:
Collection name: document_embeddings
Total documents: 38
Database path: C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\ChromaDB\chroma_db

To query this collection later, use:
  collection = chroma_client.get_collection('document_embeddings')
  results = collection.query(query_texts=['your query'], n_results=5)
