In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import glob

In [2]:
import chromadb
from chromadb.config import Settings

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
FOLDER_PATH = r"C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\Synthetic_files"  
file_pattern = os.path.join(FOLDER_PATH, "*.txt")

In [5]:
print(file_pattern)

C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\Synthetic_files\*.txt


In [6]:
file_paths = glob.glob(file_pattern)

In [7]:
print(file_paths)

['C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_01.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_02.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_03.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_04.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_05.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_06.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_07.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_08.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_09.txt', 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug

In [16]:
texts = []
filenames = []

In [None]:
for file_path in file_paths:
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            if content:  # Only add non-empty files
                texts.append(content)
                filenames.append(os.path.basename(file_path))
                print(f"Loaded: {os.path.basename(file_path)}")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

In [None]:
for file_path in file_paths[1:2]:
    with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            print(content)
            filenames.append(os.path.basename(file_path))
            print(f"Loaded: {os.path.basename(file_path)}")

In [None]:
embeddings = model.encode(texts)

In [40]:
display(embeddings)

array([[-0.03121755,  0.00714299, -0.01756535, ..., -0.01611793,
        -0.01445116,  0.02758407],
       [ 0.01701781, -0.06746283,  0.03736177, ...,  0.07838504,
        -0.01671585,  0.02201158],
       [-0.01052115, -0.02091547, -0.04784062, ...,  0.00063754,
        -0.03845629, -0.00804244],
       ...,
       [-0.02444021,  0.03095116, -0.08432989, ...,  0.02079832,
         0.00981694, -0.00664631],
       [-0.0203063 ,  0.03741434, -0.05464162, ...,  0.02180241,
        -0.00148364, -0.0752122 ],
       [-0.00051512,  0.03226906, -0.0718165 , ...,  0.06185648,
         0.01028941, -0.06245422]], shape=(38, 384), dtype=float32)

In [30]:
from IPython.display import display

In [39]:
display(len(texts[0]))

2794

In [34]:
display(len(embeddings.tolist()))

38

In [19]:
print(f"\n{'='*60}")
print(f"Model: all-MiniLM-L6-v2")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Number of files embedded: {embeddings.shape[0]}")
print(f"{'='*60}\n")


Model: all-MiniLM-L6-v2
Embedding dimension: 384
Number of files embedded: 38



In [None]:
for i, (filename, embedding) in enumerate(zip(filenames, embeddings)):
    print(f"File {i+1}: {filename}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"First 10 values: {embedding[:10]}")
    print(f"Embedding norm: {np.linalg.norm(embedding):.4f}\n")

In [14]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [None]:
print("Similarity Matrix:")
print("-" * 60)
for i in range(len(filenames)):
    for j in range(len(filenames)):
        sim = cosine_similarity(embeddings[i], embeddings[j])
        print(f"{filenames[i]} vs {filenames[j]}: {sim:.4f}")
    print()

In [21]:
COLLECTION_NAME = "document_embeddings"  # ChromaDB collection name
DB_PATH = r"C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\ChromaDB\chroma_db"

In [22]:
chroma_client = chromadb.PersistentClient(path=DB_PATH)

In [23]:
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"description": "Text file embeddings using all-MiniLM-L6-v2"}
)

In [24]:
print(collection)

Collection(name=document_embeddings)


In [25]:
print("Adding embeddings to ChromaDB...")
ids = [f"doc_{i}" for i in range(len(filenames))]
metadatas = [{"filename": fn, "file_path": fp} for fn, fp in zip(filenames, file_paths)]

Adding embeddings to ChromaDB...


In [26]:
print(ids)

['doc_0', 'doc_1', 'doc_2', 'doc_3', 'doc_4', 'doc_5', 'doc_6', 'doc_7', 'doc_8', 'doc_9', 'doc_10', 'doc_11', 'doc_12', 'doc_13', 'doc_14', 'doc_15', 'doc_16', 'doc_17', 'doc_18', 'doc_19', 'doc_20', 'doc_21', 'doc_22', 'doc_23', 'doc_24', 'doc_25', 'doc_26', 'doc_27', 'doc_28', 'doc_29', 'doc_30', 'doc_31', 'doc_32', 'doc_33', 'doc_34', 'doc_35', 'doc_36', 'doc_37']


In [27]:
print(metadatas)

[{'filename': 'file_01.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_01.txt'}, {'filename': 'file_02.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_02.txt'}, {'filename': 'file_03.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_03.txt'}, {'filename': 'file_04.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_04.txt'}, {'filename': 'file_05.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_05.txt'}, {'filename': 'file_06.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_06.txt'}, {'filename': 'file_07.txt', 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic

In [25]:
collection.add(
    embeddings=embeddings.tolist(),
    documents=texts,
    metadatas=metadatas,
    ids=ids
)

In [42]:
display(embeddings.tolist()[0])
display(texts[0])
display(metadatas[0])
display(ids[0])

[-0.0312175489962101,
 0.007142986170947552,
 -0.017565354704856873,
 -0.014416318386793137,
 -0.06413102149963379,
 0.009872017428278923,
 0.07674046605825424,
 0.05445420742034912,
 -0.07276026904582977,
 -0.01634938456118107,
 -0.0555085688829422,
 -0.08762987703084946,
 -0.0006610637065023184,
 0.0247679241001606,
 0.05791550874710083,
 -0.0040714251808822155,
 0.0435117706656456,
 -0.017558185383677483,
 0.015233439393341541,
 0.0625896155834198,
 0.030219666659832,
 0.051577627658843994,
 0.015342476777732372,
 -0.012847304344177246,
 -0.05911268666386604,
 -0.005930706392973661,
 0.018281899392604828,
 0.01570085436105728,
 -0.040435515344142914,
 -0.0002284114307258278,
 0.02468736283481121,
 -0.010601339861750603,
 0.004332321230322123,
 -0.0409814678132534,
 -0.03280812129378319,
 0.05802594870328903,
 0.011951249092817307,
 0.1297919899225235,
 0.006926883012056351,
 -0.025032464414834976,
 -0.08869816362857819,
 -0.03336543217301369,
 -0.019961779937148094,
 0.0366763658821

"Topic: Physical appearance and distinctive features (make it a male)\n\n## Paracetamol: A Portrait of Presence\n\nParacetamol, a man who cuts a figure that’s both familiar and undeniably striking. He’s not one to blend into the background, yet his presence is often a quiet, reassuring one.\n\n**Physically, Paracetamol possesses a sturdy, well-built frame.** He stands at an average height, perhaps a shade over six feet, with broad shoulders that speak of resilience and a grounded nature. His build isn't overly muscled, but rather possesses a comfortable robustness, suggesting a life of practical activity rather than rigorous athletic pursuit. His posture is generally upright, conveying a sense of quiet confidence and attentiveness.\n\n**His face is perhaps his most memorable canvas.** A strong jawline anchors a face that’s often framed by a neatly trimmed, dark beard, flecked with hints of distinguished grey that add a touch of maturity. His hair, too, is dark, often kept short and pra

{'filename': 'file_01.txt',
 'file_path': 'C:\\Users\\rauna\\projects\\llm_engineering\\My Projects\\Drug Chatbot\\Synthetic_files\\file_01.txt'}

'doc_0'

In [33]:
query = "who is insulin"

In [34]:
results = collection.query(
    query_texts=[query],
    n_results=min(5, len(filenames))
)

In [35]:
print("\nMost similar documents:")
for i, (doc_id, distance, metadata) in enumerate(zip(
    results['ids'][0], 
    results['distances'][0], 
    results['metadatas'][0]
)):
    similarity = 1 - distance  # Convert distance to similarity
    print(f"  {i+1}. {metadata['filename']} (similarity: {similarity:.4f})")


Most similar documents:
  1. file_23.txt (similarity: 0.2574)
  2. file_26.txt (similarity: 0.2344)
  3. file_20.txt (similarity: 0.1864)
  4. file_31.txt (similarity: 0.1614)
  5. file_24.txt (similarity: 0.1581)


In [43]:
print("\n" + "="*60)
print("ChromaDB Collection Info:")
print("="*60)
print(f"Collection name: {COLLECTION_NAME}")
print(f"Total documents: {collection.count()}")
print(f"Database path: {DB_PATH}")
print("\nTo query this collection later, use:")
print(f"  collection = chroma_client.get_collection('{COLLECTION_NAME}')")
print(f"  results = collection.query(query_texts=['your query'], n_results=5)")


ChromaDB Collection Info:
Collection name: document_embeddings
Total documents: 38
Database path: C:\Users\rauna\projects\llm_engineering\My Projects\Drug Chatbot\ChromaDB\chroma_db

To query this collection later, use:
  collection = chroma_client.get_collection('document_embeddings')
  results = collection.query(query_texts=['your query'], n_results=5)
