In [None]:
from google.colab import drive
# Mount Google Drive so the project files under /content/drive are reachable
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ----Chroma Trial----

In [None]:
!pip install -q chromadb

In [None]:
import json

# Path of the JSON secrets file, saved on the collaborators' Google Drive
json_path = '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/secret.json'

# Load the JSON file; the encoding is given explicitly instead of relying on
# the platform default
with open(json_path, encoding='utf-8') as f:
  secrets = json.load(f)

# Collection names read from the secrets file
collection_string_list = secrets["COLLECTION_STRING_LIST"]

# Drop the "PREVALENT" entry. list.remove() raises ValueError when the value
# is missing, so guard the call to keep the cell runnable in that case.
if "PREVALENT" in collection_string_list:
  collection_string_list.remove("PREVALENT")

In [None]:
# Show the collection names that will be processed by the following cells
print(collection_string_list)

['ANAGRAFICA', 'ANAMNESI', 'CORONAROGRAFIA_PTCA', 'ECOCARDIO_DATI', 'ECOCAROTIDI', 'ESAMI_LABORATORIO', 'ESAMI_SPECIALISTICI', 'ESAMI_STRUMENTALI_CARDIO', 'LISTA_EVENTI', 'RICOVERO_OSPEDALIERO', 'VISITA_CONTROLLO_ECG']


In [None]:
import os

# Directory holding one plain-text documentation file per collection
documentations_path = "/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt"

# Maps collection name -> documentation text ("" when the file could not be read)
doc_dict = {}
file_extension = ".txt"

for collection in collection_string_list:
  doc_path = os.path.join(documentations_path, collection + file_extension)
  doc_text = ""

  try:
    with open(doc_path, 'r', encoding='utf-8') as f:
      doc_text = f.read()
      print(f"File '{doc_path}' letto con successo")
  except FileNotFoundError:
    # Report the path that was actually tried (a .txt file, not a .pdf)
    print(f"File inexistent: {doc_path}")
  except (OSError, UnicodeDecodeError) as e:
    # Any other read problem (permissions, bad encoding, ...): keep going
    # with an empty text, but say what went wrong instead of hiding it
    print(f"Could not read '{doc_path}': {e}")

  # Every collection gets an entry so later cells can index doc_dict safely
  doc_dict[collection] = doc_text

# Print a compact summary rather than dumping every full document to the output
for loaded_name, loaded_text in doc_dict.items():
  print(f"{loaded_name}: {len(loaded_text)} characters")


File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ANAGRAFICA.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ANAMNESI.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/CORONAROGRAFIA_PTCA.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ECOCARDIO_DATI.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ECOCAROTIDI.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ESAMI_LABORATORIO.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/Datasets_documentations/txt/ESAMI_SPECIALISTICI.txt' letto con successo
File '/content/drive/MyDrive/Colab Notebooks/Big Data/F

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, identified by its Hugging Face model id
# Model card: https://huggingface.co/thenlper/gte-large
EMBEDDING_MODEL_NAME = "thenlper/gte-large"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
import chromadb
from chromadb import Client
from chromadb.config import Settings

# On-disk location of the persistent Chroma store (on Google Drive)
CHROMA_PATH = "/content/drive/MyDrive/Colab Notebooks/Big Data/Final_Project/chroma_data"
COLLECTION_NAME = "datasets_documentations"

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Delete the collection if it already exists, so the notebook always starts
# from an empty one. get_collection() raises when the collection is missing
# (it never returns None), so existence is checked through list_collections().
# Depending on the chromadb version that call yields either names or
# Collection objects; getattr() handles both.
existing_collection_names = {
    getattr(existing, "name", existing)
    for existing in chroma_client.list_collections()
}
if COLLECTION_NAME in existing_collection_names:
    chroma_client.delete_collection(name=COLLECTION_NAME)
    print("Collection 'datasets_documentations' eliminata.")

chroma_collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)


Collection 'datasets_documentations' eliminata.


In [None]:
# Embed each collection's documentation text and store it in ChromaDB,
# using the collection name as the record id.
# NOTE(review): every document is embedded as one vector; confirm the texts
# fit within the embedding model's maximum input length — TODO confirm.
for mongo_collection in collection_string_list:
  document_text = doc_dict[mongo_collection]

  # A file that could not be read leaves an empty text behind; embedding it
  # would only add a meaningless vector to the index, so skip it.
  if not document_text.strip():
    print(f"Skipping '{mongo_collection}': documentation text is empty.")
    continue

  embedding = embedding_model.encode(document_text)

  chroma_collection.add(
      documents=[document_text],
      embeddings=[embedding],
      ids=[mongo_collection],
      metadatas=[{"source_file": mongo_collection + file_extension, "table_name": mongo_collection}]
  )
  print(f"Embedding per '{mongo_collection}' aggiunto con successo a ChromaDB.")


Embedding per 'ANAGRAFICA' aggiunto con successo a ChromaDB.
Embedding per 'ANAMNESI' aggiunto con successo a ChromaDB.
Embedding per 'CORONAROGRAFIA_PTCA' aggiunto con successo a ChromaDB.
Embedding per 'ECOCARDIO_DATI' aggiunto con successo a ChromaDB.
Embedding per 'ECOCAROTIDI' aggiunto con successo a ChromaDB.
Embedding per 'ESAMI_LABORATORIO' aggiunto con successo a ChromaDB.
Embedding per 'ESAMI_SPECIALISTICI' aggiunto con successo a ChromaDB.
Embedding per 'ESAMI_STRUMENTALI_CARDIO' aggiunto con successo a ChromaDB.
Embedding per 'LISTA_EVENTI' aggiunto con successo a ChromaDB.
Embedding per 'RICOVERO_OSPEDALIERO' aggiunto con successo a ChromaDB.
Embedding per 'VISITA_CONTROLLO_ECG' aggiunto con successo a ChromaDB.


In [None]:
# ChromaDB integrates query methods to rank documents similarity
def print_similarity_search(query: str, n_results: int = 3):
  """Print the stored documents closest to ``query``.

  The query is embedded with the notebook-level ``embedding_model`` and
  looked up in the notebook-level ``chroma_collection``. For every hit the
  source file name, the distance and the first 200 characters of the text
  are printed.

  Args:
    query: The user's query text.
    n_results: Maximum number of documents to request (default 3).

  Returns:
    None. The results are written to standard output.
  """
  query_embedding = embedding_model.encode(query)

  results = chroma_collection.query(
      query_embeddings=[query_embedding],  # the user's query
      n_results=n_results,  # how many similar documents to request
      include=["documents", "metadatas", "distances"]
  )

  # Results are nested per query; a single query was sent, so take index 0
  metadatas = results['metadatas'][0]
  distances = results['distances'][0]
  documents = results['documents'][0]

  if not documents:
    print("No matching documents found.")
    return

  # Show the results. Iterate over what was actually returned: the store may
  # hold fewer documents than requested, and indexing a fixed count would
  # raise IndexError.
  for metadata, distance, document in zip(metadatas, distances, documents):
    print(f"🔹 Documento: {metadata['source_file']}")
    print(f"🔸 Similarità (distanza): {distance:.4f}")
    print(f"📄 Testo: {document[:200]}...\n")

### Test of query similarity search

In [None]:
# Placeholder query (a single space); replace it with a real question to test the search
query = " "
print_similarity_search(query)