## Import Librerie

In [26]:
import pymongo
import numpy as np
import pandas as pd
import dotenv
import os

from urllib.parse import quote_plus

## SQuAD v2.0 in MongoDB

### Access Functions to MongoDB Atlas Cluster

In [27]:
def get_mongo_client(username: str, password: str, cluster: str = "x4cluster.n6xsnhl.mongodb.net") -> pymongo.MongoClient:
    """
    Build a MongoClient for the given cluster using username/password credentials.

    Args:
        username (str): MongoDB database user name.
        password (str): MongoDB database password.
        cluster (str): Host name of the cluster
            (default: "x4cluster.n6xsnhl.mongodb.net").

    Returns:
        pymongo.MongoClient: Client created from the resulting
        ``mongodb+srv://`` connection string.
    """
    # Percent-encode both credentials so reserved characters such as
    # '@', ':' or '/' cannot corrupt the connection string.
    user, secret = (quote_plus(part) for part in (username, password))
    return pymongo.MongoClient(f"mongodb+srv://{user}:{secret}@{cluster}/")

def insert_dataframe_to_mongo(client, dataframe, collection_name: str, db_name : str = 'squadv2'):
    """
    Insert every row of a DataFrame as one document in a MongoDB collection.

    Array-like cell values (anything exposing ``tolist``, e.g. numpy arrays
    and numpy scalars) are converted to plain Python objects first, including
    when they are nested inside dicts or lists.

    Args:
        client: A pymongo.MongoClient instance (anything indexable as
            ``client[db_name][collection_name]``).
        dataframe (pd.DataFrame): The DataFrame to insert.
        collection_name (str): Name of the target collection.
        db_name (str): Name of the target database.

    Returns:
        None. A summary line with the number of inserted documents is printed.
    """

    def to_plain(value):
        # Recursively replace array-like values with plain Python containers.
        if isinstance(value, dict):
            return {key: to_plain(item) for key, item in value.items()}
        if isinstance(value, list):
            return [to_plain(item) for item in value]
        if hasattr(value, "tolist"):
            # tolist() on an object-dtype array leaves nested arrays as they
            # are, so its result is converted again.
            return to_plain(value.tolist())
        return value

    # Convert the records directly instead of going through the deprecated
    # DataFrame.applymap.
    records = [to_plain(row) for row in dataframe.to_dict("records")]

    # insert_many rejects an empty document list, so an empty DataFrame is
    # treated as a no-op.
    if records:
        client[db_name][collection_name].insert_many(records)
    print(f"{len(records)} documenti inseriti in '{db_name}.{collection_name}'")

def drop_collection(client, collection_name: str, db_name : str = 'squadv2'):
    """
    Drop a collection from a MongoDB database if it exists.

    Args:
        client: A pymongo.MongoClient instance.
        collection_name (str): Name of the collection to drop.
        db_name (str): Name of the database holding the collection.

    Returns:
        None. A message is printed saying whether the collection was dropped
        or was not found.
    """
    database = client[db_name]

    # Guard clause: nothing to drop when the collection is not listed.
    if collection_name not in database.list_collection_names():
        print(f"La collezione '{db_name}.{collection_name}' non esiste.")
        return

    database.drop_collection(collection_name)
    print(f"Collezione '{db_name}.{collection_name}' eliminata.")

def read_collection(client, collection_name: str, db_name: str = 'squadv2', as_dataframe: bool = False, projection: dict = None):
    """
    Read every document of a MongoDB collection.

    Args:
        client: A pymongo.MongoClient instance.
        collection_name (str): Name of the collection to read from.
        db_name (str): Name of the database.
        as_dataframe (bool): If True return a DataFrame, otherwise a list of
            dicts.
        projection (dict): Fields to include/exclude, e.g. {"_id": 0} to drop
            the _id. None or an empty dict returns the full documents.

    Returns:
        list[dict] or pandas.DataFrame: The documents read from the collection.
    """
    collection = client[db_name][collection_name]

    # Pass None rather than an empty dict when no projection is requested:
    # PyMongo releases before 4.0 rewrote an empty projection to {"_id": 1},
    # which would return only the ids instead of the whole documents.
    cursor = collection.find({}, projection=projection or None)

    data = list(cursor)
    print(f"{len(data)} documenti letti da '{db_name}.{collection_name}'")

    if as_dataframe:
        return pd.DataFrame(data)
    return data

def update_collection_from_dataframe(client, db_name: str, collection_name: str, dataframe, match_field: str):
    """
    Update documents of a MongoDB collection with the rows of a DataFrame.

    For each row, the document whose ``match_field`` equals the row's value is
    updated with ``$set`` using all columns of the row. Rows are never
    inserted (``upsert=False``). Rows whose ``match_field`` is missing
    (None or NaN) are skipped.

    Args:
        client: A pymongo.MongoClient instance.
        db_name (str): Name of the database.
        collection_name (str): Name of the collection.
        dataframe (pd.DataFrame): DataFrame holding the updated data.
        match_field (str): Unique field used to identify the document to
            update.

    Returns:
        None. The number of documents actually modified is printed.
    """
    def to_plain(value):
        # Recursively replace array-like values with plain Python containers.
        if isinstance(value, dict):
            return {key: to_plain(item) for key, item in value.items()}
        if isinstance(value, list):
            return [to_plain(item) for item in value]
        if hasattr(value, "tolist"):
            # tolist() on an object-dtype array leaves nested arrays as they
            # are, so its result is converted again.
            return to_plain(value.tolist())
        return value

    def is_missing(value):
        # pandas represents a missing value as None or as a float NaN;
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    collection = client[db_name][collection_name]

    updated_count = 0
    # Convert the records directly instead of going through the deprecated
    # DataFrame.applymap.
    for row in dataframe.to_dict("records"):
        record = to_plain(row)
        match_value = record.get(match_field)
        if is_missing(match_value):
            continue  # Skip rows that have no value for the matching field

        result = collection.update_one(
            {match_field: match_value},  # filter
            {"$set": record},            # update
            upsert=False                 # do not insert when nothing matches
        )
        # modified_count is 0 when nothing matched or nothing changed.
        if result.modified_count > 0:
            updated_count += 1

    print(f"{updated_count} documenti aggiornati in '{db_name}.{collection_name}'")


### MongoDB Cluster Usage

In [None]:
# dotenv.load_dotenv("key.env", override=True)

# username = os.getenv("USERNAME")
# password = os.getenv("PASSWORD")

# client = get_mongo_client(username, password)

# df = read_collection(client, 'squadv2_original_train', as_dataframe=True)

# df

# client.close()

## Training Mistral 7B v0.2

## BertScore Evaluation