In [2]:

import mlflow
import mlflow.pytorch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from mlflow.models import infer_signature

In [3]:
# WARNING: running this cell downloads the model.
# The identifier is written once and shared by the tokenizer and the weights
# so the two can never point at different checkpoints.
MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"

# Tokenizer first, then the encoder itself.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [4]:
# TEST
def extract_features(text):
    """Embed one sentence or a batch of sentences with the notebook's model.

    The input is tokenized with padding and truncation, run through the
    module-level ``model`` without gradient tracking, and the token vectors of
    ``last_hidden_state`` are averaged per sentence.

    The average is weighted by the tokenizer's attention mask so that padding
    tokens do not contribute. Without the mask, the embedding of a sentence
    would depend on the length of the other sentences in the same batch.

    Args:
        text: A string or a list of strings, passed as-is to ``tokenizer``.

    Returns:
        A NumPy array with one row per input sentence.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Add a trailing axis so the mask broadcasts over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)

    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against a division by zero should a row have no real token.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

# Smoke test: embed a single short sentence and print the resulting array.
text = "Bonjour"
features = extract_features(text)
print(features)


[[ 0.99690074  1.2919712   0.81441706 ...  0.10150217 -0.5481098
   1.7338686 ]]


In [5]:
# Point MLflow at the local tracking instance (change this to your own MLflow URI).
tracking_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_uri)

# Select the experiment used by this notebook. This stays the last expression
# of the cell so the returned Experiment object is displayed.
experiment_name = "Solon-embeddings"
mlflow.set_experiment(experiment_name)

2024/08/09 20:10:18 INFO mlflow.tracking.fluent: Experiment with name 'Solon-embeddings' does not exist. Creating a new experiment.


<Experiment: artifact_location='/app/mlartifacts/1', creation_time=1723227018432, experiment_id='1', last_update_time=1723227018432, lifecycle_stage='active', name='Solon-embeddings', tags={}>

In [6]:

# Example question and candidate responses.
question = ["Il fait beau"]
responses = ["Il fait beau", "Il est beau", "Il va faire beau", "Il a fait beau", "C'est très beau"]

# Embed the question and the whole batch of responses.
question_embedding = extract_features(question)
responses_embeddings = extract_features(responses)

# Cosine similarity between the question and every response (one row, one
# column per response), plus its mean as a plain Python float.
cos_similarities = cosine_similarity(question_embedding, responses_embeddings)
mean_cos_similarity = float(np.mean(cos_similarities))

# Similarity of the question with each response embedded on its own.
# cosine_similarity returns a (1, 1) array here; take the scalar explicitly so
# MLflow receives a float rather than an ndarray.
per_response_similarities = [
    float(cosine_similarity(question_embedding, extract_features([response]))[0, 0])
    for response in responses
]

# Local directory where the tokenizer files are written before being uploaded.
tokenizer_dir = "models/solon-embeddings-large-tokenizer"

# Record everything in MLflow.
mlflow.set_experiment("Solon-embeddings")

with mlflow.start_run() as run:
    # Save the tokenizer locally, then attach the saved files to the run so
    # the tokenizer is actually stored as an artifact next to the model.
    tokenizer.save_pretrained(tokenizer_dir)
    mlflow.log_artifacts(tokenizer_dir, artifact_path="solon-embeddings-large-tokenizer")

    # Parameters and metrics.
    mlflow.log_param("model_name", "OrdalieTech/Solon-embeddings-large-0.1")
    mlflow.log_param("source", "Script d'installation Solon-embeddings-large-0.1.ipynb")
    mlflow.log_metric("mean_cos_similarity", mean_cos_similarity)
    for rank, similarity in enumerate(per_response_similarities, start=1):
        mlflow.log_metric(f"cos_similarity_top_{rank}", similarity)

    # Infer the model signature from the example question and its embedding.
    # NOTE(review): the signature describes raw strings in / embedding out,
    # while the logged artifact is the bare PyTorch model, which is called with
    # tokenizer output in extract_features. Confirm this is the intended contract.
    signature = infer_signature(question, question_embedding)

    # Log the model exactly once, with its signature, and register a new
    # version. Logging it a second time to the same artifact path would only
    # repeat the upload.
    mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="solon-embeddings-large-model",
        signature=signature,
        registered_model_name="pytorch-solon-embeddings-large-model",
        await_registration_for=10  # Seconds to wait for the model version to be created (optional)
    )



Successfully registered model 'pytorch-solon-embeddings-large-model'.
2024/08/09 20:11:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 10 seconds for model version to finish creation. Model name: pytorch-solon-embeddings-large-model, version 1
Created version '1' of model 'pytorch-solon-embeddings-large-model'.
2024/08/09 20:11:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-doe-302 at: http://localhost:5000/#/experiments/1/runs/6d780e028b664e8cadb6555fcd642692.
2024/08/09 20:11:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


In [7]:
# Keep the ID of the MLflow run (the `run` object bound by `mlflow.start_run()`)
# and print it for future reference.
run_id = run.info.run_id
print(f"Run ID: {run_id}")

Run ID: 6d780e028b664e8cadb6555fcd642692
