In [1]:
pip install psutil


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pynvml

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install scipy

Note: you may need to restart the kernel to use updated packages.


In [5]:
import subprocess
import time
import psutil
import json

def run_model(model_name, prompt):
    """
    Exécute un modèle via Docker (Ollama) et retourne la réponse.
    """
    start_time = time.time()
    
    # Lancer le modèle via Docker
    command = [
        "docker", "exec", "ollama", "ollama", "run", model_name, prompt
    ]
    
    # Exécuter la commande
    result = subprocess.run(command, capture_output=True, text=True)
    
    # Calculer le temps de latence
    latency = time.time() - start_time
    
    # Récupérer la réponse du modèle (JSON attendu)
    try:
        response = json.loads(result.stdout)
    except json.JSONDecodeError:
        response = {"output": result.stdout.strip()}
    
    return response, latency

In [6]:
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates


def get_gpu_usage():
    """
    Récupère l'utilisation des ressources GPU via pynvml.
    """
    try:
        # Initialiser la bibliothèque NVML (NVIDIA Management Library)
        nvmlInit()

        gpu_stats = []
        num_gpus = nvmlDeviceGetCount()  # Nombre de GPU disponibles

        # Récupérer les informations de chaque GPU
        for i in range(num_gpus):
            handle = nvmlDeviceGetHandleByIndex(i)  # Obtenir le handle du GPU
            memory_info = nvmlDeviceGetMemoryInfo(handle)  # Obtenir les infos de mémoire
            utilization = nvmlDeviceGetUtilizationRates(handle)  # Obtenir l'utilisation du GPU

            # Ajouter les informations de chaque GPU dans la liste
            gpu_stats.append({
                'GPU Index': i,
                'GPU Utilization (%)': utilization.gpu,  # Utilisation du GPU en %
                'Memory Used (MB)': memory_info.used // 1024,  # Convertir en Mo
                'Memory Free (MB)': memory_info.free // 1024,  # Convertir en Mo
            })
        
        return gpu_stats
    
    except Exception as e:
        print(f"Erreur lors de la récupération des statistiques GPU avec pynvml: {e}")
        return None

In [7]:
import psutil
def monitor_resources():
    """
    Retourne l'utilisation du CPU et de la mémoire.
    """
    cpu_usage = psutil.cpu_percent(interval=1)  # Moyenne sur 1 seconde
    memory_usage = psutil.virtual_memory().percent  # Utilisation de la mémoire
    return cpu_usage, memory_usage

In [8]:
import matplotlib.pyplot as plt
def plot_graphs(latencies, cpu_usages, memory_usages, gpu_utilizations, memory_used, memory_free):
    """
    Trace les courbes pour les différentes valeurs collectées.
    """
    plt.figure(figsize=(10, 6))

    # Tracer la courbe des latences
    plt.subplot(2, 2, 1)
    plt.plot(latencies, label="Latence", color='blue')
    plt.xlabel("Test")
    plt.ylabel("Latence (s)")
    plt.title("Latence du modèle")
    plt.legend()

    # Tracer la courbe d'utilisation CPU
    plt.subplot(2, 2, 2)
    plt.plot(cpu_usages, label="CPU Usage", color='red')
    plt.xlabel("Test")
    plt.ylabel("CPU (%)")
    plt.title("Utilisation du CPU")
    plt.legend()

    # Tracer la courbe d'utilisation de la mémoire
    plt.subplot(2, 2, 3)
    plt.plot(memory_usages, label="Memory Usage", color='green')
    plt.xlabel("Test")
    plt.ylabel("Memory (%)")
    plt.title("Utilisation de la mémoire")
    plt.legend()

    # Tracer les courbes des ressources GPU
    plt.subplot(2, 2, 4)
    plt.plot(gpu_utilizations, label="GPU Utilization", color='purple')
    plt.plot(memory_used, label="Memory Used", color='orange')
    plt.plot(memory_free, label="Memory Free", color='brown')
    plt.xlabel("Test")
    plt.ylabel("GPU Resources (MB / %)")
    plt.title("Ressources GPU")
    plt.legend()

    # Afficher les graphes
    plt.tight_layout()
    plt.show()

In [9]:
import numpy as np
from scipy.stats import trim_mean

def load_questions_from_file(file_path):
    """
    Charge les questions à partir d'un fichier texte.
    Chaque ligne du fichier doit contenir une question.
    """
    try:
        with open(file_path, 'r') as file:
            questions = file.readlines()
        # Nettoyer les espaces blancs et les retours à la ligne
        questions = [question.strip() for question in questions]
        return questions
    except Exception as e:
        print(f"Erreur lors de la lecture du fichier {file_path}: {e}")
        return []

def test_models(prompts, models, trim_percent=0.05):
    for model_name in models:
        # Initialisation des listes spécifiques à chaque modèle
        latencies = []
        cpu_usages = []
        memory_usages = []
        gpu_utilizations = []
        memory_used = []
        memory_free = []

        for prompt in prompts:
            response, latency = run_model(model_name, prompt)
            cpu, memory = monitor_resources()

            latencies.append(latency)
            cpu_usages.append(cpu)
            memory_usages.append(memory)

            # GPU
            gpu_usage = get_gpu_usage()
            if gpu_usage:
                for gpu in gpu_usage:
                    gpu_utilizations.append(gpu['GPU Utilization (%)'])
                    memory_used.append(gpu['Memory Used (MB)'])
                    memory_free.append(gpu['Memory Free (MB)'])

        num_tests = len(latencies)

        # Moyennes simples
        avg_latency = sum(latencies) / num_tests
        avg_cpu = sum(cpu_usages) / num_tests
        avg_memory = sum(memory_usages) / num_tests

        # Trimmed mean
        trimmed_latency = trim_mean(latencies, proportiontocut=trim_percent)
        trimmed_cpu = trim_mean(cpu_usages, proportiontocut=trim_percent)
        trimmed_memory = trim_mean(memory_usages, proportiontocut=trim_percent)
        trimmed_gpu = trim_mean(gpu_utilizations, proportiontocut=trim_percent) if gpu_utilizations else 0
        trimmed_mem_used = trim_mean(memory_used, proportiontocut=trim_percent) if memory_used else 0
        trimmed_mem_free = trim_mean(memory_free, proportiontocut=trim_percent) if memory_free else 0

        # Throughput = total requests / total time
        total_time = sum(latencies)
        throughput = num_tests / total_time if total_time > 0 else 0

        # Résultats
        print(f"\n--- Performance Results for {model_name} ---")
        print(f"Simple Mean Latency: {avg_latency:.4f} sec")
        print(f"Simple Mean CPU Usage: {avg_cpu:.2f}%")
        print(f"Simple Mean Memory Usage: {avg_memory:.2f}%")

        print(f"\nTrimmed Mean Results ({int(trim_percent*100)}% trimming):")
        print(f"Latency: {trimmed_latency:.4f} sec")
        print(f"CPU Usage: {trimmed_cpu:.2f}%")
        print(f"Memory Usage: {trimmed_memory:.2f}%")
        print(f"GPU Utilization: {trimmed_gpu:.2f}%")
        print(f"GPU Memory Used: {trimmed_mem_used:.2f} MB")
        print(f"GPU Memory Free: {trimmed_mem_free:.2f} MB")

        print(f"\nThroughput: {throughput:.2f} requests per second")

        # Optionnel : tracer les courbes pour ce modèle uniquement
        plot_graphs(latencies, cpu_usages, memory_usages, gpu_utilizations, memory_used, memory_free)



In [None]:
import json

# def process_questions(input_file):
#     """
#     Traite une liste de questions à partir d'un fichier et exécute chaque question sur les modèles.
#     """
#     with open(input_file, 'r') as file:
#         questions = file.readlines()
    
#     results = []
    
#     for question in questions:
#         question = question.strip()  # Enlever les espaces superflus
        
#         # Tester Llama3
#         response, latency = run_model("llama3", question)
#         cpu, memory = monitor_resources()
#         results.append({
#             "model": "Llama3",
#             "prompt": question,
#             "response": response['output'],
#             "latency": latency,
#             "cpu_usage": cpu,
#             "memory_usage": memory
#         })

#         # Tester DeepSeek-R1
#         response, latency = run_model("deepseek-r1", question)
#         cpu, memory = monitor_resources()
#         results.append({
#             "model": "DeepSeek-R1",
#             "prompt": question,
#             "response": response['output'],
#             "latency": latency,
#             "cpu_usage": cpu,
#             "memory_usage": memory
#         })
    
#     return results

# def save_results_to_file(results, output_file):
#     """
#     Sauvegarde les résultats dans un fichier JSON.
#     """
#     with open(output_file, 'w') as file:
#         json.dump(results, file, indent=4)
        
# output_file = 'model_results.json'  # Fichier pour sauvegarder les résultats
questions_file = 'questions.txt'

# results = process_questions(questions_file)
# save_results_to_file(results, output_file)
prompts = load_questions_from_file(questions_file)  # Charger les questions depuis le fichier

models = ["llama3", "deepseek-r1"]

test_models(prompts, models)



--- Performance Results for llama3 ---
Simple Mean Latency: 7.8861 sec
Simple Mean CPU Usage: 4.01%
Simple Mean Memory Usage: 15.33%

Trimmed Mean Results (5% trimming):
Latency: 7.6899 sec
CPU Usage: 1.76%
Memory Usage: 15.31%
GPU Utilization: 0.00%
GPU Memory Used: 11813248.00 MB
GPU Memory Free: 3915392.00 MB

Throughput: 0.13 requests per second
