In [None]:
# === IMPORTS ===
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np
import importlib
import os
import sys

# === MODULE SETUP ===
# Make the notebook's working directory importable so the helper modules next
# to the notebook can be found. `__file__` is not defined inside a notebook;
# the previous `os.path.abspath('__file__')` used a string literal and
# therefore resolved to the working directory as well - this states it directly.
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.append(current_dir)
    print(f"✅ Current directory added to PYTHONPATH: {current_dir}")

# Helper modules this notebook imports from (checked and refreshed in this order)
required_modules = ['ollama_server', 'model_manager', 'benchmark_core', 'visualization', 'model_benchmark_utils']
missing_modules = []

# Modules that were already loaded before this cell ran (i.e. the cell is being
# re-run in a live kernel). Only these need a reload to pick up edits on disk;
# a module imported for the first time is already current.
previously_loaded = {name for name in required_modules if name in sys.modules}

for module in required_modules:
    try:
        loaded_module = importlib.import_module(module)
    except ImportError:
        missing_modules.append(module)
        continue
    # Reload BEFORE the `from ... import ...` statements below:
    # importlib.reload() does not rebind names that were already copied into
    # this namespace, so reloading afterwards would leave the notebook calling
    # the stale functions.
    if module in previously_loaded:
        importlib.reload(loaded_module)

if missing_modules:
    print(f"\n❌ Missing modules: {', '.join(missing_modules)}")
    print("Make sure all required Python modules are present in the same directory as this notebook.")
    # The report above is informational only; the imports below still raise
    # ImportError for any module that is missing.
else:
    print("✅ All required modules are present.")

# Import functions from the (freshly loaded or reloaded) external modules.
# Ending the cell on an import statement also keeps the cell from echoing a
# module repr, which would expose the absolute local path in the saved output.
from ollama_server import check_ollama_server, start_ollama_server
from model_manager import check_model_exists, load_model
from benchmark_core import benchmark_model, run_benchmark
from visualization import visualize_results
from model_benchmark_utils import run_benchmark_test

✅ Alle benötigten Module sind vorhanden.


<module 'model_benchmark_utils' from 'c:\\Users\\Marc\\Desktop\\benchmark\\stonks\\project\\DeepLearningProject\\Benchmark\\model_benchmark_utils.py'>

# LLM Benchmark Framework

This notebook compares any language models that are served through Ollama. Use it to benchmark different models on speed, performance, and response quality for custom tasks.

In [None]:
# === BENCHMARK CONFIGURATION ===
# Everything a reader might want to tune lives in this cell.

# Base URL of the Ollama REST API
OLLAMA_API_URL = "http://localhost:11434/api"

# Ollama model tags to benchmark against each other (list at least two):
#   "deepseek-r1:1.5b" - DeepSeek R1, 1.5 billion parameters
#   "llama3.2"         - Llama 3.2; the untagged name resolves to the
#                        3-billion-parameter variant in the Ollama library
#                        (not 8B, as a previous comment stated)
MODELS = ["deepseek-r1:1.5b", "llama3.2"]

# Other model tags that could be added to MODELS:
#   "phi3:3.8b"      - Microsoft Phi-3, small and efficient (3.8B parameters)
#   "mistral:7b"     - Mistral 7B, good balance of size and performance (7B)
#   "gemma:7b"       - Google Gemma 7B, efficient open-source model (7B)
#   "codellama:7b"   - Code Llama 7B, specialized for code generation (7B)
#   "llama3:8b"      - Meta Llama 3 8B, larger base model (8B)
#   "qwen2:7b"       - Qwen2 7B, multilingual with good German support (7B)
#   "neural-chat:7b" - Neural Chat 7B, optimized for conversations (7B)
#   "wizardcoder:7b" - WizardCoder 7B, good for programming tasks (7B)

# Benchmark tasks: one entry per prompt sent to every model. Append further
# entries with the same three keys to extend the benchmark.
BENCHMARK_TASKS = [
    dict(
        name="Text Generation",
        prompt="Write a short paragraph about AI.",
        max_tokens=30,
    ),
    dict(
        name="Code Generation",
        prompt="Write a simple Python function that checks if a number is even.",
        max_tokens=30,
    ),
    dict(
        name="Factual Knowledge",
        prompt="What is the difference between ML and AI? Short answer.",
        max_tokens=30,
    ),
]

# Run parameters
# NOTE(review): the benchmark call visible in this notebook passes neither
# timeout to run_benchmark_test - confirm where (or whether) they are consumed.
REQUEST_TIMEOUT = 120  # seconds to wait for a model response
RETRY_TIMEOUT = 300    # seconds to wait on retry attempts
TEMPERATURE = 0.0      # sampling temperature used for every model

## Running the Benchmark

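Run the benchmark and display the results. The Ollama server must be reachable at the configured `OLLAMA_API_URL`; if it cannot be started automatically, start it manually with `ollama serve` before running the next cell.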

In [None]:
# Run the benchmark.
# This cell only calls the helper imported from model_benchmark_utils;
# nothing is defined here.

# Gather the values from the configuration cell as keyword arguments ...
benchmark_kwargs = {
    "api_url": OLLAMA_API_URL,
    "models": MODELS,
    "tasks": BENCHMARK_TASKS,
    "temperature": TEMPERATURE,
}

# ... and hand them to the helper; its return value is kept for later cells.
benchmark_results = run_benchmark_test(**benchmark_kwargs)

🚀 Starte Benchmark-Test für deepseek-r1:1.5b, llama3.2...

🚀 Versuche den Ollama-Server zu starten...
❌ Fehler beim Starten des Ollama-Servers: [WinError 2] The system cannot find the file specified
ℹ️ Bitte starten Sie den Ollama-Server manuell mit dem Befehl 'ollama serve'
❌ Ollama-Server konnte nicht gestartet werden.

❌ Keine Ergebnisse zum Visualisieren oder Speichern vorhanden.
