# Ollama server parallel API calls simulation

## Author: Arsen Apostolov
LinkedIn Profile: [www.linkedin.com/in/arsenapostolov](https://www.linkedin.com/in/arsenapostolov)

## Parametrization

In [None]:
# Ollama setup: endpoint URL, model name, and the prompt sent with every request
ollama_url = "http://localhost:11434/api/generate"
ollama_model = "tinyllama"
ollama_prompt = "Why is the sky blue?"

# Number of parallel calls to simulate: start at initial_n_calls and step by
# increment_n_calls up to final_n_calls, or until Ollama times out
initial_n_calls = 1
final_n_calls = 20
increment_n_calls = 1

## Code

In [None]:
import concurrent.futures
import requests
import time
import json
from datetime import datetime
import openpyxl
from openpyxl import Workbook
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from threading import Lock

In [None]:
# Path of the Excel workbook that setup_workbook() creates and log_response() appends to
log_file_path = "ollama_responses.xlsx"

In [None]:
def setup_workbook(log_file_path):
    """Create a fresh results workbook containing only the header row.

    A new workbook is built in memory, its active sheet is named
    "Responses", the column titles are written to row 1, and the workbook
    is saved to ``log_file_path``.
    """
    headers = ["ID", "Time", "Speed", "Waiting Time", "Response"]
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Responses"
    # On a brand-new sheet the first appended row is row 1.
    sheet.append(headers)
    workbook.save(log_file_path)


# Create the log file up front so that log_response() has a workbook to open
setup_workbook(log_file_path)

In [None]:
# Guards the Excel log file: log_response() is called from several worker
# threads at once and must not interleave its load/append/save cycle.
lock = Lock()


def log_response(call_id, response_text, speed_tokens_per_second, waiting_time):
    """Append one result row to the Excel workbook at ``log_file_path``.

    The row holds the call id, a timestamp taken when this function is
    entered, the measured speed, the waiting time and the response text
    (or an error description).

    Args:
        call_id: Identifier of the call within the current batch.
        response_text: Text returned by the model, or an error message.
        speed_tokens_per_second: Generation speed reported for the call.
        waiting_time: Seconds the caller waited for the response.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Loading, appending and saving is a read-modify-write cycle on a shared
    # file; without the lock two threads can load the same state and the
    # later save silently drops the other thread's row.
    with lock:
        wb = openpyxl.load_workbook(log_file_path)
        try:
            ws = wb.active
            ws.append([call_id, timestamp, speed_tokens_per_second, waiting_time, response_text])
            wb.save(log_file_path)
        finally:
            # Release the workbook even if appending or saving failed
            wb.close()


In [None]:
def call_ollama_model(call_id, timeout=30):
    """Send one non-streaming generate request to Ollama and log the outcome.

    The request uses the module-level ``ollama_url``, ``ollama_model`` and
    ``ollama_prompt``. Every outcome (success, HTTP error, incomplete
    response, request failure, undecodable JSON) is written to the log via
    ``log_response``.

    Args:
        call_id: Identifier of this call, stored in the log row.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        tuple: ``(speed_tokens_per_second, waiting_time)``. The speed is
        ``eval_count / eval_duration`` (duration converted from nanoseconds)
        on success and the integer ``0`` for every failure or when no
        evaluation duration was reported. ``waiting_time`` is the elapsed
        wall-clock time in seconds until the response arrived, or until the
        request failed.
    """
    data = {
        "model": ollama_model,
        "prompt": ollama_prompt,
        "stream": False
    }
    start_call_time = time.time()  # Time before the request is made
    waiting_time = None  # Stays None if the request itself fails

    try:
        response = requests.post(ollama_url, json=data, timeout=timeout)
        waiting_time = time.time() - start_call_time
        if response.status_code != 200:
            log_response(call_id, f"Error: {response.status_code}", 0, waiting_time)
            return 0, waiting_time

        response_data = response.json()
        if not response_data.get('done', False):
            log_response(call_id, "Incomplete response or no 'done' flag found.", 0, waiting_time)
            return 0, waiting_time

        eval_duration_ns = response_data.get('eval_duration', 0)
        eval_count = response_data.get('eval_count', 0)
        speed_tokens_per_second = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
        response_text = response_data.get('response', 'No response')
        log_response(call_id, response_text, speed_tokens_per_second, waiting_time)
        return speed_tokens_per_second, waiting_time
    except json.JSONDecodeError as e:
        # Must be handled before RequestException: in recent versions of
        # requests the error raised by response.json() derives from both
        # classes, so the broader handler would otherwise shadow this one.
        failure_message = f"JSON decode exception: {str(e)}"
    except requests.exceptions.RequestException as e:
        failure_message = f"Request exception: {str(e)}"

    # Shared failure path: measure the wait now if the request never returned
    if waiting_time is None:
        waiting_time = time.time() - start_call_time
    log_response(call_id, failure_message, 0, waiting_time)
    return 0, waiting_time


In [None]:
def perform_parallel_calls(n_calls):
    """Fire ``n_calls`` simultaneous requests and summarise their timings.

    One worker thread is started per call. Only results whose speed is a
    float are included in the averages, so calls that returned the integer
    ``0`` as their speed are left out.

    Args:
        n_calls: Number of requests to run in parallel.

    Returns:
        tuple: ``(total_elapsed / n_calls, average_speed,
        average_waiting_time)``; the averages are ``0`` when no result
        qualified. ``(None, None, None)`` is returned if a
        ``requests`` exception escapes from a worker.
    """
    began = time.time()
    samples = []  # (speed, waiting_time) pairs of the calls that count
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_calls) as pool:
        pending = [pool.submit(call_ollama_model, idx) for idx in range(n_calls)]
        try:
            for finished in concurrent.futures.as_completed(pending):
                speed, waited = finished.result()
                if not isinstance(speed, float):
                    continue
                samples.append((speed, waited))
        except requests.exceptions.RequestException as e:
            print(f"Request exception: {str(e)}. Halting further execution.")
            return None, None, None  # Abort signal for the caller

    elapsed = time.time() - began
    if samples:
        avg_speed = sum(speed for speed, _ in samples) / len(samples)
        avg_waiting_time = sum(waited for _, waited in samples) / len(samples)
    else:
        avg_speed = 0
        avg_waiting_time = 0
    return elapsed / n_calls, avg_speed, avg_waiting_time

In [None]:
def main():
    """Run the parallel-call sweep and plot the collected metrics.

    The number of parallel calls goes from ``initial_n_calls`` up to and
    including ``final_n_calls`` in steps of ``increment_n_calls``. For each
    level the average time per call, average speed and average waiting time
    are printed and recorded. The sweep stops early when a batch reports an
    abort or raises; whatever was collected up to that point is plotted.
    """
    call_numbers = []
    avg_times = []
    avg_speeds = []
    avg_waiting_times = []
    simulation_failed = False

    # "+ 1" so that final_n_calls itself is part of the sweep
    for n_calls in range(initial_n_calls, final_n_calls + 1, increment_n_calls):
        try:
            results = perform_parallel_calls(n_calls)
            # perform_parallel_calls signals an abort with (None, None, None);
            # a bare None is accepted as well.
            if results is None or any(value is None for value in results):
                simulation_failed = True
                break
            avg_time, avg_speed, avg_waiting_time = results
            call_numbers.append(n_calls)
            avg_times.append(avg_time)
            avg_speeds.append(avg_speed)
            avg_waiting_times.append(avg_waiting_time)
            print(f"With {n_calls} parallel calls: Average time per call: {avg_time:.2f} seconds, "
                  f"Average speed: {avg_speed:.2f} tokens per second, "
                  f"Average waiting time: {avg_waiting_time:.2f} seconds.")
            time.sleep(2)  # Pause to avoid overwhelming the server
        except Exception as e:
            # Top-level boundary: report the problem and still plot partial data
            print(f"An error occurred: {e}")
            simulation_failed = True
            break

    if simulation_failed:
        print("Simulation stopped early; plotting the results collected so far.")

    # Plotting, executed regardless of simulation success or failure
    fig, ax1 = plt.subplots()

    # Stacked bar chart for call time and waiting time
    ax1.bar(call_numbers, avg_waiting_times, label='Average Waiting Time', color='orange', width=0.4)
    ax1.bar(call_numbers, avg_times, bottom=avg_waiting_times, label='Average Call Time', color='blue', width=0.4)
    ax1.set_xlabel('Number of Calls')
    ax1.set_ylabel('Time (seconds)')
    ax1.tick_params(axis='y')
    ax1.legend(loc='upper left')

    # Line chart for speed
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.plot(call_numbers, avg_speeds, label='Average Speed', color='green', marker='o')
    ax2.set_ylabel('Speed (tokens/second)')
    ax2.tick_params(axis='y')
    ax2.legend(loc='upper right')

    # Final layout touches
    plt.title('Performance Metrics by Number of Calls')
    fig.tight_layout()  # to ensure the right y-label is not clipped
    plt.show()


if __name__ == "__main__":
    main()