In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pytz

In [None]:
# Input files of the measurement run under analysis
basedir = "../data/2024-07-09/"
locust_data_history_file = basedir + "data_stats_history.csv"
locust_data_file = basedir + "data_stats.csv"
kepler_data_file = basedir + "kepler.csv"
pdu_data_file = basedir + "pdu.csv"

# Load-test stages: target user count and spawn rate for each stage
stages = [
    {"users": 100, "spawn_rate": 2},
    {"users": 200, "spawn_rate": 20},
    {"users": 300, "spawn_rate": 20},
    {"users": 500, "spawn_rate": 20},
    {"users": 800, "spawn_rate": 20},
    {"users": 1300, "spawn_rate": 50},
    {"users": 2100, "spawn_rate": 50},
]

# Every user count that corresponds to a configured stage target
valid_users = {stage_config['users'] for stage_config in stages}

In [None]:
# Read the Locust stats-history CSV into a DataFrame
locust_df = pd.read_csv(locust_data_history_file)

# Time zone used for the Locust timestamps (Germany)
germany_tz = pytz.timezone('Europe/Berlin')

# Interpret 'Timestamp' as epoch seconds, mark it as UTC and convert it to German local time
locust_df['Timestamp'] = (
    pd.to_datetime(locust_df['Timestamp'], unit='s')
    .dt.tz_localize('UTC')
    .dt.tz_convert(germany_tz)
)

# Use the timestamp as index and discard the 'Type' column
locust_df = locust_df.set_index('Timestamp').drop(columns=['Type'])

# Keep only rows that recorded at least one request and whose user count
# equals one of the configured stage targets in 'valid_users'
locust_df = locust_df[
    (locust_df['Total Request Count'] != 0)
    & locust_df['User Count'].isin(valid_users)
]

In [None]:
# Display the cleaned Locust history frame as the cell output for a visual check
locust_df

In [None]:
# Overall measurement window: earliest and latest timestamp left in locust_df.
# Each boundary is converted to Europe/Berlin and then stripped of its timezone,
# yielding naive local datetimes.
# NOTE(review): this presumes the Kepler/PDU 'Time' columns are naive Berlin local times - confirm.
start_time, end_time = (
    boundary.tz_convert("Europe/Berlin").tz_localize(None)
    for boundary in (locust_df.index.min(), locust_df.index.max())
)
print("Start:",start_time, "; End:",end_time)

In [None]:
# Determine the appropriate unit for the total energy consumed
def convert_energy(total_energy_joules):
    """Scale an energy value given in joules to the largest fitting unit.

    Args:
        total_energy_joules: Energy in joules.

    Returns:
        A ``(value, unit)`` tuple where ``unit`` is ``"MJ"`` for values of at
        least 1e6 J, ``"kJ"`` for values of at least 1e3 J, and ``"J"``
        otherwise (the value is then returned unchanged).
    """
    # Checked from the largest unit downwards; the first threshold reached wins
    for joules_per_unit, energy_unit in ((1e6, "MJ"), (1e3, "kJ")):
        if total_energy_joules >= joules_per_unit:
            return total_energy_joules / joules_per_unit, energy_unit
    return total_energy_joules, "J"

def transform_joules(energy, from_unit, to_unit):
    """Convert an energy value between the units "J", "kJ" and "MJ".

    Args:
        energy: Numeric energy value expressed in ``from_unit``.
        from_unit: Unit of ``energy``; one of ``"MJ"``, ``"kJ"``, ``"J"``.
        to_unit: Desired unit; one of ``"MJ"``, ``"kJ"``, ``"J"``.

    Returns:
        The energy expressed in ``to_unit``.

    Raises:
        ValueError: If ``from_unit`` or ``to_unit`` is not a known unit.
    """
    # Number of joules contained in one of each supported unit
    joules_per_unit = {"MJ": 1e6, "kJ": 1e3, "J": 1}

    source_factor = joules_per_unit.get(from_unit)
    if source_factor is None:
        raise ValueError(f"Unknown from_unit: {from_unit}")

    target_factor = joules_per_unit.get(to_unit)
    if target_factor is None:
        raise ValueError(f"Unknown to_unit: {to_unit}")

    # Go through joules: scale up to joules first, then down to the target unit
    return energy * source_factor / target_factor

In [None]:
# Load the Kepler export; 'Time' is parsed as datetime, every other column is treated as a metric series
kepler_df = pd.read_csv(kepler_data_file,  parse_dates=['Time'])
# Second conversion makes sure 'Time' really has a datetime dtype (no timezone handling happens here)
kepler_df['Time'] = pd.to_datetime(kepler_df['Time'])
# Sort chronologically and renumber the rows 0..n-1.
# The integer row labels are intentionally NOT passed through pd.to_datetime: that would
# reinterpret them as nanoseconds since 1970 and produce a meaningless DatetimeIndex.
kepler_df = kepler_df.sort_values(by='Time').reset_index(drop=True)
# Fill gaps in every metric column: carry the last known value forward,
# and use 0 for gaps before the first value of a column
for column in kepler_df.columns:
    if column != 'Time':
        kepler_df[column] = kepler_df[column].ffill().fillna(0)

def calculate_kepler_energy_consumption(kepler_df, start_time, end_time):
    """Sum the increase of all Kepler metric columns between two points in time.

    For every column except 'Time' the value at the first sample at or after
    ``start_time`` is subtracted from the value at the last sample at or before
    ``end_time``; the differences are summed over all columns.
    NOTE(review): this treats every column as a cumulative joule counter - confirm
    against the Kepler export.

    Args:
        kepler_df: DataFrame with a datetime 'Time' column and numeric metric columns.
        start_time: Start of the window, comparable with the 'Time' column.
        end_time: End of the window, comparable with the 'Time' column.

    Returns:
        ``(value, unit)`` as produced by ``convert_energy``, or ``(0, "J")`` when
        the frame is empty or the window does not overlap the recorded time range.
    """
    # Work on a chronologically sorted copy so iloc[0] / iloc[-1] are first / last sample
    kepler_df = kepler_df.sort_values(by='Time')

    # Without any sample there is nothing to measure (iloc[0] would raise IndexError)
    if kepler_df.empty:
        return 0, "J"

    first_timestamp = kepler_df['Time'].min()
    last_timestamp = kepler_df['Time'].max()

    # Window entirely before or entirely after the recorded data: no consumption.
    # The "after" case previously crashed with IndexError on an empty selection.
    if end_time < first_timestamp or start_time > last_timestamp:
        return 0, "J"

    # Counter values at the start of the window
    if start_time < first_timestamp:
        print(f"Start time {start_time} is before the first timestamp. Using the first available value.")
        start_values = kepler_df.iloc[0].drop(labels='Time')
    else:
        start_values = kepler_df.loc[kepler_df['Time'] >= start_time].iloc[0].drop(labels='Time')

    # Counter values at the end of the window
    if end_time > last_timestamp:
        print(f"End time {end_time} is after the last timestamp. Using the last available value.")
        end_values = kepler_df.iloc[-1].drop(labels='Time')
    else:
        end_values = kepler_df.loc[kepler_df['Time'] <= end_time].iloc[-1].drop(labels='Time')

    # Per-column increase over the window, summed over all columns
    energy_difference = end_values.values - start_values.values
    total_energy_sum = energy_difference.sum()

    return convert_energy(total_energy_sum)

# Calculate energy consumption for the specified time range
# (start_time / end_time are the module-level values assigned by an earlier cell)
total_energy_kepler, unit_kepler = calculate_kepler_energy_consumption(kepler_df, start_time, end_time)

# Report the total together with the unit returned by the helper
print(f"Total energy consumed from {start_time} to {end_time} tracked by kepler: {total_energy_kepler} {unit_kepler}")

In [None]:
# Load the PDU export; 'Time' is parsed as datetime
pdu_df = pd.read_csv(pdu_data_file,  parse_dates=['Time'])
# Second conversion makes sure 'Time' really has a datetime dtype (no timezone handling happens here)
pdu_df['Time'] = pd.to_datetime(pdu_df['Time'])
# Sort chronologically and renumber the rows 0..n-1.
# The integer row labels are intentionally NOT passed through pd.to_datetime: that would
# reinterpret them as nanoseconds since 1970 and produce a meaningless DatetimeIndex.
pdu_df = pdu_df.sort_values(by='Time').reset_index(drop=True)

# Function to remove ' W' and convert to numeric
def strip_w_convert(series):
    """Convert a column of readings such as "123.4 W" into numbers.

    Args:
        series: pandas Series of strings carrying a trailing " W", or a Series
            that is already numeric.

    Returns:
        A numeric Series; entries that cannot be parsed become NaN.
    """
    # A column that read_csv already parsed as numeric has no .str accessor
    if pd.api.types.is_numeric_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    # regex=False: ' W' is a literal suffix, independent of the pandas version's default
    return pd.to_numeric(series.astype(str).str.replace(' W', '', regex=False), errors='coerce')

# Turn every measurement column (everything except 'Time') into numeric values
for column in pdu_df.columns:
    if column == 'Time':
        continue
    pdu_df[column] = strip_w_convert(pdu_df[column])

def calculate_pdu_energy_consumption(pdu_df, start_time, end_time):
    """Integrate the PDU 'Value' column over a time window.

    Each sample inside the window contributes ``Value * seconds since the
    previous sample``. The first sample in the window has no predecessor, so its
    contribution is NaN and is skipped by the sum.
    NOTE(review): 'Value' is presumably power in watts, making the result joules - confirm.

    Args:
        pdu_df: DataFrame with a datetime 'Time' column and a numeric 'Value' column.
        start_time: Inclusive start of the window.
        end_time: Inclusive end of the window.

    Returns:
        ``(value, unit)`` as produced by ``convert_energy``; ``convert_energy(0)``
        when no sample falls inside the window.
    """
    # Samples inside the window, both boundaries inclusive
    in_window = pdu_df['Time'].between(start_time, end_time)
    window = pdu_df[in_window].copy()

    if window.empty:
        print("The filtered DataFrame is empty. Ensure the time range is within the data bounds.")
        return convert_energy(0)

    # Seconds elapsed since the previous sample (NaN for the first row)
    window['Time_diff'] = window['Time'].diff().dt.total_seconds()

    # Energy of each interval: reading at the interval's end times the interval length
    window['Energy_Joules'] = window['Value'] * window['Time_diff']

    # Total over the window, scaled to a convenient unit
    return convert_energy(window['Energy_Joules'].sum())

# Calculate energy consumption for the specified time range
# (start_time / end_time are the module-level values assigned by an earlier cell)
total_energy_pdu, unit_pdu = calculate_pdu_energy_consumption(pdu_df, start_time, end_time)

# Report the total together with the unit returned by the helper
print(f"Total energy consumed from {start_time} to {end_time} tracked by pdu metrics: {total_energy_pdu} {unit_pdu}")


In [None]:
# Energy per stage in kJ, index-aligned with `stages` / `user_counts`
kepler_energy = []
pdu_energy = []
user_counts = []

# Loop through each stage
for stage in stages:
    stage_users = stage['users']
    stage_df = locust_df[locust_df['User Count'] == stage_users]
    user_counts.append(stage_users)

    # Append zeros for a stage without Locust rows and continue
    if stage_df.empty:
        print("The filtered DataFrame is empty. Ensure the time range is within the data bounds.")
        kepler_energy.append(0)
        pdu_energy.append(0)
        continue

    # Time window of the current stage as naive Europe/Berlin datetimes.
    # Stage-local names are used on purpose: reusing `start_time` / `end_time` here
    # would overwrite the whole-run window with the last stage's window, and later
    # cells pass those module-level names on to the per-service metrics.
    stage_start = stage_df.index.min().tz_convert("Europe/Berlin").tz_localize(None)
    stage_end = stage_df.index.max().tz_convert("Europe/Berlin").tz_localize(None)
    print("Stage:",stage_users," From:",stage_start," To:", stage_end)

    k_energy, k_unit = calculate_kepler_energy_consumption(kepler_df, stage_start, stage_end)
    p_energy, p_unit = calculate_pdu_energy_consumption(pdu_df, stage_start, stage_end)

    # Normalise both measurements to kJ so the stages are comparable
    kepler_energy.append(transform_joules(k_energy, k_unit, "kJ"))
    pdu_energy.append(transform_joules(p_energy, p_unit, "kJ"))

print(kepler_energy, pdu_energy, user_counts)

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))

# Lines with point markers, drawn on top of the filled areas (high zorder)
ax.plot(
    user_counts, kepler_energy,
    label='Kepler Energy Consumption', color='blue', marker='o', markersize=4, zorder=50,
)
ax.plot(
    user_counts, pdu_energy,
    label='PDU Energy Consumption', color='green', marker='o', markersize=4, zorder=50,
)

# Stepped, semi-transparent areas below each series; the Kepler area lies above the PDU area
ax.fill_between(user_counts, kepler_energy, color='blue', alpha=0.6, step='post', zorder=5)
ax.fill_between(user_counts, pdu_energy, color='green', alpha=0.6, step='post', zorder=1)

# Labels and title
ax.set(
    xlabel='User Count',
    ylabel='Energy Consumption (KiloJoules)',
    title='Energy Consumption by User Count Stage (Kepler vs PDU)',
)

# One tick per stage, labelled with its user count
ax.set_xticks(user_counts)
ax.set_xticklabels(user_counts)
ax.legend()
ax.grid(True)

# Rotate x-ticks
plt.xticks(rotation=45)

# Show plot
plt.show()

In [None]:
def _requests_per_kj(request_count, energy_kj):
    """Return request_count / energy_kj, or NaN when no energy was recorded."""
    # A zero denominator would give inf (numpy) or ZeroDivisionError (plain ints)
    return request_count / energy_kj if energy_kj else float('nan')


# Per-stage results, index-aligned with `stages`
energy_efficiency_kepler = []
energy_efficiency_pdu = []
successful_requests_per_stage = []
failed_requests_per_stage = []
requests_per_stage = []

# Loop through each stage; `position` indexes the per-stage energy lists,
# which hold exactly one entry per stage in the same order
for position, stage in enumerate(stages):
    stage_users = stage['users']
    stage_df = locust_df[locust_df['User Count'] == stage_users]

    # Append zeros for a stage without Locust rows and continue
    if stage_df.empty:
        print("The filtered DataFrame is empty. Ensure the time range is within the data bounds.")
        energy_efficiency_kepler.append(0)
        energy_efficiency_pdu.append(0)
        successful_requests_per_stage.append(0)
        failed_requests_per_stage.append(0)
        requests_per_stage.append(0)
        continue

    # Highest running totals seen during the stage.
    # NOTE(review): this equals the stage's own request count only if Locust
    # resets its statistics at every stage change - confirm.
    requests = stage_df['Total Request Count'].max()
    failed_requests = stage_df['Total Failure Count'].max()
    successful_requests = requests - failed_requests

    # Energy efficiency: successful requests per kJ consumed in this stage
    energy_efficiency_kepler.append(_requests_per_kj(successful_requests, kepler_energy[position]))
    energy_efficiency_pdu.append(_requests_per_kj(successful_requests, pdu_energy[position]))
    successful_requests_per_stage.append(successful_requests)
    failed_requests_per_stage.append(failed_requests)
    requests_per_stage.append(requests)

# Calculate total energy efficiency over all stages
total_successful_requests = sum(successful_requests_per_stage)
total_energy_kepler_kj = sum(kepler_energy)
total_energy_pdu_kj = sum(pdu_energy)

total_efficiency_kepler = _requests_per_kj(total_successful_requests, total_energy_kepler_kj)
total_efficiency_pdu = _requests_per_kj(total_successful_requests, total_energy_pdu_kj)

print("Total Energy Efficiency (Kepler):", total_efficiency_kepler, "successful requests/kJ")
print("Total Energy Efficiency (PDU):", total_efficiency_pdu, "successful requests/kJ")

In [None]:
# Plot total, successful and failed request counts for each stage
fig, ax1 = plt.subplots(figsize=(16, 8))
color = 'tab:blue'
# Label the x-axis so the figure can be read on its own
ax1.set_xlabel('User Count')
ax1.set_ylabel('Request Counts', color=color)

# One dashed line per request category
ax1.plot(user_counts, requests_per_stage, label='Total Requests', color='red', marker='s', linestyle='--')
ax1.plot(user_counts, successful_requests_per_stage, label='Successfull Requests', color='green', marker='s', linestyle='--')
ax1.plot(user_counts, failed_requests_per_stage, label='Failed Requests', color='orange', marker='x', linestyle='--')

ax1.tick_params(axis='y', labelcolor=color)
ax1.legend(loc='upper right')

# Ensure the y-axis starts at 0
ax1.set_ylim(bottom=0)

# Set x-axis ticks and labels only to show user counts
ax1.set_xticks(user_counts)
ax1.set_xticklabels(user_counts)

plt.title('Aggregated Total, Successfully and Failed Requests by User Count Stage (Resetted on each Stage)')
plt.xticks(rotation=45)

# Show plot
plt.show()

In [None]:
# Plot energy efficiency per stage (left axis) against successful requests (right axis)
fig, ax1 = plt.subplots(figsize=(16, 8))

color = 'tab:blue'
ax1.set_xlabel('User Count')
ax1.set_ylabel('Energy Efficiency', color=color)

# Plot energy efficiency
ax1.plot(user_counts, energy_efficiency_kepler, label='Kepler Energy Efficiency', color='blue', marker='o', markersize=4)
ax1.plot(user_counts, energy_efficiency_pdu, label='PDU Energy Efficiency', color='green', marker='o', markersize=4)

ax1.tick_params(axis='y', labelcolor=color)
ax1.legend(loc='upper left')
ax1.grid()

# Ensure the y-axis starts at 0
ax1.set_ylim(bottom=0)

# Create second y-axis sharing the x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Request Counts', color=color)

# Plot successful requests
ax2.plot(user_counts, successful_requests_per_stage, label='Successfull Requests', color='red', marker='s', linestyle='--')

ax2.tick_params(axis='y', labelcolor=color)
ax2.legend(loc='upper right')

# Ensure the y-axis starts at 0
ax2.set_ylim(bottom=0)

# Set x-axis ticks and labels only to show user counts
ax1.set_xticks(user_counts)
ax1.set_xticklabels(user_counts)

# After twinx() the current pyplot axes is ax2, whose x-axis is hidden, so
# plt.xticks(rotation=45) would rotate invisible labels. Title and rotation are
# therefore applied explicitly to ax1, which owns the visible x-axis.
ax1.set_title('Energy Efficiency and Request Counts by User Count Stage (Kepler vs PDU)')
ax1.tick_params(axis='x', labelrotation=45)

# Show plot
plt.show()

In [None]:
# Load the Locust summary statistics and drop the 'Aggregated' summary row
locust_requests_count_df = pd.read_csv(locust_data_file).loc[
    lambda frame: frame['Name'] != 'Aggregated'
]

# Kepler columns whose header mentions container_namespace="default"
namespace_default_columns = [
    header for header in kepler_df.columns
    if 'container_namespace="default"' in header
]

# Unique container names taken from the container_name="..." part of those headers
service_names = {
    header.split('container_name="')[1].split('"')[0]
    for header in namespace_default_columns
}

def calculate_service_energy_consumption(kepler_df, service_name, start_time, end_time):
    """Energy recorded for one service between two timestamps, in joules.

    Only columns whose header contains ``container_name="<service_name>"`` are
    considered. For each of them the value at the first sample at or after
    ``start_time`` is subtracted from the value at the last sample at or before
    ``end_time``; the differences are summed.
    NOTE(review): this treats the columns as cumulative joule counters - confirm
    against the Kepler export.

    Args:
        kepler_df: DataFrame with a datetime 'Time' column and Kepler metric columns.
        service_name: Container name to select columns for.
        start_time: Start of the window, comparable with the 'Time' column.
        end_time: End of the window, comparable with the 'Time' column.

    Returns:
        The energy as a float. ``0.0`` is returned when the service has no
        columns, the frame is empty, or the window does not overlap the data.
        Every path returns a plain number so callers can divide by the result.
    """
    # Work on a chronologically sorted copy so iloc[0] / iloc[-1] are first / last sample
    kepler_df = kepler_df.sort_values(by='Time')

    # Filter only the columns related to the specific service
    service_columns = [col for col in kepler_df.columns if f'container_name="{service_name}"' in col]

    if not service_columns:
        print(f"No data available for service: {service_name}")
        return 0.0

    # Without any sample there is nothing to measure (iloc[0] would raise IndexError)
    if kepler_df.empty:
        return 0.0

    first_timestamp = kepler_df['Time'].min()
    last_timestamp = kepler_df['Time'].max()

    # Window entirely before or entirely after the recorded data: no consumption
    if end_time < first_timestamp or start_time > last_timestamp:
        return 0.0

    # Counter values at the start of the window
    if start_time < first_timestamp:
        print(f"Start time {start_time} is before the first timestamp. Using the first available value.")
        start_values = kepler_df.iloc[0][service_columns]
    else:
        start_values = kepler_df.loc[kepler_df['Time'] >= start_time].iloc[0][service_columns]

    # Counter values at the end of the window
    if end_time > last_timestamp:
        print(f"End time {end_time} is after the last timestamp. Using the last available value.")
        end_values = kepler_df.iloc[-1][service_columns]
    else:
        end_values = kepler_df.loc[kepler_df['Time'] <= end_time].iloc[-1][service_columns]

    # Per-column increase over the window, summed over the service's columns.
    # The sum is already in the columns' native unit, so no unit round trip is needed.
    energy_difference = end_values.values - start_values.values
    return float(energy_difference.sum())

def calculate_metrics(service_list, start_time, end_time):
    """Build a per-service table of energy, request counts and efficiency.

    Reads the module-level ``kepler_df`` (energy) and ``locust_requests_count_df``
    (request statistics). Requests are attributed to a service when the request
    'Name' contains the service name as a literal substring, so a name that is a
    substring of another service's name also picks up that service's requests.

    Args:
        service_list: Iterable of service (container) names.
        start_time: Start of the energy window.
        end_time: End of the energy window.

    Returns:
        DataFrame indexed by service name (sorted, so the row order is
        reproducible) with the columns 'efficiency', 'total_energy',
        'successful_requests', 'failed_requests' and 'total_requests'.
        'efficiency' is successful requests per unit of energy and NaN when
        no energy was recorded for the service.
    """
    metrics_results = {}
    # A set has no stable iteration order; sort for a deterministic result
    for service in sorted(service_list):
        total_energy = calculate_service_energy_consumption(kepler_df, service, start_time, end_time)

        # regex=False: the service name is plain text, not a regular expression
        service_rows = locust_requests_count_df[
            locust_requests_count_df['Name'].str.contains(service, na=False, regex=False)
        ]
        total_requests = service_rows['Request Count'].sum()
        failed_requests = service_rows['Failure Count'].sum()
        successful_requests = total_requests - failed_requests

        # Dividing by zero energy would yield inf, which then passes later "> 0" filters
        efficiency = successful_requests / total_energy if total_energy else float('nan')

        metrics_results[service] = {
            'efficiency': efficiency,
            'total_energy': total_energy,
            'successful_requests': successful_requests,
            'failed_requests': failed_requests,
            'total_requests': total_requests,
        }
    return pd.DataFrame(metrics_results).transpose()

# Per-service metrics for the window given by the module-level start_time / end_time
# as they are at this point of the notebook.
# NOTE(review): confirm these still span the whole run - the request counts come from
# data_stats.csv, which presumably holds totals for the entire run.
service_metrics = calculate_metrics(service_names, start_time, end_time)
service_metrics

In [None]:
# Visualise the results: one bar per service with a positive efficiency
filtered_service_metrics = service_metrics.loc[service_metrics['efficiency'] > 0]

fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(filtered_service_metrics.index, filtered_service_metrics["efficiency"])

ax.set(
    xlabel='Services',
    ylabel='Energy Efficiency (Success Requests / Joules)',
    title='Energy Efficiency per Service',
)
ax.grid(True)

plt.show()

In [None]:
def _db_energy_for(row):
    """Return the 'total_energy' of the row's "<service>-db" companion, or 0 if there is none."""
    db_service = f"{row.name}-db"
    if db_service in service_metrics.index:
        return service_metrics.loc[db_service, 'total_energy']
    return 0


# Start from all services that are not themselves a "-db" companion
combined_metrics = service_metrics[~service_metrics.index.str.endswith('-db')].copy()

# Add DB energy consumption to the respective services
combined_metrics['total_energy'] += combined_metrics.apply(_db_energy_for, axis=1)

# Recalculate efficiency with the combined energy
combined_metrics['efficiency'] = combined_metrics['successful_requests'] / combined_metrics['total_energy']

combined_metrics

In [None]:
# Only services with a positive efficiency are shown
filtered_combined_metrics = combined_metrics.loc[combined_metrics['efficiency'] > 0]

fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(filtered_combined_metrics.index, filtered_combined_metrics["efficiency"])
ax.set(
    xlabel='Services',
    ylabel='Energy Efficiency (Success Requests / Joules)',
    title='Energy Efficiency per Service (Including DB Consumption)',
)
ax.grid(True)
plt.show()