In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import pytz
import plotly.graph_objects as go
import os
import matplotlib.dates as mdates
import re

In [None]:
# --- Run configuration -------------------------------------------------------
# Every input and output path is derived from `basedir`, so pointing the
# notebook at another test run only requires changing this one value.
basedir = "../data/03-communication-efficiency/2024-08-10/17-19/"
containers_data_file = basedir + "containers.csv"
failures_data_file = basedir + "failures.csv"
locust_data_history_file = basedir + "data_stats_history.csv"
locust_data_file = basedir + "data_stats.csv"

# Generated figures and tables are written below the run directory.
outputdir = basedir + "output/"
os.makedirs(outputdir, exist_ok=True)

# Load stages of the test: target user count and spawn rate per stage.
stages = [
    {"users": 100, "spawn_rate": 2},
    {"users": 200, "spawn_rate": 1},
    {"users": 300, "spawn_rate": 1},
    {"users": 400, "spawn_rate": 1},
    {"users": 500, "spawn_rate": 1}
]

# User counts that correspond to a stage target; used to filter the samples.
valid_users = {stage["users"] for stage in stages}

In [None]:
locust_df = pd.read_csv(locust_data_history_file)

# 'Timestamp' is parsed as epoch seconds in UTC and converted to the
# Europe/Berlin timezone, then used as the index.
germany_tz = pytz.timezone('Europe/Berlin')
locust_df['Timestamp'] = pd.to_datetime(locust_df['Timestamp'], unit='s', utc=True).dt.tz_convert(germany_tz)
locust_df = locust_df.set_index('Timestamp').drop(columns=['Type'])

# Keep only samples that (a) already saw at least one request and (b) were
# taken while the user count equals one of the configured stage targets,
# i.e. not during the ramp-up between stages.
is_relevant = (locust_df['Total Request Count'] != 0) & locust_df['User Count'].isin(valid_users)

# Explicit copy: later cells add columns to this frame, which would otherwise
# raise SettingWithCopyWarning because the result is a slice of the raw data.
locust_df = locust_df.loc[is_relevant].copy()
locust_df

In [None]:
locust_requests_count_df = pd.read_csv(locust_data_file)

# Drop the 'Aggregated' summary row. The explicit copy makes the result an
# independent frame, so adding columns below cannot raise
# SettingWithCopyWarning.
locust_requests_count_df = locust_requests_count_df[
    locust_requests_count_df['Name'] != 'Aggregated'
].copy()

# The service is the second '/'-separated token of the request name
# (assumes names look like "/<service>/<path>" -- TODO confirm against the CSV).
locust_requests_count_df['Service'] = locust_requests_count_df['Name'].apply(lambda x: x.split('/')[1])

# Strip only the first "/<service>" occurrence (count=1); a later path segment
# that happens to repeat the service name must stay part of the request path.
locust_requests_count_df['Request Path'] = locust_requests_count_df.apply(
    lambda row: row['Name'].replace(f"/{row['Service']}", "", 1), axis=1)

# Share of every request type in the total number of requests, in percent.
total_requests = locust_requests_count_df['Request Count'].sum()
locust_requests_count_df['Request Percentage'] = (locust_requests_count_df['Request Count'] / total_requests) * 100

# Leaf label: HTTP method, path and percentage.
locust_requests_count_df['Formatted Request Type'] = locust_requests_count_df.apply(
    lambda row: f"{row['Type']} {row['Request Path']} ({row['Request Percentage']:.2f}%)", axis=1)

# Per-service totals. Labels AND values of the inner ring are both taken from
# this one Series, so they are guaranteed to line up. (Previously the labels
# came from unique() -- order of appearance -- while the values came from a
# sorted groupby, which mismatched whenever the CSV was not sorted by service.)
service_totals = locust_requests_count_df.groupby('Service', sort=False)['Request Percentage'].sum()

# Sunburst input: first all leaves (request types), then the service sectors.
labels = locust_requests_count_df['Formatted Request Type'].tolist() + service_totals.index.tolist()
parents = locust_requests_count_df['Service'].tolist() + [''] * len(service_totals)
values = locust_requests_count_df['Request Percentage'].tolist() + service_totals.tolist()

# Create the sunburst plot
fig = go.Figure(go.Sunburst(
    labels=labels,
    parents=parents,
    values=values,
    insidetextorientation='radial',  # Control text orientation
    branchvalues='total',  # a parent's value is the total of its children
))

# Update layout
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    width=800,
    height=800,
    title='Distribution of Requests per Services in Percent'
)

# Show the plot
fig.show()

In [None]:
# Time window spanned by the retained Locust samples: earliest and latest index
# value, converted to Europe/Berlin and then stripped of timezone information
# (naive timestamps).
start_time, end_time = (
    bound.tz_convert("Europe/Berlin").tz_localize(None)
    for bound in (locust_df.index.min(), locust_df.index.max())
)
print("Start:", start_time, "; End:", end_time)

In [None]:
fig, ax1 = plt.subplots(figsize=(16, 8))

# Left axis: request throughput and failure throughput share one scale.
ax1.set(xlabel='Timestamp', ylabel='Request/s & Failure/s')
for metric in ('Requests/s', 'Failures/s'):
    ax1.plot(locust_df.index, locust_df[metric], label=metric, marker='.')
ax1.tick_params(axis='y')
ax1.legend(loc='upper left')
ax1.grid(True)
# One major tick every five minutes, rendered as HH:MM in the germany_tz timezone.
ax1.xaxis.set_major_locator(mdates.MinuteLocator(interval=5))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M', tz=germany_tz))

# Right axis: user count on its own scale, drawn in magenta.
ax2 = ax1.twinx()
ax2.plot(locust_df.index, locust_df['User Count'], label='User Count', marker='.', color='magenta')
ax2.set_ylabel('User Count')
ax2.tick_params(axis='y', labelcolor='magenta')
ax2.legend(loc='upper right')

# Persist the figure next to the other outputs, then display it.
fig.savefig(outputdir + 'requests_by_stage.png', bbox_inches='tight')
plt.show()

In [None]:
# Per user count: the maximum of 'Total Request Count' and 'Total Failure Count',
# exported as a LaTeX table and displayed.
tmp = locust_df.groupby('User Count')[['Total Request Count', 'Total Failure Count']].max()
tmp.to_latex(outputdir + 'total_request_failure_count.tex', escape=True)
tmp

In [None]:
fig, ax1 = plt.subplots(figsize=(16, 8))

# Left axis: median response time.
ax1.set(xlabel='Timestamp', ylabel='Total Median Response Time (ms)')
ax1.plot(locust_df.index, locust_df['Total Median Response Time'], marker='.', label='Median Response Time')
ax1.tick_params(axis='y')
ax1.legend(loc='upper left')
ax1.grid(True)
# One major tick every five minutes, rendered as HH:MM in the germany_tz timezone.
ax1.xaxis.set_major_locator(mdates.MinuteLocator(interval=5))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M', tz=germany_tz))

# Right axis: average response time on its own scale, drawn in magenta.
ax2 = ax1.twinx()
ax2.plot(locust_df.index, locust_df['Total Average Response Time'], marker='.', color='magenta',
         label='Avg Response Time')
ax2.set_ylabel('Total Average Response Time (ms)')
ax2.tick_params(axis='y', labelcolor='magenta')
ax2.legend(loc='upper right')

# Persist the figure next to the other outputs, then display it.
fig.savefig(outputdir + 'requests_time_by_stage.png', bbox_inches='tight')
plt.show()

In [None]:
def _save_response_time_boxplot(column, ylabel, filename):
    """Draw a box plot of `column` grouped by 'User Count', save and show it.

    Parameters
    ----------
    column : str
        Column of `locust_df` whose distribution is plotted.
    ylabel : str
        Label for the y axis.
    filename : str
        File name (inside `outputdir`) the figure is written to.

    Returns
    -------
    Whatever `DataFrame.boxplot` returns for the drawn plot.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # Pass `ax` explicitly: with `by=...` pandas otherwise opens a figure of its
    # own, which left the 12x6 figure empty and ignored the requested size.
    boxplot = locust_df.boxplot(column=column, by='User Count', grid=True, ax=ax)
    # Remove the automatic titles pandas adds for grouped box plots.
    ax.set_title('')
    fig.suptitle('')
    ax.set_xlabel('User Count')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', labelrotation=45)
    fig.savefig(outputdir + filename, bbox_inches='tight')
    plt.show()
    return boxplot


# Box plot for Total Average Response Time
boxplot1 = _save_response_time_boxplot(
    'Total Average Response Time',
    'Total Average Response Time (ms)',
    'box_plt_average_response_time_by_stage.png',
)

# Box plot for Total Median Response Time
# NOTE(review): the file name says "mean" although the plot shows the median;
# kept unchanged because the file may be referenced elsewhere -- confirm and rename.
boxplot2 = _save_response_time_boxplot(
    'Total Median Response Time',
    'Total Median Response Time (ms)',
    'box_plt_mean_response_time_by_stage.png',
)

In [None]:
# Analyse the failure rate.
# Locust's 'Total Request Count' already counts failed requests, so the rate is
# failures / requests. Adding the failures to the denominator again (as done
# before) under-reports the rate. Rows with a request count of 0 were removed
# when the data was loaded, so the division is well defined.
# `assign` builds a new frame instead of writing into a filtered slice.
locust_df = locust_df.assign(**{
    'Failure Rate': locust_df['Total Failure Count'] / locust_df['Total Request Count'],
})

# Mean failure rate per user count as a bar chart.
fig, ax = plt.subplots(figsize=(12, 6))
locust_df.groupby('User Count')['Failure Rate'].mean().plot(kind='bar', rot=45, ax=ax)
ax.set_xlabel('User Count')
ax.set_ylabel('Failure Rate')
fig.savefig(outputdir + 'plt_average_failure_rate_by_stage.png', bbox_inches='tight')
plt.show()

In [None]:
# Read the container metrics, parse 'Time' as datetime and use it as the index.
containers_df = (
    pd.read_csv(containers_data_file)
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
)

# Shorten every column header to the value after the first '=',
# with surrounding spaces, quotes and braces removed.
containers_df.columns = [header.split('=')[1].strip(' "{}') for header in containers_df.columns]
def plot_data(start_time, end_time):
    """Plot all columns of `containers_df` within a time window and save the figure.

    Columns whose name ends with "service" or equals "gateway" are drawn as
    thick solid lines and appear in the legend; every other column is drawn
    dashed and kept out of the legend.

    Parameters
    ----------
    start_time, end_time :
        Inclusive bounds of the window; anything `pd.to_datetime` accepts.
        NOTE(review): must be comparable with the `containers_df` index
        (naive vs. tz-aware) -- confirm against the CSV.
    """
    # Accept strings as well as timestamps, like plot_failure_rate_over_time does.
    start_time = pd.to_datetime(start_time)
    end_time = pd.to_datetime(end_time)

    # Filter data within the specified time range
    in_window = (containers_df.index >= start_time) & (containers_df.index <= end_time)
    filtered_data = containers_df.loc[in_window]

    fig, ax = plt.subplots(figsize=(14, 6))
    for column in filtered_data.columns:
        if column.endswith("service") or column == "gateway":
            ax.plot(filtered_data.index, filtered_data[column], label=column, linewidth=2.0)
        else:
            ax.plot(filtered_data.index, filtered_data[column], label='_nolegend_', linestyle='--')

    ax.set_xlabel('Time')
    ax.set_ylabel('Value')

    # Legend lists only the solid (service / gateway) lines; anchored right of the axes.
    ax.legend(loc='best', bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    ax.grid(True)

    # One tick every five minutes, labelled HH:MM.
    xticks = pd.date_range(start=start_time, end=end_time, freq='5min')
    ax.set_xticks(xticks)
    ax.set_xticklabels([tick.strftime("%H:%M") for tick in xticks], rotation=45)

    # Apply the layout BEFORE saving so the PNG matches the displayed figure.
    fig.tight_layout()
    fig.savefig(outputdir + 'plt_container_replicas.png', bbox_inches='tight')
    plt.show()

# Plot the container data for the window covered by the load test
plot_data(start_time, end_time)

In [None]:
# Read the failure counters with 'Time' parsed as datetime and used as index.
failures_df = pd.read_csv(failures_data_file, parse_dates=['Time'])
failures_df = (
    failures_df
    .assign(Time=pd.to_datetime(failures_df['Time']))  # Ensure consistent datetime format
    .set_index('Time')
    # Carry the last known value forward; gaps before the first value become 0.
    .ffill()
    .fillna(0)
)

# Shorten a metric column header to the five labels of interest.
def extract_keys(column_name):
    """Return a compact column name built from selected labels of `column_name`.

    The labels application, error, outcome, status and uri are searched for in
    exactly that order (other text may sit between them). If all five are
    found, the result is "application=..., error=..., outcome=..., status=...,
    uri=..."; otherwise `column_name` is returned unchanged.
    """
    found = re.search(
        r'application="(.*?)".*?error="(.*?)".*?outcome="(.*?)".*?status="(.*?)".*?uri="(.*?)"',
        column_name,
    )
    if found is None:
        return column_name

    label_names = ("application", "error", "outcome", "status", "uri")
    return ", ".join(f"{name}={value}" for name, value in zip(label_names, found.groups()))

# Replace every column header by its shortened form.
failures_df = failures_df.rename(columns=extract_keys)

# Columns that now share a name are merged by summing them
# (transpose, group by the label, sum, transpose back).
failures_df = failures_df.T.groupby(level=0).sum().T

# Change between consecutive rows; the first row has no predecessor and is set to 0.
failures_rate_df = failures_df.diff().fillna(0)

In [None]:
def plot_failure_rate_over_time(start_time, end_time):
    """Plot every column of `failures_rate_df` within a time window and save the figure.

    Columns with at least one positive value in the window are drawn as thick
    solid lines and appear in the legend; all other columns are drawn dashed
    and kept out of the legend.

    Parameters
    ----------
    start_time, end_time :
        Inclusive bounds of the window; anything `pd.to_datetime` accepts.
    """
    # Ensure start and end times are datetime objects
    start_time = pd.to_datetime(start_time)
    end_time = pd.to_datetime(end_time)

    # Filter data within the specified time range
    in_window = (failures_rate_df.index >= start_time) & (failures_rate_df.index <= end_time)
    filtered_data = failures_rate_df.loc[in_window]

    fig, ax = plt.subplots(figsize=(14, 6))
    for column in filtered_data.columns:
        if filtered_data[column].max() > 0:
            ax.plot(filtered_data.index, filtered_data[column], label=column, linewidth=2.0)
        else:
            ax.plot(filtered_data.index, filtered_data[column], label='_nolegend_', linestyle='--')

    ax.set_xlabel('Time')
    ax.set_ylabel('Failure Rate')

    # Legend sits above the axes so the long labels do not cover the data.
    ax.legend(loc='upper left', bbox_to_anchor=(0, 1.2))
    ax.grid(True)

    # One tick every five minutes, labelled HH:MM.
    xticks = pd.date_range(start=start_time, end=end_time, freq='5min')
    ax.set_xticks(xticks)
    ax.set_xticklabels([tick.strftime("%H:%M") for tick in xticks], rotation=45)

    # Apply the layout BEFORE saving so the PNG matches the displayed figure.
    fig.tight_layout()
    fig.savefig(outputdir + "plt_failure_rate_over_time.png", bbox_inches='tight')
    plt.show()

plot_failure_rate_over_time(start_time, end_time)

In [None]:
# Mean, median and standard deviation of each metric, per user count.
summary_stats = locust_df.groupby('User Count').agg({
    metric: ['mean', 'median', 'std']
    for metric in ('Total Average Response Time', 'Total Median Response Time', 'Failure Rate')
})

# One bar chart per (metric, statistic) pair on a 3x3 grid.
summary_stats.plot.bar(subplots=True, layout=(3, 3), figsize=(18, 16))
plt.show()

In [None]:
# 95% confidence interval of the mean 'Total Median Response Time' per user count.
grouped_stats = locust_df.groupby('User Count')['Total Median Response Time'].agg(['mean', 'count', 'std'])

# Two-sided 95% interval -> 0.975 quantile of Student's t with (count - 1) degrees of freedom.
t_value = stats.t.ppf(0.975, grouped_stats['count'] - 1)

# Margin of error = t * std / sqrt(count); the interval is mean -/+ margin.
grouped_stats = grouped_stats.assign(
    margin_of_error=lambda g: t_value * g['std'] / g['count'].pow(0.5),
    ci_low=lambda g: g['mean'] - g['margin_of_error'],
    ci_high=lambda g: g['mean'] + g['margin_of_error'],
)

# Means connected by a line, with the margin of error as red error bars.
plt.figure(figsize=(10, 6))
plt.errorbar(
    grouped_stats.index,
    grouped_stats['mean'],
    yerr=grouped_stats['margin_of_error'],
    fmt='-o',
    ecolor='r',
    capsize=5,
    capthick=2,
    label='95% CI',
)
plt.xlabel('User Count')
plt.ylabel('Total Median Response Time')
plt.title('95% Confidence Intervals for Mean Total Median Response Time')
plt.grid(True)
plt.legend()
plt.savefig(outputdir + 'confidence_intervals_median_response_time.png', bbox_inches='tight')
plt.show()

In [None]:
# 95% confidence interval of the mean 'Failure Rate' per user count.
grouped_stats = locust_df.groupby('User Count')['Failure Rate'].agg(['mean', 'count', 'std'])

# Two-sided 95% interval -> 0.975 quantile of Student's t with (count - 1) degrees of freedom.
t_value = stats.t.ppf(0.975, grouped_stats['count'] - 1)

# Margin of error = t * std / sqrt(count); the interval is mean -/+ margin.
grouped_stats = grouped_stats.assign(
    margin_of_error=lambda g: t_value * g['std'] / g['count'].pow(0.5),
    ci_low=lambda g: g['mean'] - g['margin_of_error'],
    ci_high=lambda g: g['mean'] + g['margin_of_error'],
)

# Means connected by a line, with the margin of error as red error bars.
plt.figure(figsize=(10, 6))
plt.errorbar(
    grouped_stats.index,
    grouped_stats['mean'],
    yerr=grouped_stats['margin_of_error'],
    fmt='-o',
    ecolor='r',
    capsize=5,
    capthick=2,
    label='95% CI',
)
plt.xlabel('User Count')
plt.ylabel('Failure Rate')
plt.title('95% Confidence Intervals for Mean Failure Rate')
plt.grid(True)
plt.legend()
plt.savefig(outputdir + 'confidence_intervals_failure_rate.png', bbox_inches='tight')
plt.show()