In [22]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

# Regular expressions
failure_pattern = re.compile(
    r'I(\d{8} \d{2}:\d{2}:\d{2}\.\d+).*Leader has stopped for term: (\d+)'
)
election_pattern = re.compile(
    r'I(\d{8} \d{2}:\d{2}:\d{2}\.\d+).*Became leader\. Starting to send heartbeats\. Elected Term: (\d+)')

def parse_log_file(file_path):
    failures = []
    elections = []

    with open(file_path, 'r') as file:
        for line in file:
            # Check for failure events
            failure_match = failure_pattern.search(line)
            if failure_match:
                timestamp_str = failure_match.group(1)
                timestamp = datetime.strptime(timestamp_str, '%Y%m%d %H:%M:%S.%f')
                term = int(election_match.group(2))
                failures.append({'timestamp': timestamp, 'term': term, 'file': os.path.basename(file_path)})
                continue

            # Check for election events
            election_match = election_pattern.search(line)
            if election_match:
                timestamp_str = election_match.group(1)
                term = int(election_match.group(2))
                timestamp = datetime.strptime(timestamp_str, '%Y%m%d %H:%M:%S.%f')
                elections.append({'timestamp': timestamp, 'term': term, 'file': os.path.basename(file_path)})
                continue

    return failures, elections

def parse_all_logs(log_directory):
    all_failures = []
    all_elections = []

    for filename in os.listdir(log_directory):
        if filename.startswith('node_') and filename.endswith('.log'):
            file_path = os.path.join(log_directory, filename)
            print(file_path)
            failures, elections = parse_log_file(file_path)
            print(f'Found {len(failures)} failures and {len(elections)} elections')
            all_failures.extend(failures)
            all_elections.extend(elections)

    return all_failures, all_elections

In [23]:
log_directory = './logs'  # Change this if your logs are in a different directory

# Parse all logs
all_failures, all_elections = parse_all_logs(log_directory)

# Convert to DataFrames
failures_df = pd.DataFrame(all_failures)
elections_df = pd.DataFrame(all_elections)

print(failures_df)
print(elections_df)

./logs/node_2.log
Found 1 failures and 2 elections
./logs/node_0.log
Found 4 failures and 4 elections
./logs/node_4.log
Found 2 failures and 3 elections
./logs/node_1.log
Found 3 failures and 3 elections
./logs/node_3.log
Found 2 failures and 2 elections
                    timestamp        file
0  2024-09-26 18:37:33.164552  node_2.log
1  2024-09-26 18:37:30.834817  node_0.log
2  2024-09-26 18:37:40.441008  node_0.log
3  2024-09-26 18:37:49.777864  node_0.log
4  2024-09-26 18:37:54.468081  node_0.log
5  2024-09-26 18:37:28.479329  node_4.log
6  2024-09-26 18:37:38.066117  node_4.log
7  2024-09-26 18:37:42.771521  node_1.log
8  2024-09-26 18:37:47.484929  node_1.log
9  2024-09-26 18:37:52.121856  node_1.log
10 2024-09-26 18:37:35.541015  node_3.log
11 2024-09-26 18:37:45.116174  node_3.log
                    timestamp  term        file
0  2024-09-26 18:37:30.854623     3  node_2.log
1  2024-09-26 18:37:54.507366    14  node_2.log
2  2024-09-26 18:37:28.525143     2  node_0.log
3  2024

In [13]:


# Sort DataFrames
failures_df = failures_df.sort_values('timestamp').reset_index(drop=True)
elections_df = elections_df.sort_values('timestamp').reset_index(drop=True)

# Calculate time differences
time_differences = []

for index, failure_row in failures_df.iterrows():
    failure_time = failure_row['timestamp']
    failure_file = failure_row['file']

    # Find the next election event after the failure
    subsequent_elections = elections_df[elections_df['timestamp'] > failure_time]

    if not subsequent_elections.empty:
        # Assuming the next election event is the new leader election
        next_election = subsequent_elections.iloc[0]
        election_time = next_election['timestamp']
        term = next_election['term']

        # Calculate time difference in milliseconds
        time_diff = (election_time - failure_time).total_seconds() * 1000

        time_differences.append({
            'term': term,
            'failure_time': failure_time,
            'election_time': election_time,
            'time_difference_ms': time_diff
        })

# Create DataFrame of time differences
differences_df = pd.DataFrame(time_differences)

# Output the time differences
print(differences_df)

# Plot the CDF
time_diff_values = differences_df['time_difference_ms'].values
sorted_times = np.sort(time_diff_values)
cdf = np.arange(1, len(sorted_times)+1) / len(sorted_times)

plt.figure(figsize=(10, 6))
plt.plot(sorted_times, cdf, marker='.', linestyle='none')
plt.xlabel('Time to Elect New Leader (ms)')
plt.ylabel('CDF')
plt.title('CDF of Time Needed to Elect a New Leader After Failure')
plt.grid(True)
plt.show()

KeyError: 'timestamp'