# Read input data

## Frontiers

In [None]:
import re
import pprint

# Describe one log record as a sequence of per-line regex fragments.
# Capture groups, in order: binary, dataset, thread count, frontier sizes,
# frontier max-degree differences, runtime.
RE_IGNORE_OPTIONAL_LINE = r'(?:[^\n]*)?'  # not used by the pattern built below
pattern = [
    r'Running ([/\w]+) on (\w+)[^\n]*',
    r'Number of threads: (\d+)',
    r'Frontier sizes: ((?:\d+ )+)',
    r'Frontier max deg diff: ((?:\d+ )+)',
    r'Runtime: (\d+\.\d+)'
]
# NOTE: the newline between fragments is optional ('\n?'), so consecutive
# fragments can also match when they sit on the same line. This ? may cause
# unexpected behavior on malformed logs.
pattern = r'\n?'.join(pattern)
print(f'Pattern: {pattern}')

# Read the text to parse from 'data_datasets/frontiers.log'
with open('data_datasets/frontiers.log', 'r') as log_file:
    file_txt = log_file.read()

# Find all matches in the string; with several capture groups, re.findall
# returns one tuple of strings per matched record.
matches = re.findall(pattern, file_txt)

# Convert matches to a list of dictionaries, unpacking the groups by name
# instead of indexing them positionally.
frontier_data = [{
    'binary': binary,
    'dataset': dataset,
    'threads': int(threads),
    'frontier_sizes': [int(size) for size in sizes.split()],
    'frontier_max_deg_diff': [int(diff) for diff in deg_diffs.split()],
    'runtime': float(runtime),  # sixth capture group of the pattern
    } for binary, dataset, threads, sizes, deg_diffs, runtime in matches]

pprint.pprint(frontier_data)

## Degree distribution

In [None]:
# Regex fragments for one degree-distribution record, one fragment per log line.
# Capture groups, in order: binary, dataset, average degree, max degree,
# min degree, histogram data.
RE_IGNORE_OPTIONAL_LINE = r'(?:[^\n]*)?'
line_patterns = [
    r'Running ([/\w]+) on (\w+)[^\n]*',
    r'Average degree: (\d+\.\d+)',
    r'Max degree: (\d+)',
    r'Min degree: (\d+)',
    r'Histogram Data: ((?:\(\d+-\d+:\d+\))+)'
]
# The fragments are glued with an optional newline; this ? may cause
# unexpected behavior because fragments may then match on a single line.
pattern = r'\n?'.join(line_patterns)
print(f'Pattern: {pattern}')

# Read the text to parse from 'data_datasets/degree_distribution.log'
with open('data_datasets/degree_distribution.log', 'r') as log_file:
    file_txt = log_file.read()

# Collect every record found in the log (one tuple of groups per record)
matches = re.compile(pattern).findall(file_txt)

# Convert matches to a list of dictionaries
def parse_hist_entry(entry: str) -> tuple[int, int, int]:
    """Parse one histogram bucket written as ``"<low>-<high>:<count>"``.

    Args:
        entry: Bucket text without the surrounding parentheses,
            e.g. ``"0-9:42"``.

    Returns:
        A ``(low, high, count)`` tuple of integers.

    Raises:
        ValueError: If the entry does not consist of exactly one ``:`` and
            one ``-`` separated integer fields.
    """
    # Named unpacking avoids shadowing the builtins ``min`` and ``max``.
    bounds, count = entry.split(":")
    low, high = bounds.split("-")
    return int(low), int(high), int(count)

# Build one record per regex match. The histogram group looks like
# "(a-b:c)(d-e:f)...": strip the outer parentheses, split on ")(" and parse
# each bucket with parse_hist_entry.
degree_data = [
    {
        'binary': binary,
        'dataset': dataset,
        'avg': float(avg_degree),
        'max': int(max_degree),
        'min': int(min_degree),
        'degrees': [parse_hist_entry(bucket) for bucket in histogram[1:-1].split(")(")],
    }
    for binary, dataset, avg_degree, max_degree, min_degree, histogram in matches
]

pprint.pprint(degree_data)

In [None]:
# List the dataset name of every parsed degree-distribution record
for record in degree_data:
    dataset_name = record['dataset']
    print(dataset_name)

# Generate plots on frontiers

In [None]:
import matplotlib.pyplot as plt

# Partition the runs by the number of recorded frontier steps: fewer than 100
# goes to the "small" group, 100 or more to the "large" group.
small_frontier_data = []
large_frontier_data = []
for item in frontier_data:
    if len(item['frontier_sizes']) < 100:
        small_frontier_data.append(item)
    else:
        large_frontier_data.append(item)

# Figure 1: frontier size per step, one panel per group
size_panels = (
    (small_frontier_data, 'Small-diameter Graphs'),
    (large_frontier_data, 'Large-diameter Graphs'),
)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for ax, (items, title) in zip(axes, size_panels):
    for item in items:
        ax.plot(item['frontier_sizes'], label=item['dataset'])
    ax.set_title(title)
    ax.set_xlabel('Step')
    ax.set_ylabel('Frontier Size')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

# Figure 2: max degree difference per step on a logarithmic y axis
diff_panels = (
    (small_frontier_data, 'Small-diameter Graphs - Max Degree Difference'),
    (large_frontier_data, 'Large-diameter Graphs - Max Degree Difference'),
)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for ax, (items, title) in zip(axes, diff_panels):
    for item in items:
        ax.plot(item['frontier_max_deg_diff'], label=item['dataset'])
    ax.set_title(title)
    ax.set_xlabel('Step')
    ax.set_ylabel('Max Degree Difference')
    ax.set_yscale('log')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# One boxplot panel per group; each box summarizes the frontier sizes of a
# single dataset and is labelled with that dataset's name.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

box_panels = (
    (small_frontier_data, 'Small-diameter Graphs'),
    (large_frontier_data, 'Large-diameter Graphs'),
)
for ax, (items, title) in zip(axes, box_panels):
    size_series = [item['frontier_sizes'] for item in items]
    dataset_names = [item['dataset'] for item in items]
    ax.boxplot(size_series, tick_labels=dataset_names)
    ax.set_title(title)
    ax.set_xlabel('Dataset')
    ax.set_ylabel('Frontier Size')

plt.tight_layout()
plt.show()

# Generate plots on degrees

In [None]:
import matplotlib.pyplot as plt

# Lay the per-dataset histograms out on a grid with a fixed number of columns
num_datasets = len(degree_data)
num_cols = 2
# Ceiling division; keep at least one row because plt.subplots rejects nrows=0
# (which is what an empty degree_data would otherwise request).
num_rows = max(1, (num_datasets + num_cols - 1) // num_cols)

# squeeze=False always yields a 2-D array of axes, so flatten() is safe for
# any grid shape.
fig, axes = plt.subplots(
    nrows=num_rows,
    ncols=num_cols,
    figsize=(20, 5 * num_rows),
    squeeze=False,
)

# Flatten axes for easy iteration
axes = axes.flatten()

# Plot each dataset on its own axes
for ax, data in zip(axes, degree_data):
    # Skip buckets with a zero count (entries are (low, high, count) tuples)
    non_empty = [entry for entry in data['degrees'] if entry[2] > 0]
    ranges = [f"{low}-{high}" for low, high, _ in non_empty]
    frequencies = [count for _, _, count in non_empty]
    ax.bar(ranges, frequencies)
    ax.set_title(data['dataset'])
    ax.set_xlabel('Degree Range')
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=90)
    ax.set_yscale('log')
    ax.grid(True)

# Remove any empty subplots: every axes beyond the last plotted dataset.
# Slicing by the dataset count does not depend on a leaked loop index and is
# also correct when there are no datasets at all.
for unused_ax in axes[num_datasets:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()