# Read input data

## Frontiers

In [None]:
import re
import pprint
import os

# Regex fragment that consumes the rest of a single line, whatever it holds.
RE_IGNORE_OPTIONAL_LINE = r'(?:[^\n]*)?'

# One sub-pattern per log line of a run record. The capture groups are, in
# order: binary path, dataset name, thread count, three series of
# space-terminated integers, and the runtime.
# NOTE: the lines are glued together with an *optional* newline ('\n?'),
# which is more permissive than a strict line break and may lead to
# unexpected matches.
pattern = r'\n?'.join((
    r'Running ([/\w]+) on (\w+)[^\n]*',
    r'Number of threads: (\d+)',
    r'Frontier sizes: ((?:\d+ )+)',
    r'Frontier edges: ((?:\d+ )+)',
    r'Frontier max deg diff: ((?:\d+ )+)',
    r'Runtime: (\d+\.\d+)',
))
print(f'Pattern: {pattern}')

# Load the whole frontier log into memory.
with open('data_datasets/frontiers.log', 'r') as log_file:
    file_txt = log_file.read()

# Each match is a 7-tuple with one string per capture group.
matches = re.findall(pattern, file_txt)

# Build one record per run; the captured runtime is not stored.
frontier_data = [
    {
        'binary': binary,
        'dataset': dataset,
        'threads': int(threads),
        'frontier_sizes': [int(tok) for tok in sizes.split()],
        'frontier_deg': [int(tok) for tok in edges.split()],
        'frontier_max_deg_diff': [int(tok) for tok in max_deg_diffs.split()],
    }
    for binary, dataset, threads, sizes, edges, max_deg_diffs, _runtime in matches
]

pprint.pprint(frontier_data)

In [None]:
# Regex for one run record in the balancing logs. Only the dataset name and
# the "<int>-<int>" switch tokens are captured; the remaining lines (including
# two arbitrary lines after the "Running ..." header) are matched but dropped.
pattern = [
    r'Running [/\w]+ on (\w+)[^\n]*',
    RE_IGNORE_OPTIONAL_LINE,
    RE_IGNORE_OPTIONAL_LINE,
    r'Number of threads: \d+',
    r'Thread tot iterations: (?:\d+ )*',
    r'Thread tot writes: (?:\d+ )*',
    r'Top-down Bottom-up switches: ((?:\d+-\d+ )*)',
    r'Runtime: \d+\.[\d]+'
]
pattern = r'\n'.join(pattern)

# List all files in the 'data_balancing' directory
all_files = os.listdir('data_balancing')

# Keep only the logs whose name starts with 'balancing_'
files = [f for f in all_files if f.startswith('balancing_')]

# Maps the log name (file name without the 'balancing_' prefix and without
# its extension) to the list of per-run records parsed from that file.
switch_data = {}

for filename in files:
    # Read the whole balancing log
    with open(f'data_balancing/{filename}', 'r') as file:
        txt = file.read()

    # Each match is a (dataset, switches) pair of strings
    matches = re.findall(pattern, txt)

    # Strip the extension with splitext rather than a fixed 4-character
    # slice, so extensions of any length (or none) yield a clean key.
    key = os.path.splitext(filename)[0][len('balancing_'):]

    # Every switch token "<a>-<b>" becomes an (a, b) tuple of ints; the
    # plotting cells unpack these pairs as (iteration, direction).
    switch_data[key] = [{
        'dataset': dataset,
        'td_bu_switches': [
            tuple(int(part) for part in token.split('-'))
            for token in switches.split()
        ],
        } for dataset, switches in matches]

pprint.pprint(switch_data)

## Degree distribution

In [None]:
# Regex fragment that consumes the rest of a single line (not used by the
# pattern built here, but kept defined).
RE_IGNORE_OPTIONAL_LINE = r'(?:[^\n]*)?'

# One sub-pattern per log line of a degree-distribution record. Capture
# groups, in order: binary path, dataset name, average degree, max degree,
# min degree and the raw histogram text "(lo-hi:count)(lo-hi:count)...".
# NOTE: the lines are joined with an *optional* newline ('\n?'), which may
# lead to unexpected matches.
pattern = r'\n?'.join((
    r'Running ([/\w]+) on (\w+)[^\n]*',
    r'Average degree: (\d+\.\d+)',
    r'Max degree: (\d+)',
    r'Min degree: (\d+)',
    r'Histogram Data: ((?:\(\d+-\d+:\d+\))+)',
))
print(f'Pattern: {pattern}')

# Load the whole degree-distribution log into memory.
with open('data_datasets/degree_distribution.log', 'r') as log_file:
    file_txt = log_file.read()

# Each match is a 6-tuple with one string per capture group.
matches = re.findall(pattern, file_txt)

# Helper for converting the histogram text of each match into numeric tuples
def parse_hist_entry(entry) -> tuple[int, int, int]:
    """Parse one histogram bucket written as ``"<low>-<high>:<count>"``.

    Args:
        entry: Bucket text without the surrounding parentheses,
            e.g. ``"0-9:42"``.

    Returns:
        The tuple ``(low, high, count)`` with all three values as ints.

    Raises:
        IndexError: If the ``:`` or ``-`` separator is missing.
        ValueError: If any of the three fields is not an integer.
    """
    # Local names deliberately avoid the builtins ``min`` and ``max``.
    fields = entry.split(":")
    bounds = fields[0].split("-")
    low = int(bounds[0])
    high = int(bounds[1])
    return low, high, int(fields[1])

# One record per matched run. The histogram text "(a-b:c)(d-e:f)..." loses
# its outer parentheses and is split on ")(" so that every bucket can be
# parsed on its own.
degree_data = [
    {
        'binary': binary,
        'dataset': dataset,
        'avg': float(avg_deg),
        'max': int(max_deg),
        'min': int(min_deg),
        'degrees': [parse_hist_entry(bucket) for bucket in histogram[1:-1].split(")(")],
    }
    for binary, dataset, avg_deg, max_deg, min_deg, histogram in matches
]

pprint.pprint(degree_data)

# Generate plots on frontiers

In [None]:
# Dump every recorded switch stored under the 'small' key, dataset by dataset.
for run in switch_data['small']:
    print(f'dataset: {run["dataset"]}')
    for entry in run['td_bu_switches']:
        print(f'switch: {entry}')

In [None]:
import matplotlib.pyplot as plt

# Font sizes used by the frontier-size figure.
FONT_TITLE = 20
FONT_AXES = 20
FONT_TICKS = 18
FONT_LEGEND = 16

# Optional overlays: direction-switch annotations and the frontier-edge series.
PLOT_SWITCHES = False
PLOT_FRONTIER_DEG = False
colors = ['b', 'g', 'r', 'k', 'm', 'y', 'c', 'orange', 'purple', 'brown']
colors1 = colors[::-1]  # reversed palette, used for the secondary series


def _short_count(value):
    """Format *value* for an annotation: whole thousands plus 'k' above 1000."""
    if value > 1e3:
        return str(int(value / 1e3)) + 'k'
    return str(value)


# Split the runs by how many iterations they recorded: "small" means fewer
# than 100 entries in 'frontier_sizes', "large" means 100 or more.
small_frontier_data = [item for item in frontier_data if len(item['frontier_sizes']) < 100]
large_frontier_data = [item for item in frontier_data if len(item['frontier_sizes']) >= 100]
for group in (small_frontier_data, large_frontier_data):
    for i, data in enumerate(group):
        # Wrap around the palette so a group with more datasets than colours
        # reuses colours instead of raising IndexError.
        data['color'] = colors[i % len(colors)]
        data['color1'] = colors1[i % len(colors1)]

# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
# Set to a list of dataset names to restrict the left panel, e.g.
# ['Social_Network_1'] (others seen: Collaboration_Network_1, Web_Graph_1,
# Synthetic_Dense_1). None plots every dataset.
datasets_filter = None

# Left panel: runs with few iterations
for item in small_frontier_data:
    if datasets_filter is not None and item['dataset'] not in datasets_filter:
        continue
    axes[0].plot(item['frontier_sizes'], label=item['dataset'], color=item['color'])
    if PLOT_FRONTIER_DEG:
        axes[0].plot(item['frontier_deg'], label=item['dataset']+'_deg', color=item['color1'])

if PLOT_SWITCHES:
    for switches in switch_data['small']:
        if datasets_filter is not None and switches['dataset'] not in datasets_filter:
            continue
        # Frontier record of the same dataset (raises StopIteration if absent)
        item = next(entry for entry in small_frontier_data if entry['dataset'] == switches['dataset'])
        fronties = item['frontier_sizes']
        fronties_deg = item['frontier_deg']
        # 'direction' rather than 'dir' so the builtin dir() stays usable
        for i, direction in switches['td_bu_switches']:
            step = i - 1  # switch indices are shifted by one w.r.t. the series
            arrow = r'$\uparrow$' if direction == 1 else r'$\downarrow$'
            # NOTE(review): the fixed +1e7 offset lifts the label far above
            # the curve; confirm it is intended.
            axes[0].text(step, fronties[step]+1e7, arrow+_short_count(fronties[step]), color=item['color'], ha='center', va='bottom')
            axes[0].text(step, fronties_deg[step], arrow+_short_count(fronties_deg[step]), color=item['color1'], ha='center', va='bottom')

axes[0].set_title('Large-frontier Graphs', fontsize=FONT_TITLE)
axes[0].set_xlabel('Iteration', fontsize=FONT_AXES)
axes[0].set_xticks(range(0, 14, 1))  # fixed ticks 0..13
axes[0].set_ylabel('Frontier Size (vertices count)', fontsize=FONT_AXES)
axes[0].set_yscale('log')
axes[0].tick_params(axis='x', labelsize=FONT_TICKS)
axes[0].tick_params(axis='y', labelsize=FONT_TICKS)
axes[0].legend(loc='best', fontsize=FONT_LEGEND)
axes[0].grid(True, linestyle=':')

# Right panel: runs with many iterations (shares the y-label of the left panel)
for item in large_frontier_data:
    axes[1].plot(item['frontier_sizes'], label=item['dataset'], color=item['color'])
axes[1].set_title('Small-frontier Graphs', fontsize=FONT_TITLE)
axes[1].set_xlabel('Iteration', fontsize=FONT_AXES)
axes[1].set_yscale('log')
axes[1].tick_params(axis='x', labelsize=FONT_TICKS)
axes[1].tick_params(axis='y', labelsize=FONT_TICKS)
axes[1].legend(loc='best', fontsize=FONT_LEGEND)
axes[1].grid(True, linestyle=':')

plt.tight_layout()
plt.show()

In [None]:
def _short_count(value):
    """Format *value* for an annotation: whole thousands plus 'k' above 1000."""
    if value > 1e3:
        return str(int(value / 1e3)) + 'k'
    return str(value)


# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
# Set to a list of dataset names to restrict the left panel, e.g.
# ['Social_Network_1'] (others seen: Collaboration_Network_1, Web_Graph_1,
# Synthetic_Dense_1). None plots every dataset.
datasets_filter = None

# Left panel: 'frontier_deg' series of the runs with few iterations
for item in small_frontier_data:
    if datasets_filter is not None and item['dataset'] not in datasets_filter:
        continue
    axes[0].plot(item['frontier_deg'], label=item['dataset'], color=item['color'])

if PLOT_SWITCHES:
    for switches in switch_data['small']:
        if datasets_filter is not None and switches['dataset'] not in datasets_filter:
            continue
        # Frontier record of the same dataset (raises StopIteration if absent)
        item = next(entry for entry in small_frontier_data if entry['dataset'] == switches['dataset'])
        fronties = item['frontier_sizes']
        fronties_deg = item['frontier_deg']
        # 'direction' rather than 'dir' so the builtin dir() stays usable
        for i, direction in switches['td_bu_switches']:
            step = i - 1  # switch indices are shifted by one w.r.t. the series
            arrow = r'$\uparrow$' if direction == 1 else r'$\downarrow$'
            # NOTE(review): this first label is positioned from the frontier
            # *sizes* (+1e7) although the panel shows frontier degrees; it
            # looks carried over from the frontier-size figure - confirm.
            axes[0].text(step, fronties[step]+1e7, arrow+_short_count(fronties[step]), color=item['color'], ha='center', va='bottom')
            axes[0].text(step, fronties_deg[step], arrow+_short_count(fronties_deg[step]), color=item['color1'], ha='center', va='bottom')

axes[0].set_title('Small-diameter Graphs')
axes[0].set_xlabel('Step')
axes[0].set_ylabel('Frontier Degree')
axes[0].set_yscale('log')
axes[0].legend()
axes[0].grid(True)

# Right panel: runs with many iterations (shares the y-label of the left panel)
for item in large_frontier_data:
    axes[1].plot(item['frontier_deg'], label=item['dataset'], color=item['color'])
axes[1].set_title('Large-diameter Graphs')
axes[1].set_xlabel('Step')
axes[1].set_yscale('log')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Two side-by-side panels of box plots: one box per dataset, built from that
# dataset's 'frontier_sizes' series.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

panels = (
    (axes[0], small_frontier_data, 'Small-diameter Graphs'),
    (axes[1], large_frontier_data, 'Large-diameter Graphs'),
)
for ax, group, title in panels:
    samples = [entry['frontier_sizes'] for entry in group]
    names = [entry['dataset'] for entry in group]
    ax.boxplot(samples, tick_labels=names)
    ax.set_title(title)
    ax.set_xlabel('Dataset')
    ax.set_ylabel('Frontier Size')

plt.tight_layout()
plt.show()

In [None]:
# Create subplots for frontier_max_deg_diff
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

#axes[0].set_yscale('log')

# Left panel: max degree difference of the runs with few iterations.
# datasets_filter may be None (meaning "plot everything"), so it has to be
# checked before the membership test; 'x not in None' raises TypeError.
for item in small_frontier_data:
    if datasets_filter is not None and item['dataset'] not in datasets_filter:
        continue
    print(item['frontier_max_deg_diff'])
    axes[0].plot(item['frontier_max_deg_diff'], label=item['dataset'], color=item['color'])

# Annotate every recorded switch on the left panel
for switches in switch_data['small']:
    if datasets_filter is not None and switches['dataset'] not in datasets_filter:
        continue
    # Frontier record of the same dataset (raises StopIteration if absent)
    item = next(entry for entry in small_frontier_data if entry['dataset'] == switches['dataset'])
    fronties = item['frontier_max_deg_diff']
    # 'direction' rather than 'dir' so the builtin dir() stays usable
    for i, direction in switches['td_bu_switches']:
        print(f'switch: {i-1, fronties[i-1]}')
        size = fronties[i-1]
        # Abbreviate values above 1000 to whole thousands, e.g. 12345 -> '12k'
        if size > 1e3:
            size = str(int(size/1e3)) + 'k'
        else:
            size = str(size)
        print(size)
        axes[0].text(i-1, fronties[i-1], (r'$\uparrow$' if direction == 1 else r'$\downarrow$')+size, color=item['color'], ha='center', va='bottom')

axes[0].set_title('Small-diameter Graphs - Max Degree Difference')
axes[0].set_xlabel('Step')
axes[0].set_ylabel('Max Degree Difference')
axes[0].legend()
axes[0].grid(True)

# Right panel: runs with many iterations, on a log scale
for item in large_frontier_data:
    axes[1].plot(item['frontier_max_deg_diff'], label=item['dataset'])
axes[1].set_title('Large-diameter Graphs - Max Degree Difference')
axes[1].set_xlabel('Step')
axes[1].set_ylabel('Max Degree Difference')
axes[1].set_yscale('log')
axes[1].legend()
axes[1].grid(True)

# plt.tight_layout()
plt.show()

# Generate plots on degrees

In [None]:
import matplotlib.pyplot as plt

# Determine the number of rows and columns for the plot grid
num_datasets = len(degree_data)
num_cols = 2
num_rows = (num_datasets + num_cols - 1) // num_cols  # ceiling division

# Create subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 5 * num_rows))

# Flatten axes for easy iteration
axes = axes.flatten()

# One bar chart per dataset. The loop variable must not be called
# 'switch_data': that would overwrite the dictionary of balancing switches
# that the frontier-plot cells read, breaking them when re-run.
for ax, dataset_stats in zip(axes, degree_data):
    # Keep only the non-empty buckets; each bucket is (low, high, count)
    degrees = [bucket for bucket in dataset_stats['degrees'] if bucket[2] > 0]
    ranges = [f"{a}-{b}" for a, b, _ in degrees]
    frequencies = [c for _, _, c in degrees]
    ax.bar(ranges, frequencies)
    ax.set_title(dataset_stats['dataset'])
    ax.set_xlabel('Degree Range')
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=90)
    ax.set_yscale('log')
    ax.grid(True)

# Remove the grid cells that did not receive a dataset
for unused_ax in axes[num_datasets:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()