In [7]:
import sys
sys.path.append("/scratch/group/csce435-f24/python-3.10.4/lib/python3.10/site-packages")
sys.path.append("/scratch/group/csce435-f24/thicket")
from glob import glob
import matplotlib.pyplot as plt
import pandas as pd
import thicket as th

# Lift pandas' row/column display limits so whole frames can be inspected
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

Read all files

In [8]:
# Load the performance data into a Thicket dataframe.
# glob() returns paths in arbitrary order, so sort them to make the load
# order reproducible across runs and machines.
cali_files = sorted(glob("cali_outputs/*.cali"))  # Adjust the path to where your cali files are located
if not cali_files:
    # Fail early with a clear message instead of handing an empty list to the reader.
    raise FileNotFoundError("No .cali files found matching 'cali_outputs/*.cali'")
tk = th.Thicket.from_caliperreader(cali_files)

(1/2) Reading Files: 100%|██████████| 254/254 [00:05<00:00, 42.56it/s]
(2/2) Creating Thicket: 100%|██████████| 253/253 [00:07<00:00, 34.97it/s]


In [9]:
# Expose the run parameters as columns of tk.dataframe; the later cells
# filter on these three column names.
for metadata_column in ("input_size", "num_procs", "data_type"):
    tk.metadata_column_to_perfdata(metadata_column)

# tk.dataframe

In [10]:
# import os
# import matplotlib.pyplot as plt

# # Create the directory to save plots if it does not exist
# output_dir = "plots"
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # Define input sizes and thread counts (number of processes)
# n_vals = [65536, 262144, 1048576, 4194304, 67108864]
# thread_counts = [2, 4, 8, 16, 32]  # Replace with actual numbers of threads/ranks as needed
# legends = ["2^16", "2^18", "2^20", "2^22", "2^26"]  # Legend labels for input sizes (67108864 = 2^26)
# data_types = ['s', 'r', 'v', 'p']  # List of data types to filter

# # Loop through each data type
# for data_type in data_types:
#     plt.figure()  # Create a new figure for each data type
#     plt.title(f"Speedup for Data Type: {data_type}")  # Set the title for the plot

#     # Loop through each input size to calculate and plot speedup
#     for n_val, label in zip(n_vals, legends):
#         # Filter the dataframe by input size and data type
#         df_filtered = tk.dataframe[(tk.dataframe['input_size'] == n_val) & (tk.dataframe['data_type'] == data_type)]

#         # Extract baseline time for 2 threads/processes
#         baseline_rows = df_filtered[df_filtered['num_procs'] == 2]
#         if baseline_rows.empty:
#             print(f"No baseline data available for input size {n_val} with 2 processors and data_type {data_type}.")
#             continue

#         baseline_time = baseline_rows['Avg time/rank'].values[0]

#         # Calculate speedup for each thread count
#         speedup_values = []
#         for threads in thread_counts:
#             # Check if data for this number of threads exists
#             thread_row = df_filtered[df_filtered['num_procs'] == threads]
#             if not thread_row.empty:
#                 time_for_threads = thread_row['Avg time/rank'].values[0]
#                 speedup = 2 * baseline_time / time_for_threads
#                 speedup_values.append(speedup)
#             else:
#                 speedup_values.append(None)  # Placeholder for missing data
#                 print(f"No data available for input size {n_val} with {threads} processors and data_type {data_type}.")

#         # Plot speedup for current input size
#         plt.plot(thread_counts, speedup_values, label=label)

#     # Configure plot settings
#     plt.xscale("log", base=2)  # Logarithmic scale for thread counts
#     plt.xlabel("Threads")
#     plt.ylabel("Speedup")
#     plt.legend(title="Input Size")
#     plt.grid(True)  # Optional: add grid for better visibility

#     # Save the plot as an image file in the specified output directory
# #     filename = f"{output_dir}/speedup_data_type_{data_type}.png"
# #     plt.savefig(filename)
# #     print(f"Plot saved as {filename}")

#     # Display the plot for the current data type
#     plt.show()


Weak Scaling Plots

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Create the directory for the saved plots. exist_ok=True avoids the
# check-then-create race and keeps the cell safe to re-run.
output_dir = "weak_scaling_plots"
os.makedirs(output_dir, exist_ok=True)

# MPI Only
# For Weak Scaling

def calculate_weak_scaling(tk, input_type, initial_size=65536, max_threads=1024,
                           starting_threads=None, scale_factor=4):
    """Build weak-scaling series from a Thicket-like object.

    For every starting thread count a series is generated by multiplying both
    the thread count and the input size by ``scale_factor`` at each step, so
    the amount of work per thread stays constant along a series. Each point is
    reported as ``base / avg_time`` where ``base`` is the 'Avg time/rank' of
    the first point of that series.

    Parameters:
    - tk: object whose ``dataframe`` attribute is a pandas DataFrame with the
      columns 'input_size', 'data_type', 'num_procs' and 'Avg time/rank'.
    - input_type: value of the 'data_type' column to select.
    - initial_size: input size used for the first point of every series.
    - max_threads: largest thread count (inclusive) to include in a series.
    - starting_threads: iterable of thread counts that start a series;
      defaults to [2, 4, 8, 16, 32, 64, 128, 256, 512].
    - scale_factor: multiplier applied to both threads and size per step.

    Returns:
    - (thread_series, speedup_series): two parallel lists with one inner list
      per series that produced at least one point. Missing configurations are
      reported with print() and skipped.

    Raises:
    - ValueError: if scale_factor is not greater than 1 (the series would
      never reach max_threads).

    NOTE(review): when several rows match one configuration (e.g. one row per
    call-tree node), only the first matching row is used -- confirm that this
    is the intended region.
    """
    if starting_threads is None:
        starting_threads = [2, 4, 8, 16, 32, 64, 128, 256, 512]
    if scale_factor <= 1:
        raise ValueError("scale_factor must be greater than 1")

    # The data-type filter is identical for every lookup, so apply it once.
    typed_df = tk.dataframe[tk.dataframe['data_type'] == input_type]

    def first_time(size, procs):
        """Return the first 'Avg time/rank' for (size, procs), or None if absent."""
        rows = typed_df[(typed_df['input_size'] == size) &
                        (typed_df['num_procs'] == procs)]
        if rows.empty:
            return None
        return rows['Avg time/rank'].values[0]

    thread_series = []
    speedup_series = []

    for start in starting_threads:
        # Reference time: initial size on the starting number of threads.
        base = first_time(initial_size, start)
        if base is None:
            print(f"No base time available for initial size {initial_size}, input type '{input_type}', and threads {start}")
            continue

        series_threads = []
        series_speedups = []
        size = initial_size
        current_threads = start

        # Grow threads and problem size together until max_threads is exceeded.
        while current_threads <= max_threads:
            avg_time = first_time(size, current_threads)
            if avg_time is None:
                print(f"No data available for size {size}, input type '{input_type}', and threads {current_threads}")
            else:
                series_speedups.append(base / avg_time)
                series_threads.append(current_threads)

            current_threads *= scale_factor
            size *= scale_factor

        # Keep only series that produced at least one point.
        if series_speedups:
            thread_series.append(series_threads)
            speedup_series.append(series_speedups)

    return thread_series, speedup_series

# Iterate through each input type and generate one weak-scaling plot per type
INITIAL_SIZE = 65536  # total input size at the first point of every series
input_types = ['s', 'r', 'v', 'p']
input_type_labels = {
    's': 'Sorted',
    'r': 'Random',
    'v': 'Reverse Sorted',
    'p': '1% Perturbed'
}

for input_type in input_types:
    thread_lists, speedup_lists = calculate_weak_scaling(tk, input_type, initial_size=INITIAL_SIZE)

    # Plot the results if any data was generated
    if speedup_lists:
        fig, ax = plt.subplots(figsize=(10, 6))
        for thread_counts, speedups in zip(thread_lists, speedup_lists):
            # A series starts with INITIAL_SIZE elements on thread_counts[0] threads,
            # and size and threads grow together, so elements per thread is constant
            # along the series. Labelling each line from its own data keeps the legend
            # correct even when a starting configuration was skipped.
            elements_per_thread = INITIAL_SIZE / thread_counts[0]
            ax.plot(thread_counts, speedups, marker='o',
                    label=f"2^{int(np.log2(elements_per_thread))} e/t")

        ax.legend()
        ax.set_xscale("log", base=2)
        ax.set_xlabel("Threads")
        ax.set_ylabel("Speedup (Normalized Time per Rank)")
        ax.set_title(f"Weak Scaling for Input Type: {input_type_labels[input_type]}")
        ax.grid(True)

        # Save the plot as a PNG file
        plot_filename = f"weak_scaling_{input_type_labels[input_type].replace(' ', '_').lower()}.png"
        plot_path = os.path.join(output_dir, plot_filename)
        fig.savefig(plot_path, format='png')
        plt.close(fig)  # release the figure; one is created per input type
        print(f"Plot saved to {plot_path}")
    else:
        print(f"No data available to plot for input type '{input_type}'.")


No data available for size 16777216, input type 's', and threads 1024
Plot saved to weak_scaling_plots/weak_scaling_sorted.png
Plot saved to weak_scaling_plots/weak_scaling_random.png
No data available for size 16777216, input type 'v', and threads 1024
Plot saved to weak_scaling_plots/weak_scaling_reverse_sorted.png
No data available for size 16777216, input type 'p', and threads 1024
Plot saved to weak_scaling_plots/weak_scaling_1%_perturbed.png


In [None]:
# import pandas as pd

# def export_process_time_csv(df, input_size=67108864, input_type="random", output_file="process_time_data.csv"):
#     """
#     Exports the number of processes vs. time for a specified input size and input type to a CSV file.

#     Parameters:
#     - df: DataFrame containing Caliper data with 'input_size', 'input_type', 'num_procs', and 'Avg time/rank' columns.
#     - input_size: Specific input size to filter (default is 67108864, i.e. 2^26).
#     - input_type: Specific input type to filter (default is 'random').
#     - output_file: Name of the output CSV file (default is 'process_time_data.csv').
#     """
#     # Filter the data based on input size and input type
#     filtered_df = df[(df['input_size'] == input_size) &
#                      (df['data_type'] == data_type) &
#                      (df['name'] == "MPI_Comm_dup")]

#     print(filtered_df)
    
#     # Check if there is any data available after filtering
#     if filtered_df.empty:
#         print(f"No data available for input size {input_size} and input type {input_type}.")
#         return
    
#     # Select only the columns for number of processes and average time per rank
#     process_time_df = filtered_df[['num_procs', 'Total time']].sort_values(by='num_procs')



#     # Save the result to a CSV file
#     process_time_df.to_csv(output_file, index=False)
#     print(f"Data successfully saved to {output_file}")

# # Example usage with Caliper data DataFrame (tk.dataframe):
# export_process_time_csv(df=tk.dataframe, input_size=67108864, input_type="random", output_file="merge_sort_process_time.csv")

In [None]:
# import pandas as pd

# def export_speedup_data_csv(df, input_size=67108864, input_type="random", mpi_function="MPI_Comm_dup", output_file="speedup_data.csv"):
#     """
#     Exports the speedup data for a specified input size, input type, and MPI function to a CSV file.

#     Parameters:
#     - df: DataFrame containing Caliper data with 'input_size', 'input_type', 'num_procs', 'Total time', and 'name' columns.
#     - input_size: Specific input size to filter (default is 67108864, i.e. 2^26).
#     - input_type: Specific input type to filter (default is 'random').
#     - mpi_function: Name of the MPI function to filter (default is 'MPI_Comm_dup').
#     - output_file: Name of the output CSV file (default is 'speedup_data.csv').
#     """
#     # Filter the data based on input size, input type, and MPI function
#     filtered_df = df[(df['input_size'] == input_size) &
#                      (df['data_type'] == data_type) &
#                      (df['name'] == mpi_function)]
        
#     print(filtered_df)
    
#     # Check if there is any data available after filtering
#     if filtered_df.empty:
#         print(f"No data available for input size {input_size}, input type {input_type}, and MPI function {mpi_function}.")
#         return
    
#     # Select only the relevant columns
#     process_total_time_df = filtered_df[['num_procs', 'Total time']]
    
#     # Calculate speedup based on the baseline time (using the minimum number of processes)
#     baseline_time = process_total_time_df['Total time'].min()  # Get the baseline time
#     process_total_time_df['Speedup'] = baseline_time / process_total_time_df['Total time']
    
#     # Sort by number of processes
#     process_total_time_df = process_total_time_df.sort_values(by='num_procs')

#     # Save the result to a CSV file
#     process_total_time_df.to_csv(output_file, index=False)
#     print(f"Speedup data successfully saved to {output_file}.")

# # Example usage with Caliper data DataFrame (tk.dataframe):
# export_speedup_data_csv(df=tk.dataframe, input_size=67108864, input_type="random", output_file="merge_sort_speedup_data26.csv")

In [None]:
# def plot_strong_scaling(df, input_size, title):
#     os.makedirs('figures', exist_ok=True)

#     plt.figure(figsize=(15, 7), facecolor='white')
#     export_data = []

    
#     for data_type in df.index.get_level_values('data_type').unique():
#         subset = df.xs((input_size, data_type), level=('input_size', 'data_type'), drop_level=False)
# #         print(subset)

#         # Filter out non-positive values for plotting
#         subset = subset[subset['Avg time/rank'] > 0]
        
#          # Append data for CSV, including "sample_sort" and data_type
#         for num_procs, avg_time in zip(subset.index.get_level_values('num_procs'), subset['Avg time/rank']):
#             export_data.append([num_procs, avg_time, "sample_sort", data_type])

#     # Export data to CSV file with data_type column
#     csv_path = f"figures/{title}_{input_size}_strong_scaling_data.csv"
#     export_df = pd.DataFrame(export_data, columns=['Number of Processes', 'Avg Time per Rank (seconds)', 'Algorithm', 'Data Type'])
#     export_df.to_csv(csv_path, index=False)
    
#     if pd.notna(input_size):
#         input_size_str = f"$2^{{{int(np.log2(input_size))}}}$"
#     else:
#         input_size_str = "Unknown Size"

#     plt.title(f"{title}: Strong Scaling (Input Size: {input_size_str})")
#     plt.xlabel('Number of Processes')
#     plt.ylabel('Avg Time per Rank (seconds)')
#     plt.xscale('log', base=2)
#     plt.yscale('log')
#     plt.legend()
#     plt.grid(True)
#     save_path = f"figures/{title}_{input_size_str}_strong_scaling.png"
#     plt.savefig(save_path, format='png', bbox_inches='tight')
#     plt.close()