In [2]:
from cuFPMiner_bit import cuFPMiner_bit
from cuFPMiner_hash import cuFPMiner_hash

from matplotlib import pyplot as plt
import pandas as pd
import subprocess
import re
import os

In [3]:
def parse_output(output):
    """Pull the benchmark figures out of a miner's textual report.

    Each metric is located by its (case-sensitive) label in ``output``; a
    metric whose label is not found is simply left out of the result.

    Args:
        output: Text captured from the mining executable.

    Returns:
        dict: Any subset of ``time_to_read`` and ``runtime`` (floats, seconds),
        ``number_of_patterns`` (int), ``memory_usage_mb`` and
        ``peak_memory_usage_mb`` (ints, MB).
    """
    # (result key, pattern with one capture group, converter for that group)
    extractors = (
        ('time_to_read', r"Time to read:\s*([\d.]+)\s*seconds", float),
        ('runtime', r"Runtime:\s*([\d.]+)\s*seconds", float),
        ('number_of_patterns', r"Number of patterns:\s*(\d+)", int),
        ('memory_usage_mb', r"Memory usage:\s*(\d+)\s*MB", int),
        ('peak_memory_usage_mb', r"Peak memory usage:\s*(\d+)\s*MB", int),
    )

    found = {}
    for key, pattern, convert in extractors:
        hit = re.search(pattern, output)
        if hit is not None:
            found[key] = convert(hit.group(1))
    return found

def run_alg(alg_path, file_path, minsup, separator, num_cores):
    """Run an external mining executable and return the metrics it prints.

    The program is invoked as
    ``alg_path file_path minsup separator num_cores /dev/null`` and its
    standard output is handed to ``parse_output``.

    Args:
        alg_path: Path of the executable to launch.
        file_path: Dataset passed as the first program argument.
        minsup: Minimum support; converted to ``str`` for the command line.
        separator: Field separator passed through unchanged.
        num_cores: Core/thread count; converted to ``str`` for the command line.

    Returns:
        dict | None: The parsed metrics, or ``None`` when the process exits
        with a non-zero status or any other exception occurs (the error is
        printed, not raised).
    """
    # The trailing "/dev/null" argument discards the program's output file.
    argv = [alg_path, file_path, str(minsup), separator, str(num_cores), "/dev/null"]

    try:
        completed = subprocess.run(argv, capture_output=True, text=True, check=True)
        # Parsing stays inside the try so parse failures are reported the same way.
        return parse_output(completed.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error: Command failed with return code {e.returncode}")
        print(f"Stderr: {e.stderr}")
    except Exception as ex:
        print(f"Error: {ex}")
    return None

In [4]:
# Benchmark workload: each entry is [dataset path, list of minimum-support
# values to try]; run_workload unpacks an entry as (file_path, min_sups).
# Commented-out entries are datasets that are currently disabled.
work = [
        # ["../../datasets/transactional/Transactional_retail.csv", [100,90,80,70]],
        # ["../../datasets/transactional/Transactional_T10I4D100K.csv", [50, 25, 10, 5]],
        ["../../datasets/transactional/Transactional_pumsb.csv", [39000, 38000, 37000, 36000, 35000]],
        # ["../../datasets/transactional/Transactional_chess1.csv", [2000, 1900,1800,1700,1600]],
        # ["../../datasets/transactional/Transactional_kosarak.csv", [2000,3000,4000,5000,6000]],
        ]
# Field separator (a tab) handed to every miner.
sep = "\t"

# Directory that receives the per-dataset result CSVs and the generated plots.
output_dir = "../../results/frequent_patterns/"

In [5]:
def clean_filename(file_path):
    """Derive a short dataset name from a dataset path.

    The directory part is dropped, every occurrence of ``"Transactional_"``
    is removed, and the name is cut at its first ``"."``, so
    ``"a/Transactional_pumsb.csv"`` becomes ``"pumsb"``.
    """
    stem = os.path.basename(file_path).replace("Transactional_", "")
    # Everything from the first dot onwards is discarded.
    head, _, _ = stem.partition(".")
    return head

def _append_result(csv_path, record):
    """Append one result row to ``csv_path``; the header is written only for a new file."""
    pd.DataFrame([record]).to_csv(
        csv_path, mode='a', header=not os.path.exists(csv_path), index=False
    )


def run_workload(work, sep, output_dir, num_threads=16):
    """Benchmark every configured algorithm on every (dataset, min_sup) pair.

    Results are appended row by row to ``<output_dir>/<dataset name>.csv`` so
    an interrupted run can be resumed: combinations already present in that
    CSV (matched on file, min_sup and algorithm label) are skipped.

    Args:
        work: Iterable of ``[file_path, min_sups]`` entries.
        sep: Field separator passed to every miner.
        output_dir: Directory for the result CSVs; created if missing.
        num_threads: Thread count passed to the command-line miners and
            embedded in their algorithm label (``"fpgrowth 16 threads"`` for
            the default), so results for different counts stay distinct.

    Returns:
        None. One CSV row is written per completed run; a command-line run
        that yields no metrics is reported and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file_path, min_sups in work:
        cleaned_name = clean_filename(file_path)
        file_output_csv = os.path.join(output_dir, f"{cleaned_name}.csv")

        # Load existing results if the file already exists
        if os.path.exists(file_output_csv):
            df_existing = pd.read_csv(file_output_csv)
            processed_combinations = set(
                zip(df_existing["file"], df_existing["min_sup"], df_existing["algorithm"])
            )
        else:
            processed_combinations = set()

        for min_sup in min_sups:
            # Factories are invoked within this same iteration, so the
            # closure over file_path / min_sup is safe.
            algorithms = [
                ("cuFPMiner_bit_csv", lambda: cuFPMiner_bit(file_path, min_sup, sep, 'csv', "managed")),
                # ("cuFPMiner_hash_csv", lambda: cuFPMiner_hash(file_path, min_sup, sep, 'csv', "managed")),
                ("cuFPMiner_hash_shared_csv", lambda: cuFPMiner_hash(file_path, min_sup, sep, 'csv', "managed", True)),
            ]

            # Run Python-based algorithms
            for alg_name, alg_func in algorithms:
                if (file_path, min_sup, alg_name) in processed_combinations:
                    continue

                alg = alg_func()
                alg.mine()
                _append_result(file_output_csv, {
                    "algorithm": alg_name,
                    "file": file_path,
                    "min_sup": min_sup,
                    "runtime": alg.getRuntime(),
                    "patterns": len(alg.getPatterns()),
                    "memory": alg.getMemoryRSS(),
                    "time_to_read": alg.getTimeToRead(),
                })
                processed_combinations.add((file_path, min_sup, alg_name))

            # Run command-based algorithms
            for alg_name, command_template in [
                # ("apriori", "./apriori"),
                ("fpgrowth", "./fpgrowth"),
            ]:
                # The label written to the CSV is also the resume key; the
                # same string must be used for the check and for the
                # bookkeeping below, otherwise a finished run is repeated.
                label = f"{alg_name} {num_threads} threads"
                if (file_path, min_sup, label) in processed_combinations:
                    continue

                metrics = run_alg(command_template, file_path, min_sup, sep, num_threads)
                if not metrics:
                    print(f"No metrics for {label} on {file_path} (min_sup={min_sup}); skipping")
                    continue

                _append_result(file_output_csv, {
                    "algorithm": label,
                    "file": file_path,
                    "min_sup": min_sup,
                    "runtime": metrics.get("runtime", 0),
                    "patterns": metrics.get("number_of_patterns", 0),
                    "memory": metrics.get("peak_memory_usage_mb", 0),
                    "time_to_read": metrics.get("time_to_read", 0),
                })
                processed_combinations.add((file_path, min_sup, label))

        print(f"Finished processing {file_path}")

# Main workflow
# run_workload(work, sep, output_dir)


In [14]:
# Size in inches (width, height) of every generated figure.
fig_size = (5,5)

# (name, dash spec) pairs; a dash spec is (offset, (on, off, on, off, ...)).
# plot_results picks the spec at position i for the i-th algorithm, so every
# entry must be visually distinct.  'dotted' used to duplicate
# 'densely dotted' ((0, (1, 1))), giving two algorithms the same line style.
linestyle_tuple = [
     ('loosely dotted',        (0, (1, 10))),
     ('dotted',                (0, (1, 5))),
     ('densely dotted',        (0, (1, 1))),
     ('long dash with offset', (5, (10, 3))),
     ('loosely dashed',        (0, (5, 10))),
     ('dashed',                (0, (5, 5))),
     ('densely dashed',        (0, (5, 1))),

     ('loosely dashdotted',    (0, (3, 10, 1, 10))),
     ('dashdotted',            (0, (3, 5, 1, 5))),
     ('densely dashdotted',    (0, (3, 1, 1, 1))),

     ('dashdotdotted',         (0, (3, 5, 1, 5, 1, 5))),
     ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))),
     ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))]

# Marker symbols, likewise picked by algorithm position.
marker_shapes = ['o', 's', '^', 'D', 'v', 'p', '*', 'h', 'H', '+', 'x', 'X']

# Algorithm names are tidied for the legend: a "_csv" suffix is removed
# and any remaining underscores are replaced with spaces.



def plot_results(output_dir):
    """Plot runtime, pattern count and memory against minimum support.

    For every ``*.csv`` file in ``output_dir`` three SVG figures are written
    back into ``output_dir``, named ``<csv stem>_<metric>_vs_min_sup.svg``
    for the metrics ``runtime``, ``patterns`` and ``memory``.  Each algorithm
    found in the CSV becomes one line; algorithms are ordered alphabetically
    and styled by position using the module-level ``linestyle_tuple`` and
    ``marker_shapes`` (both reused cyclically if there are more algorithms
    than entries).  Figure size comes from the module-level ``fig_size``.

    Args:
        output_dir: Directory holding the result CSVs; also the destination
            of the generated plots.

    Returns:
        None. The path of every saved plot is printed.
    """
    metrics = ["runtime", "patterns", "memory"]

    for csv_file in os.listdir(output_dir):
        if not csv_file.endswith(".csv"):
            continue

        file_path = os.path.join(output_dir, csv_file)
        df = pd.read_csv(file_path)
        cleaned_name = os.path.splitext(csv_file)[0]

        # Legend-friendly names: drop the "_csv" suffix, underscores -> spaces.
        df["algorithm"] = df["algorithm"].apply(lambda x: x.replace("_csv", "").replace("_", " "))

        # NOTE(review): "memory" is plotted exactly as stored and labelled as
        # MB. The fpgrowth figure is parsed from a "... MB" line; confirm the
        # value recorded for the Python miners uses the same unit.

        # One (linestyle, color, marker) per algorithm, shared by all three
        # figures. Modulo indexing keeps this working when there are more
        # algorithms than predefined styles.
        sorted_methods = sorted(df["algorithm"].unique())
        styles = {
            alg: (
                linestyle_tuple[i % len(linestyle_tuple)][1],
                f"C{i}",
                marker_shapes[i % len(marker_shapes)],
            )
            for i, alg in enumerate(sorted_methods)
        }

        for metric in metrics:
            fig = plt.figure(figsize=fig_size)
            try:
                for alg in sorted_methods:
                    linestyle, color, marker = styles[alg]
                    # Rows are appended in run order, which need not be
                    # monotonic in min_sup; sort so the line does not zig-zag.
                    alg_df = df[df["algorithm"] == alg].sort_values("min_sup")
                    plt.plot(alg_df["min_sup"], alg_df[metric], label=alg,
                             marker=marker, linestyle=linestyle, color=color)

                plt.title(f"{metric.capitalize()} vs Minimum Support ({cleaned_name})")
                plt.xlabel("Minimum Support")
                if metric == "memory":
                    plt.ylabel("Peak Memory Usage(MB)")
                elif metric == "runtime":
                    plt.ylabel(f"{metric.capitalize()} (s)")
                else:
                    plt.ylabel(f"{metric.capitalize()}")
                plt.legend()
                plt.grid()

                # tilt x-axis labels
                plt.xticks(rotation=45)

                plot_path = os.path.join(output_dir, f"{cleaned_name}_{metric}_vs_min_sup.svg")
                plt.savefig(plot_path, format="svg", transparent=True)
            finally:
                # Always release the figure, even if plotting or saving failed.
                plt.close(fig)

            print(f"Saved plot {plot_path}")

# Generate the per-metric SVG plots for every results CSV found in output_dir.
plot_results(output_dir)


Saved plot ../../results/frequent_patterns/pumsb_runtime_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/pumsb_patterns_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/pumsb_memory_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/kosarak_runtime_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/kosarak_patterns_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/kosarak_memory_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/chess_runtime_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/chess_patterns_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/chess_memory_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/retail_runtime_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/retail_patterns_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/retail_memory_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/T10I4D100K_runtime_vs_min_sup.svg
Saved plot ../../results/frequent_patterns/T10I4D1