In [1]:
from algs.cuFPMiner_bit import cuFPMiner_bit
from algs.cuFPMiner_hash import cuFPMiner_hash
from algs.csv_to_parquet import csv_to_parquet
from matplotlib import pyplot as plt
import subprocess
import pandas as pd
import re
import os

ModuleNotFoundError: No module named 'algs'

In [2]:
def parse_output(output):
    """Extract benchmark metrics from the text printed by an external miner.

    The text is searched for these fragments (each is optional)::

        Time to read: <decimal> seconds
        Runtime: <decimal> seconds
        Number of patterns: <integer>
        Memory usage: <integer> MB

    Args:
        output: Text to scan, e.g. the captured stdout of the command.
            ``None`` or an empty string yields an empty result.

    Returns:
        dict: Only the metrics that were found. Possible keys are
        ``'time_to_read'`` and ``'runtime'`` (float) and
        ``'number_of_patterns'`` and ``'memory_usage_mb'`` (int).
    """
    if not output:
        return {}

    # A well-formed decimal ("12", "12.5", "12.", ".5"). A loose character
    # class such as [\d.]+ would also capture "..." or "1.2.3" and make
    # float() raise; malformed values are simply treated as "not reported".
    decimal = r"(\d+(?:\.\d*)?|\.\d+)"

    # (result key, pattern with one capture group, converter)
    fields = (
        ("time_to_read", r"Time to read:\s*" + decimal + r"\s*seconds", float),
        ("runtime", r"Runtime:\s*" + decimal + r"\s*seconds", float),
        ("number_of_patterns", r"Number of patterns:\s*(\d+)", int),
        ("memory_usage_mb", r"Memory usage:\s*(\d+)\s*MB", int),
    )

    metrics = {}
    for key, pattern, convert in fields:
        found = re.search(pattern, output)
        if found:
            metrics[key] = convert(found.group(1))
    return metrics

def run_alg(alg_path, file_path, minsup, separator, num_cores):
    """Run an external mining executable and return its parsed metrics.

    The executable is invoked as
    ``alg_path file_path minsup separator num_cores /dev/null`` and its
    stdout is handed to ``parse_output``.

    Args:
        alg_path: Path of the executable to run.
        file_path: Dataset path passed as the first argument.
        minsup: Minimum support; converted to a string for the command line.
        separator: Field separator passed through unchanged.
        num_cores: Core/thread count; converted to a string.

    Returns:
        The dict produced by ``parse_output``, or ``None`` if the command
        exits non-zero or any other exception occurs (the error is printed).
    """
    argv = [alg_path, file_path, str(minsup), separator, str(num_cores)]
    # Last argument is /dev/null -- presumably the miner's output path, so the
    # mined patterns themselves are discarded (confirm against the binary).
    argv.append("/dev/null")

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        completed = subprocess.run(argv, capture_output=True, text=True, check=True)
        # Parsing stays inside the try so its failures are reported the same way.
        return parse_output(completed.stdout)
    except subprocess.CalledProcessError as e:
        print(f"Error: Command failed with return code {e.returncode}")
        print(f"Stderr: {e.stderr}")
    except Exception as ex:
        print(f"Error: {ex}")

    # Reached only after one of the handlers above has reported a failure.
    return None

In [9]:
# Benchmark workload: each entry is [dataset path, minimum-support values to
# sweep]. Datasets that are currently switched off stay commented out.
work = [
    # ["/home/tarun/cuPAMI/datasets/Transactional_retail.csv", [100,90,80,70]],
    # ["/home/tarun/cuPAMI/datasets/Transactional_T10I4D100K.csv", [50, 25, 10, 5]],
    # ["/home/tarun/cuPAMI/datasets/Transactional_pumsb.csv", [41000, 40000,39000, 38000]],
    [
        "/home/tarun/cuPAMI/datasets/Transactional_chess1.csv",
        [2000, 1900, 1800, 1700, 1600],
    ],
]

# Field separator handed to every algorithm and to the CSV -> parquet step.
sep = "\t"

In [11]:
def clean_filename(file_path):
    """Derive a short dataset label from a file path.

    Keeps only the last path component, removes every occurrence of
    "Transactional_" from it, and drops everything from the first "." onward.
    """
    label = os.path.basename(file_path)
    label = label.replace("Transactional_", "")
    # partition() cuts at the first dot, so "a.tar.gz" becomes "a".
    stem, _dot, _rest = label.partition(".")
    return stem

def _append_result_row(csv_path, row):
    """Append one result dict to ``csv_path``, writing the header only for a new file."""
    pd.DataFrame([row]).to_csv(
        csv_path, mode="a", header=not os.path.exists(csv_path), index=False
    )


def run_workload(work, sep, output_dir, num_threads=16):
    """Benchmark every (dataset, minimum support, algorithm) combination.

    For each dataset a CSV named after ``clean_filename(file_path)`` is kept in
    ``output_dir``. Each finished run is appended to it immediately, and
    combinations already present in that CSV are skipped, so an interrupted
    workload can be resumed by calling the function again.

    Args:
        work: Iterable of ``[file_path, min_sups]`` pairs, where ``min_sups``
            is an iterable of minimum-support values.
        sep: Field separator passed to every algorithm and to
            ``csv_to_parquet``.
        output_dir: Directory for the per-dataset result CSVs; created if
            missing.
        num_threads: Thread count passed to the command-line algorithms and
            recorded in their algorithm label (default 16).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Command-line miners: (base name, executable path).
    command_algs = [
        # ("apriori", "/home/tarun/cuPAMI/algs/apriori"),
        ("fpgrowth", "/home/tarun/cuPAMI/algs/fpgrowth"),
    ]

    for file_path, min_sups in work:
        cleaned_name = clean_filename(file_path)
        file_output_csv = os.path.join(output_dir, f"{cleaned_name}.csv")

        # Load existing results so already-finished combinations are skipped.
        if os.path.exists(file_output_csv):
            df_existing = pd.read_csv(file_output_csv)
            processed_combinations = set(
                zip(df_existing["file"], df_existing["min_sup"], df_existing["algorithm"])
            )
        else:
            processed_combinations = set()

        # Swap only the extension; str.replace would also rewrite a ".csv"
        # appearing elsewhere in the path and leave extension-less paths
        # pointing at the source file itself.
        parquet_file = os.path.splitext(file_path)[0] + ".parquet"
        csv_to_parquet(file_path, parquet_file, sep)

        # In-process miners: (label, constructor, input path, format, extra args).
        python_algs = [
            ("cuFPMiner_bit_csv", cuFPMiner_bit, file_path, "csv", ()),
            ("cuFPMiner_hash_csv", cuFPMiner_hash, file_path, "csv", ()),
            ("cuFPMiner_hash_shared_csv", cuFPMiner_hash, file_path, "csv", (True,)),
            ("cuFPMiner_bit_parquet", cuFPMiner_bit, parquet_file, "parquet", ()),
            ("cuFPMiner_hash_parquet", cuFPMiner_hash, parquet_file, "parquet", ()),
            ("cuFPMiner_hash_shared_parquet", cuFPMiner_hash, parquet_file, "parquet", (True,)),
        ]

        for min_sup in min_sups:
            # Run Python-based algorithms
            for alg_name, factory, input_path, fmt, extra in python_algs:
                key = (file_path, min_sup, alg_name)
                if key in processed_combinations:
                    continue

                alg = factory(input_path, min_sup, sep, fmt, "managed", *extra)
                alg.mine()
                _append_result_row(file_output_csv, {
                    "algorithm": alg_name,
                    "file": file_path,
                    "min_sup": min_sup,
                    "runtime": alg.getRuntime(),
                    "patterns": len(alg.getPatterns()),
                    # NOTE(review): the unit of getMemoryRSS() is not visible
                    # here, while command-line results report MB -- confirm
                    # the "memory" column is comparable across algorithms.
                    "memory": alg.getMemoryRSS(),
                    "time_to_read": alg.getTimeToRead(),
                })
                processed_combinations.add(key)

            # Run command-based algorithms
            for alg_name, executable in command_algs:
                # The resume key must be the label that is actually written to
                # the CSV; checking the bare name never matched saved rows, so
                # every re-run repeated the command and appended duplicates.
                recorded_name = f"{alg_name} {num_threads} threads"
                key = (file_path, min_sup, recorded_name)
                if key in processed_combinations:
                    continue

                metrics = run_alg(executable, file_path, min_sup, sep, num_threads)
                if not metrics:
                    # Failed run or nothing parseable: record nothing so the
                    # combination is retried on the next invocation.
                    continue

                _append_result_row(file_output_csv, {
                    "algorithm": recorded_name,
                    "file": file_path,
                    "min_sup": min_sup,
                    "runtime": metrics.get("runtime", 0),
                    "patterns": metrics.get("number_of_patterns", 0),
                    "memory": metrics.get("memory_usage_mb", 0),
                    "time_to_read": metrics.get("time_to_read", 0),
                })
                processed_combinations.add(key)

        print(f"Finished processing {file_path}")

def plot_results(output_dir):
    """Plot every metric against minimum support for each result CSV.

    Every ``*.csv`` file in ``output_dir`` is read and, for each of the
    metrics ``runtime``, ``patterns``, ``memory`` and ``time_to_read``, one
    figure with a line per algorithm is saved next to it as
    ``<csv stem>_<metric>_vs_min_sup.svg``.

    Args:
        output_dir: Directory holding the result CSVs; plots are written to
            the same directory.

    Returns:
        None. The path of each saved plot is printed.
    """
    metrics = ["runtime", "patterns", "memory", "time_to_read"]

    # Sorted so files are processed in the same order on every run.
    for csv_file in sorted(os.listdir(output_dir)):
        if not csv_file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(output_dir, csv_file))
        cleaned_name = os.path.splitext(csv_file)[0]

        for metric in metrics:
            fig, ax = plt.subplots(figsize=(10, 6))
            for alg in df["algorithm"].unique():
                # Rows are stored in the order runs finished, which is not
                # monotone in min_sup once results are appended across
                # several runs; sort so each line is drawn left to right.
                alg_df = df[df["algorithm"] == alg].sort_values("min_sup", kind="stable")
                ax.plot(alg_df["min_sup"], alg_df[metric], label=alg, marker="o")

            ax.set_title(f"{metric.capitalize()} vs Minimum Support ({cleaned_name})")
            ax.set_xlabel("Minimum Support")
            ax.set_ylabel(metric.capitalize())
            ax.legend()
            ax.grid()

            plot_path = os.path.join(output_dir, f"{cleaned_name}_{metric}_vs_min_sup.svg")
            fig.savefig(plot_path, format="svg", transparent=True)
            # Close explicitly: one figure per metric per file adds up.
            plt.close(fig)

            print(f"Saved plot {plot_path}")

# Main workflow: benchmark every dataset listed in `work`, appending each
# result to a per-dataset CSV under `output_dir`, then save one SVG plot per
# metric for every CSV found in that directory.
# `output_dir` is a relative path, so it resolves against the kernel's
# current working directory.
output_dir = "results"
run_workload(work, sep, output_dir)
plot_results(output_dir)


  col_0 col_1 col_2 col_3 col_4 col_5 col_6 col_7 col_8 col_9  ... col_27  \
0     1     3     5     7     9    11    13    15    17    19  ...     56   
1     1     3     5     7     9    12    13    15    17    19  ...     56   
2     1     3     5     7     9    12    13    16    17    19  ...     56   
3     1     3     5     7     9    11    13    15    17    20  ...     56   
4     1     3     5     7     9    11    13    15    17    19  ...     56   

  col_28 col_29 col_30 col_31 col_32 col_33 col_34 col_35 col_36  
0     58     60     62     64     66     68     70     72     74  
1     58     60     62     64     66     68     70     72     74  
2     58     60     62     64     66     68     70     72     74  
3     58     60     62     64     66     68     70     72     74  
4     58     60     62     64     66     68     70     72     74  

[5 rows x 37 columns]
Finished processing /home/tarun/cuPAMI/datasets/Transactional_chess1.csv
Saved plot results/pumsb_runtime_vs_min