In [None]:
import glob
import json
import os

import matplotlib.pyplot as plt
from pandas import DataFrame, read_csv


In [None]:
# Load the pre-split PAMAP2 feature tables (X_*) and label tables (y_*).
X_train = read_csv("../data/PAMAP2/x_train_data.csv")
y_train = read_csv("../data/PAMAP2/y_train_data.csv")
X_test = read_csv("../data/PAMAP2/x_test_data.csv")
y_test = read_csv("../data/PAMAP2/y_test_data.csv")

# Distinct test labels rendered as strings, in sorted order, with the first
# (smallest) one dropped. Each index entry is indexed with [0] to pull the
# label out of the per-row key produced by value_counts().
x_label = [
    f"{key[0]}"
    for key in y_test.value_counts().index.sort_values().to_list()[1:]
]

# Attach the labels to the feature tables as an "activity" column.
X_train["activity"] = y_train  # First 80% of the data
X_test["activity"] = y_test  # Last 20% of the data

In [None]:
def find_log_files(reports_dir: str = "../reports"):
    """
    Return the sorted paths of every ``*.log`` file directly inside ``reports_dir``.

    Args:
        reports_dir: Directory to scan (not searched recursively).

    Returns:
        A sorted list of matching paths, each prefixed with ``reports_dir``.

    Raises:
        FileNotFoundError: If ``reports_dir`` does not exist, or if it exists
            but contains no ``.log`` files.
    """
    if os.path.exists(reports_dir):
        matches = sorted(glob.glob(os.path.join(reports_dir, "*.log")))
        if matches:
            return matches
        raise FileNotFoundError(f"No .log files found in {reports_dir}")

    raise FileNotFoundError(f"Reports directory not found: {reports_dir}")


def process_file_records(log_file_path: str):
    """
    Parse a log file into a list of records, one per non-blank line.

    The first line of the file is always discarded. Every remaining non-blank
    line is normalised (single quotes replaced by double quotes, then
    lower-cased) and decoded as JSON. Lines whose normalised text starts with
    ``{"event`` contribute the value stored under their ``"event"`` key; all
    other lines contribute the decoded object itself.

    Note that the normalisation lower-cases keys *and* string values, and
    will corrupt any string value that contains an apostrophe.

    Args:
        log_file_path: Path of the log file to read (decoded as UTF-8).

    Returns:
        The list of decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON after
            normalisation. The 1-based file line number is printed first.
        ValueError: If the file yields no records at all.
    """
    print(f"üìÅ Reading log file: {log_file_path}...")
    with open(log_file_path, "r", encoding="utf-8") as file:
        raw_lines = file.readlines()

    # Skip the first line and pair each remaining line with its real 1-based
    # position in the file so decode errors point at the right place.
    numbered_lines = [
        (line_number, text.strip())
        for line_number, text in enumerate(raw_lines[1:], start=2)
    ]
    # Blank lines (e.g. a trailing newline) carry no record; skipping them
    # avoids aborting the whole file on an empty string.
    numbered_lines = [(number, text) for number, text in numbered_lines if text]
    print(
        f"‚úÖ Extracted {len(numbered_lines)} even lines\nüîß Parsing JSON lines..."
    )

    records = []
    for line_number, line in numbered_lines:
        normalized = line.replace("'", '"').lower()
        try:
            data = json.loads(normalized)
        except json.JSONDecodeError as e:
            print(f"‚ö†Ô∏è  JSON decode error on line {line_number}: {e}")
            raise

        # Test the normalised text, not the raw line, so wrappers written with
        # single quotes or different casing are unwrapped as well.
        if normalized.startswith('{"event'):
            records.append(data["event"])
        else:
            records.append(data)

    if not records:
        raise ValueError("No valid JSON records found!")

    return records


def group_data(records: list, group_size: int = 4):
    """
    Collapse consecutive records into fixed-size groups.

    The records are loaded into a DataFrame and split, in their original
    order, into runs of ``group_size`` rows (the last run may be shorter).
    Each run becomes one output row.

    Args:
        records: Records to group; each must provide ``"target"`` and
            ``"datetime"`` entries.
        group_size: Number of consecutive rows per group. Defaults to 4.

    Returns:
        A DataFrame with one row per group and the columns ``target`` (mean
        of the group's targets), ``datetime`` (value from the group's last
        row) and ``group_size`` (number of rows actually in the group).
        Rows keep the input order; nothing is sorted.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size}")

    df = DataFrame(records)
    print(
        f"‚úÖ Created dataframe with {len(df)} records\n"
        f"üìä Grouping every {group_size} rows..."
    )

    summaries = []
    for start in range(0, len(df), group_size):
        group = df.iloc[start : start + group_size]
        summaries.append(
            {
                "target": group["target"].mean(),
                "datetime": group["datetime"].iloc[-1],
                "group_size": len(group),
            }
        )
    grouped_df = DataFrame(summaries).reset_index(drop=True)

    # The previous message claimed the groups were sorted by target, but no
    # sort is performed and process_hyperparams_log slices them by position.
    print(f"‚úÖ Created {len(grouped_df)} groups")
    return grouped_df

In [None]:
def _summarize_batch(label, batch: DataFrame) -> dict:
    """Build one summary row (label, size, max/mean/min target) for a batch."""
    targets = batch["target"]
    return {
        "batch_num": label,
        "size": len(batch),
        "max_target": targets.max(),
        "mean_target": targets.mean(),
        "min_target": targets.min(),
    }


def process_hyperparams_log(grouped_df: DataFrame, activities=None):
    """
    Split grouped tuning results into labelled batches and summarise each one.

    The first 100 rows of ``grouped_df`` form the first batch. The rows after
    that are cut into consecutive batches of 20 (the last may be shorter), up
    to 12 such batches. Every batch is summarised by its size and the
    max / mean / min of its ``target`` column.

    NOTE(review): the first batch and the first 20-row batch are both
    labelled ``activities[0]``, so that label appears twice in the result.
    This mirrors the existing behaviour; confirm whether it is intended.

    Args:
        grouped_df: Table with a ``target`` column, in the order the batches
            should be cut.
        activities: Position-indexable sequence (list / ndarray) of labels
            used as ``batch_num``. When omitted, the unique values of the
            notebook-level ``X_train["activity"]`` are used.

    Returns:
        ``(grouped_df, batches_df)`` where ``grouped_df`` is the input object
        and ``batches_df`` has the columns ``batch_num``, ``size``,
        ``max_target``, ``mean_target`` and ``min_target``.

    Raises:
        ValueError: If there are no activity labels at all.
    """
    first_batch_size = 100
    batch_size = 20
    max_followup_batches = 12

    print("üì¶ Creating batches...")
    if activities is None:
        # Backward-compatible default: labels come from the notebook global.
        activities = X_train["activity"].unique()
    if len(activities) == 0:
        raise ValueError("No activity labels available to name the batches")

    # First batch: first 100 lines
    batches = [_summarize_batch(activities[0], grouped_df.iloc[:first_batch_size])]
    remaining_df = grouped_df.iloc[first_batch_size:]

    # Remaining batches: 20 lines each
    batch_num = 0
    for start in range(0, len(remaining_df), batch_size):
        if batch_num >= len(activities):
            # Fewer labels than batches: stop instead of raising IndexError.
            print(
                f"‚ö†Ô∏è  Warning: Only {len(activities)} activity labels available; "
                f"stopping after {batch_num} follow-up batches"
            )
            break

        batch = remaining_df.iloc[start : start + batch_size]
        batches.append(_summarize_batch(activities[batch_num], batch))
        batch_num += 1

        if batch_num >= max_followup_batches:
            print(f"‚ö†Ô∏è  Warning: More than 12 batches created: {len(batches)}")
            break

    batches_df = DataFrame(batches)
    print(f"‚úÖ Created {len(batches_df)} batches")
    print("\nüìã Batch Summary:\n", batches_df.to_string(index=False))

    return grouped_df, batches_df


def _draw_max_target_bars(ax, batches_df: DataFrame, filename: str) -> None:
    """Draw the per-batch maximum target as an annotated bar chart on ``ax``."""
    bars = ax.bar(
        batches_df["batch_num"],
        batches_df["max_target"],
        color="steelblue",
        alpha=0.7,
        edgecolor="navy",
        linewidth=1,
    )
    ax.set_title(
        f"F1 M√°ximo por Atividade - {filename}",
        fontsize=14,
        fontweight="bold",
        pad=10,
    )
    ax.set_xlabel("Atividade de Teste", fontsize=10, fontweight="bold")
    ax.set_ylabel("F1 M√°ximo", fontsize=10, fontweight="bold")

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 0.0001,
            f"{height:.4f}",
            ha="center",
            va="bottom",
            fontsize=8,
        )

    ax.grid(axis="y", alpha=0.3, linestyle="--")
    ax.set_xticks(batches_df["batch_num"])

    # Add batch size information just above the bottom edge of the axes.
    y_min, y_max = ax.get_ylim()
    label_y = y_min + (y_max - y_min) * 0.02
    for batch_num, size in zip(batches_df["batch_num"], batches_df["size"]):
        ax.text(
            batch_num,
            label_y,
            f"n={size}",
            ha="center",
            va="bottom",
            fontsize=7,
            alpha=0.7,
        )


def _draw_target_distribution(ax, batches_df: DataFrame, filename: str) -> None:
    """Draw the per-batch max / mean / min target as three line series on ``ax``."""
    x = batches_df["batch_num"]
    series = (
        ("max_target", "o-", "Max Target"),
        ("mean_target", "s-", "Mean Target"),
        ("min_target", "^-", "Min Target"),
    )
    for column, fmt, label in series:
        ax.plot(x, batches_df[column], fmt, label=label, linewidth=2, markersize=4)

    ax.set_title(
        f"Distribui√ß√£o Target Score - {filename}",
        fontsize=14,
        fontweight="bold",
        pad=10,
    )
    ax.set_xlabel("Batch Number", fontsize=10, fontweight="bold")
    ax.set_ylabel("Target Score", fontsize=10, fontweight="bold")
    ax.legend(fontsize=8)
    ax.grid(alpha=0.3, linestyle="--")
    ax.set_xticks(batches_df["batch_num"])


def plot_all_batch_results(all_results: dict):
    """
    Plot the results for all log files using subplots.

    Two figures are created, each with one panel per log file laid out in at
    most two columns: a bar chart of the maximum target per batch, and a line
    chart of the max / mean / min target per batch. Panels left over in the
    grid are hidden. Both figures are shown with ``plt.show()``.

    Args:
        all_results: Mapping of log file path to a ``(grouped_df, batches_df)``
            pair. Only ``batches_df`` is plotted; it must provide the columns
            ``batch_num``, ``size``, ``max_target``, ``mean_target`` and
            ``min_target``.

    Returns:
        None. Prints a message and returns early when ``all_results`` is empty.
    """
    print("\nüìà Creating visualizations for all log files...")

    n_files = len(all_results)
    if n_files == 0:
        print("‚ùå No log files to plot")
        return

    # Calculate subplot layout
    n_cols = min(2, n_files)  # Max 2 columns
    n_rows = (n_files + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a
    # uniform 1-D sequence for every layout. The previous special-casing
    # wrapped the 1x2 Axes array in a list, which broke the two-file case.
    fig1, grid1 = plt.subplots(
        n_rows, n_cols, figsize=(12 * n_cols, 8 * n_rows), squeeze=False
    )
    axes1 = grid1.flatten()

    fig2, grid2 = plt.subplots(
        n_rows, n_cols, figsize=(12 * n_cols, 6 * n_rows), squeeze=False
    )
    axes2 = grid2.flatten()

    for idx, (log_file, (_, batches_df)) in enumerate(all_results.items()):
        filename = os.path.basename(log_file).replace(".log", "")
        _draw_max_target_bars(axes1[idx], batches_df, filename)
        _draw_target_distribution(axes2[idx], batches_df, filename)

    # Hide unused subplots
    for idx in range(n_files, len(axes1)):
        axes1[idx].set_visible(False)
        axes2[idx].set_visible(False)

    fig1.suptitle(
        "F1 M√°ximo por Atividade - Todos os Arquivos de Log",
        fontsize=16,
        fontweight="bold",
    )
    fig2.suptitle(
        "Distribui√ß√£o Target Score - Todos os Arquivos de Log",
        fontsize=16,
        fontweight="bold",
    )

    fig1.tight_layout()
    fig2.tight_layout()
    plt.show()

In [None]:
# Driver cell: discover the log files, run the parse -> group -> batch
# pipeline on each one, print per-file statistics, then plot and summarise
# every file that was processed successfully.
try:
    print("üîç Finding log files in reports directory...")
    log_files = find_log_files("../reports")
    print(
        f"üìã Found {len(log_files)} log files: {[os.path.basename(f) for f in log_files]}"
    )

    # Maps log file path -> (grouped_df, batches_df) for successful files only.
    all_results = {}

    for log_file in log_files:
        try:
            print(f"\n{'=' * 50}")
            print(f"Processing: {os.path.basename(log_file)}")
            print(f"{'=' * 50}")

            # Pipeline: raw records -> fixed-size groups -> labelled batches.
            grouped_df, batches_df = process_hyperparams_log(
                group_data(process_file_records(log_file))
            )
            all_results[log_file] = (grouped_df, batches_df)

            print(f"\nüìä Summary Statistics for {os.path.basename(log_file)}:")
            print(f"Best target score: {grouped_df['target'].max():.6f}")
            print(f"Worst target score: {grouped_df['target'].min():.6f}")
            print(f"Average target score: {grouped_df['target'].mean():.6f}")

        except Exception as e:
            # Best effort: a failing file is reported and skipped so the
            # remaining files are still processed. Only the message is
            # printed; the traceback is not shown.
            print(f"‚ùå Error processing {log_file}: {e}")
            continue

    if all_results:
        plot_all_batch_results(all_results)

        print(f"\n{'=' * 50}")
        print("üìä OVERALL SUMMARY")
        print(f"{'=' * 50}")
        # One line per successfully processed file, computed on the grouped rows.
        for log_file, (grouped_df, _) in all_results.items():
            filename = os.path.basename(log_file)
            print(
                f"{filename}: Best={grouped_df['target'].max():.6f}, "
                f"Avg={grouped_df['target'].mean():.6f}, "
                f"Worst={grouped_df['target'].min():.6f}"
            )
    else:
        print("‚ùå No log files were successfully processed")

except FileNotFoundError as e:
    # Raised by find_log_files when the directory or its .log files are missing.
    print(f"‚ùå {e}")
except Exception as e:
    # Any other failure is reduced to a one-line message.
    print(f"‚ùå Unexpected error: {e}")