In [None]:
# create_regression_dashboard.ipynb

import subprocess
import os
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from compare import ReferenceComparer, get_last_two_commits
import logging

# Setup logging
# basicConfig installs a root handler at INFO level so the progress messages
# emitted through `logger` below are visible when the notebook/script runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# Default number of commits to analyze. The main section reads the
# NUM_COMMITS environment variable and falls back to this value.
NUM_COMMITS = 10  # Number of commits to analyze (configurable via env var)
# Directory the dashboard HTML is written to. The path is relative, so it is
# resolved against the current working directory.
OUTPUT_DIR = Path("dashboard_output")
# Created as soon as this code runs; exist_ok=True makes re-runs a no-op.
OUTPUT_DIR.mkdir(exist_ok=True)

# Utility function to get commit history
def get_commit_history(num_commits):
    """Return the most recent commits of the git repository in the cwd.

    ``num_commits + 1`` log entries are requested so that ``num_commits``
    consecutive (previous, current) pairs can be formed from the result.

    Args:
        num_commits: Number of commit-to-commit comparisons wanted.

    Returns:
        A list of ``(commit_hash, commit_message)`` tuples in the order
        ``git log`` printed them. The list is empty when git cannot be
        started, exits with an error, or prints nothing.
    """
    try:
        result = subprocess.run(
            # The count is passed as its own argument instead of relying on
            # git to parse a single "-n <count>" token with an embedded space.
            ['git', 'log', '--format=%H %s', '-n', str(num_commits + 1)],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing/unrunnable git executable, which is not a
        # SubprocessError and would otherwise escape this handler.
        logger.error(f"Failed to get commit history: {e}")
        return []

    commits = []
    for line in result.stdout.split('\n'):
        # partition() never fails to unpack: a commit with an empty subject
        # yields an empty message instead of raising ValueError.
        commit_hash, _, commit_msg = line.strip().partition(" ")
        if commit_hash:  # skip blank lines (e.g. empty output, trailing newline)
            commits.append((commit_hash, commit_msg))
    return commits

# Function to compare consecutive commits and collect data
def collect_regression_data(commits):
    """Compare every commit with the entry after it and collect the results.

    Args:
        commits: Sequence of ``(hash, message)`` pairs. Each entry is treated
            as the "current" commit and the entry following it as the
            "previous" one, so N entries produce N - 1 comparisons.

    Returns:
        A dict with three keys:

        * ``"commits"``: one ``{"hash", "message", "prev_hash"}`` dict per
          comparison, in comparison order.
        * ``"files"``: maps a file name to a list of per-comparison records
          (``"commit"``, ``"different_keys"``, ``"identical_keys_diff_data"``,
          ``"added_keys"``, ``"deleted_keys"``, ``"diff_data"``).
        * ``"metrics"``: for ``"different_keys"`` and
          ``"identical_keys_diff_data"``, maps a file name to the list of
          values seen for that file.

        Per-file lists only receive an entry for comparisons in which the
        comparer reported that file, so they can be shorter than
        ``"commits"``.
    """
    data_changes = {
        "commits": [],
        "files": {},
        "metrics": {"different_keys": {}, "identical_keys_diff_data": {}}
    }
    per_file = data_changes["files"]
    different_keys_by_file = data_changes["metrics"]["different_keys"]
    changed_data_by_file = data_changes["metrics"]["identical_keys_diff_data"]

    # Pair each commit (ref2, current) with the one listed after it (ref1, previous).
    for (ref2_hash, ref2_msg), (ref1_hash, _ref1_msg) in zip(commits, commits[1:]):
        logger.info(f"Comparing {ref1_hash[:8]} -> {ref2_hash[:8]}")
        comparer = ReferenceComparer(ref1_hash=ref1_hash, ref2_hash=ref2_hash)
        comparer.setup()
        try:
            comparer.compare()

            # Store commit info
            data_changes["commits"].append({
                "hash": ref2_hash,
                "message": ref2_msg,
                "prev_hash": ref1_hash
            })

            # Aggregate changes per file
            for file_name, results in comparer.test_table_dict.items():
                per_file.setdefault(file_name, []).append({
                    "commit": ref2_hash,
                    "different_keys": results["different_keys"],
                    "identical_keys_diff_data": results["identical_keys_diff_data"],
                    "added_keys": results["added_keys"],
                    "deleted_keys": results["deleted_keys"],
                    "diff_data": results["identical_name_different_data_dfs"]
                })

                # Track metrics for plotting
                different_keys_by_file.setdefault(file_name, []).append(
                    results["different_keys"]
                )
                changed_data_by_file.setdefault(file_name, []).append(
                    results["identical_keys_diff_data"]
                )
        finally:
            # Always undo setup(), even when the comparison or the
            # aggregation above raises, so nothing it acquired is leaked.
            comparer.teardown()

    return data_changes

# Function to create the dashboard
def create_dashboard(data_changes):
    """Build the two-panel regression dashboard, save it as HTML, return it.

    The top panel plots each file's ``"different_keys"`` value per commit;
    the bottom panel plots each file's ``"identical_keys_diff_data"`` value.

    Args:
        data_changes: Dict in the shape produced by
            ``collect_regression_data``; the ``"commits"`` and ``"files"``
            entries are read here.

    Returns:
        The Plotly figure. It is also written to
        ``OUTPUT_DIR / "regression_dashboard.html"``.
    """
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=("Number of Different Keys Over Commits", "Keys with Same Name but Different Data Over Commits"),
        vertical_spacing=0.15
    )

    commit_hashes = [c["hash"][:8] for c in data_changes["commits"]]

    # One entry per subplot row: (row, record key, legend suffix, hover label).
    panels = (
        (1, "different_keys", "Different Keys", "Different Keys"),
        (2, "identical_keys_diff_data", "Same Name, Diff Data", "Same Name Diff Data"),
    )
    for row, record_key, legend_suffix, hover_label in panels:
        for file_name, records in data_changes["files"].items():
            fig.add_trace(
                go.Scatter(
                    # x comes from the commits recorded for THIS file, so a
                    # file missing from some comparison is still plotted
                    # against the right commits instead of being shifted
                    # onto the first few entries of the global list.
                    x=[record["commit"][:8] for record in records],
                    y=[record[record_key] for record in records],
                    mode="lines+markers",
                    name=f"{file_name} ({legend_suffix})",
                    hovertemplate=f"{file_name}<br>Commit: %{{x}}<br>{hover_label}: %{{y}}"
                ),
                row=row, col=1
            )

    # Update layout
    fig.update_layout(
        title_text="Regression Data Changes Across TARDIS Commits",
        height=800,
        width=1200,
        showlegend=True,
        hovermode="x unified"
    )
    # Applies to both subplots. The axis is forced to be categorical so short
    # hashes made only of digits are not auto-detected as numbers, and the
    # category order is pinned to the commit order so it does not depend on
    # which trace happens to be added first.
    fig.update_xaxes(
        title_text="Commit Hash",
        type="category",
        categoryorder="array",
        categoryarray=commit_hashes,
    )
    fig.update_yaxes(title_text="Number of Different Keys", row=1, col=1)
    fig.update_yaxes(title_text="Number of Keys with Changed Data", row=2, col=1)

    # Save the dashboard
    output_path = OUTPUT_DIR / "regression_dashboard.html"
    fig.write_html(output_path)
    logger.info(f"Dashboard saved to {output_path}")

    return fig

# Main execution
if __name__ == "__main__":
    # The NUM_COMMITS environment variable overrides the module default.
    num_commits = int(os.getenv("NUM_COMMITS", NUM_COMMITS))

    # Get commit history
    commits = get_commit_history(num_commits)

    # A comparison needs a (previous, current) pair, so two commits is the
    # actual minimum enforced here; the message states that minimum rather
    # than the number that was requested.
    if len(commits) < 2:
        raise ValueError(
            f"Not enough commits found. Required: at least 2 "
            f"(requested {num_commits + 1}), Found: {len(commits)}"
        )
    if len(commits) < num_commits + 1:
        # Shorter history than requested: still usable, but make it visible.
        logger.warning(
            f"Requested {num_commits + 1} commits but only {len(commits)} were found; "
            f"analyzing {len(commits) - 1} comparison(s)"
        )

    # Collect regression data
    data_changes = collect_regression_data(commits)

    # Create and display the dashboard
    dashboard_fig = create_dashboard(data_changes)
    dashboard_fig.show()