In [None]:
# ==============================================================================
#           Greedy-Req (Binary-BV) Baseline - Statistical Analysis
# ------------------------------------------------------------------------------
# This script automates the execution of a greedy requirement selection
# algorithm that maximizes the Business Value of fully satisfied requirements.
# It runs the selection process 30 times for each budget level to gather
# robust statistical data.
# ==============================================================================

# === Core Imports and Setup ===
import pandas as pd
import numpy as np
import random
from google.colab import files
import matplotlib.pyplot as plt
from io import BytesIO
from openpyxl import Workbook
from openpyxl.drawing.image import Image as OpenpyxlImage
import warnings
warnings.filterwarnings('ignore')

# --- Configuration for the Experiment ---
# Budget levels, expressed as a percentage of the total execution time of all
# tests: 5, 10, ..., 95.
BUDGET_PERCENTAGES = list(range(5, 100, 5))
# Number of result rows recorded for every budget level.
NUM_RUNS_PER_BUDGET = 30

# NOTE: a stray markdown code fence used to trail this call and made the line
# a syntax error; it has been removed.
print("Setup complete. Ready to run the Greedy-Req baseline.")

In [None]:
# Cell 2: Data Loading and Algorithm Refactoring
# ======================================================
# Cell 2: Load Data and Refactor Algorithm into a Function
# ======================================================

print("Please upload the 'mapped_dataset' file you wish to analyze.")
uploaded = files.upload()

if not uploaded:
    print("\nNo file uploaded. Aborting.")
else:
    # Only the first key of the upload result is used as the workbook name.
    filename = next(iter(uploaded))
    print(f"\nSuccessfully uploaded '{filename}'")

    # --- Read the workbook once and derive three de-duplicated views ---
    df = pd.read_excel(filename)
    tests_df = df.loc[:, ["tc_id", "tc_executiontime"]].drop_duplicates()
    reqs_df = df.loc[:, ["us_id", "us_businessvalue"]].drop_duplicates()
    trace_df = df.loc[:, ["us_id", "tc_id"]].drop_duplicates()

    # Requirement id -> set of linked test ids. Any requirement id missing
    # from the grouped result gets an empty set so later lookups never fail.
    tests_of = trace_df.groupby("us_id")["tc_id"].apply(set).to_dict()
    for rid in reqs_df["us_id"]:
        if rid not in tests_of:
            tests_of[rid] = set()

    # Lookup tables handed to the selection routine.
    # NOTE(review): if one tc_id appears with several execution times, the
    # dict keeps the last one while total_exec_time below sums all of them --
    # confirm the dataset never does that.
    data_maps = {
        'test_exec_time': dict(zip(tests_df["tc_id"], tests_df["tc_executiontime"])),
        'req_bv': dict(zip(reqs_df["us_id"], reqs_df["us_businessvalue"])),
        'reqs_df': reqs_df,
        'tests_of': tests_of,
    }

    # Reference value against which the percentage budgets are scaled.
    total_exec_time = float(tests_df["tc_executiontime"].sum())
    print(f"Data prepared. Total possible execution time: {total_exec_time:.2f}")

    # --- Refactored Greedy-Req Algorithm ---
    def run_greedy_req(budget_minutes, maps):
        """Run the Greedy-Req (Binary-BV) selection for a given time budget.

        On every pass the not-yet-satisfied requirement with the highest
        ``business value / additional execution time`` ratio that still fits
        in the remaining budget is chosen and all of its missing tests are
        selected. The loop stops when no remaining requirement fits.

        Args:
            budget_minutes: Maximum total execution time that may be spent.
            maps: Dict with the keys
                ``'test_exec_time'`` (test id -> execution time),
                ``'req_bv'`` (requirement id -> business value),
                ``'tests_of'`` (requirement id -> set of test ids) and
                ``'reqs_df'`` (DataFrame whose ``us_id`` column lists the
                requirement ids in evaluation order).

        Returns:
            Tuple ``(total_final_bv, num_reqs_final, num_tests_final,
            req_coverage_percent)``: summed business value of the satisfied
            requirements, how many requirements are satisfied, how many tests
            were selected, and the satisfied share of distinct requirement
            ids in percent (0.0 when there are none).

        Notes:
            * A requirement whose outstanding tests cost nothing (including a
              requirement with no tests at all) is satisfied for free.
            * Ties on the ratio are won by the requirement that appears first
              in ``reqs_df``, because the comparison is strict.
        """
        test_exec_time = maps['test_exec_time']
        req_bv = maps['req_bv']
        tests_of = maps['tests_of']
        reqs_df = maps['reqs_df']

        # Materialise the ids once instead of re-iterating the Series on
        # every pass of the selection loop.
        req_ids = list(reqs_df["us_id"])

        selected_tests = set()
        time_used = 0.0
        satisfied_reqs = set()

        while True:
            remaining = budget_minutes - time_used
            best_req, best_ratio, best_missing, best_dt = None, -1.0, None, None

            for rid in req_ids:
                if rid in satisfied_reqs:
                    continue

                missing = tests_of[rid] - selected_tests
                dt = sum(test_exec_time[t] for t in missing)

                if dt == 0:
                    # Nothing left to pay for. Zero-cost tests are still part
                    # of the selection, so record them; otherwise the
                    # requirement would count as satisfied while
                    # num_tests_final silently under-counts.
                    selected_tests |= missing
                    satisfied_reqs.add(rid)
                    continue
                if dt > remaining:
                    continue

                ratio = req_bv[rid] / dt
                if ratio > best_ratio:
                    best_req, best_ratio, best_missing, best_dt = rid, ratio, missing, dt

            if best_req is None:
                break

            selected_tests |= best_missing
            time_used += best_dt
            satisfied_reqs.add(best_req)

        # Calculate final metrics
        total_final_bv = float(sum(req_bv[r] for r in satisfied_reqs))
        num_reqs_final = len(satisfied_reqs)
        num_tests_final = len(selected_tests)
        total_reqs = int(reqs_df["us_id"].nunique())
        req_coverage_percent = (num_reqs_final / total_reqs) * 100.0 if total_reqs > 0 else 0.0

        return total_final_bv, num_reqs_final, num_tests_final, req_coverage_percent

In [None]:
# ======================================================
# Cell 3: Run Experiment and Generate Report
# ======================================================

# Runs only when Cell 2 has loaded a dataset (it defines `df`, `data_maps`,
# `total_exec_time`, `filename` and `run_greedy_req`).
if 'df' in locals():
    all_run_data = []
    print(f"\n--- Starting Statistical Run ({NUM_RUNS_PER_BUDGET} runs per budget) ---")

    for pct in BUDGET_PERCENTAGES:
        budget = (pct / 100.0) * total_exec_time
        print(f"\n--- Running for Budget: {pct}% (Max Time: {budget:.2f}) ---")

        # Greedy-Req is deterministic, so the selection is evaluated once per
        # budget and the identical result is recorded NUM_RUNS_PER_BUDGET
        # times. This keeps the report layout consistent with the other
        # experimental setups without redoing the same work for every run.
        bv, reqs, tests, req_cov = run_greedy_req(budget, data_maps)

        for run in range(NUM_RUNS_PER_BUDGET):
            print(f"  Run {run + 1}/{NUM_RUNS_PER_BUDGET}...", end='\r')

            all_run_data.append({
                'budget_pct': pct,
                'run': run + 1,
                'total_final_bv': bv,
                'num_reqs_final': reqs,
                'num_tests_final': tests,
                'req_coverage_percent': req_cov
            })
        print(f"\n  -> Completed {NUM_RUNS_PER_BUDGET} runs.")

    # --- Analysis and Reporting ---
    raw_results_df = pd.DataFrame(all_run_data)

    # One row per budget level with mean/median of BV and coverage.
    summary_df = raw_results_df.groupby('budget_pct').agg(
        mean_bv=('total_final_bv', 'mean'),
        median_bv=('total_final_bv', 'median'),
        mean_req_cov=('req_coverage_percent', 'mean'),
        median_req_cov=('req_coverage_percent', 'median')
    ).reset_index()

    print("\n\n" + "="*60)
    print("--- Final Statistical Summary for Greedy-Req Selection ---")
    print(summary_df.to_string())
    print("="*60)

    # --- Generate Plots ---
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    fig.suptitle(f'Greedy-Req Baseline Performance for: {filename}', fontsize=16)

    axes[0].plot(summary_df['budget_pct'], summary_df['mean_req_cov'], marker='o', label='Mean Req Coverage %')
    axes[0].set_title('Requirement Coverage % vs. Budget')
    axes[0].set_ylabel('Coverage (%)')
    axes[0].legend()

    axes[1].plot(summary_df['budget_pct'], summary_df['mean_bv'], marker='o', label='Mean Total BV')
    axes[1].set_title('Total Business Value vs. Budget')
    axes[1].set_ylabel('Business Value')
    axes[1].legend()

    for ax in axes.flat:
        ax.set_xlabel('Time Budget (%)')
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.set_xlim(0, 100)
        ax.tick_params(axis='x', rotation=45)
    # Leave the top 5% of the canvas free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

    # --- Create Final Excel Report ---
    report_suffix = filename.replace('mapped_dataset_', '').replace('.xlsx', '')
    output_excel_filename = f"Greedy-Req_Analysis_Report_{report_suffix}.xlsx"
    print(f"\nGenerating report: '{output_excel_filename}'")

    # Render the figure into memory so it can be embedded in the workbook.
    img_buffer = BytesIO()
    fig.savefig(img_buffer, format='png')
    img_buffer.seek(0)

    with pd.ExcelWriter(output_excel_filename, engine='openpyxl') as writer:
        raw_results_df.to_excel(writer, sheet_name='Raw Run Data', index=False)
        summary_df.to_excel(writer, sheet_name='Summary Table', index=False)
        ws = writer.book.create_sheet(title="Performance Plots")
        ws.add_image(OpenpyxlImage(img_buffer), 'A1')

    print("Report saved successfully. You can now download it.")
    files.download(output_excel_filename)
else:
    print("\nCannot run experiment. Please upload a file in Cell 2.")