In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import warnings

import os
import sys

from dotenv import load_dotenv
from tqdm import tqdm


load_dotenv()

In [None]:
# Locations of the two project checkouts are read from the environment
# (GIT_FOLDER and CV3_FOLDER) and appended to the import search path so the
# project packages imported further down can be resolved.
git_path, cv3_path = (os.getenv(name) for name in ("GIT_FOLDER", "CV3_FOLDER"))
sys.path.extend([git_path, cv3_path])

In [None]:
from cerulean_cloud.cloud_function_asa.utils.constants import (
    INFRA_MEAN,
    INFRA_STD,
    VESSEL_MEAN,
    VESSEL_STD,
)

In [None]:
warnings.filterwarnings(action="ignore")

In [None]:
def hitl_ground_truth_ranks(df, source_type, coll_column="collated_score"):
    """
    Collect the rank of the verified source of each slick for one source type.

    Rows are grouped by the ``slick`` column. Inside each group the rows are
    ordered by ``coll_column`` from highest to lowest and numbered 1..n. When a
    group contains exactly one row flagged by ``hitl_verification`` and that
    row's ``type`` equals ``source_type``, its 1-based position is collected.

    Groups with zero or more than one verified row are skipped silently.

    Args:
        df: DataFrame with the columns ``slick``, ``type``,
            ``hitl_verification`` (used as a boolean mask) and ``coll_column``.
            It is not modified.
        source_type: Value compared against the ``type`` column of the
            verified row.
        coll_column: Name of the score column used for ordering.

    Returns:
        list[int]: One rank per qualifying slick, in groupby (sorted key) order.
    """
    true_rank = []
    for _, group in df.groupby("slick"):
        # Work on a freshly sorted copy instead of sorting/assigning on the
        # groupby slice in place, so neither the slice nor ``df`` is touched.
        ranked = group.sort_values(by=coll_column, ascending=False).assign(
            calculated_rank=np.arange(1, len(group) + 1)
        )
        true_source = ranked[ranked["hitl_verification"]]

        # Only slicks with a single, unambiguous verified source are usable.
        if len(true_source) != 1:
            continue

        if true_source["type"].iloc[0] == source_type:
            true_rank.append(int(true_source["calculated_rank"].iloc[0]))
    return true_rank

In [None]:
def _rank_summary(ranks):
    """Return (mean rank, share of ranks <= 3); (nan, nan) for an empty list."""
    if len(ranks) == 0:
        # Avoid the ZeroDivisionError of ``count / len(ranks)`` on empty input.
        return float("nan"), float("nan")
    rank_array = np.asarray(ranks)
    return np.mean(rank_array), np.sum(rank_array <= 3) / len(rank_array)


def calculate_adj_coll_ranks(
    hitl_df,
    mean_adjustments,
    std_adjustments,
    fixed_source_type,
    adjusted_source_type,
    fixed_mean,
    fixed_std,
):
    """
    Evaluate ranking metrics of two source types over a grid of collation
    parameters, keeping the parameters of one source type fixed.

    For every ``(mean_adjust, std_adjust)`` pair a score
    ``(coincidence_score - mean) / std`` is computed per row: rows whose
    ``type`` equals ``fixed_source_type`` use ``fixed_mean`` / ``fixed_std``,
    every other row uses the pair being evaluated. The verified-source ranks
    of both source types are then obtained from ``hitl_ground_truth_ranks``
    and summarised as the mean rank and the share of ranks <= 3.

    Side effect: the column ``custom_coll_score`` of ``hitl_df`` is created or
    overwritten on every grid point, so after the call it holds the scores of
    the last ``(mean_adjust, std_adjust)`` pair evaluated.

    A ``std_adjust`` of 0 is not rejected; the division then yields inf/nan
    scores for the adjusted rows.

    NOTE(review): the progress-bar labels mention "Infra" regardless of
    ``adjusted_source_type``.

    Args:
        hitl_df: DataFrame with the columns ``type``, ``coincidence_score`` and
            the columns required by ``hitl_ground_truth_ranks``.
        mean_adjustments: Iterable of candidate means (outer loop).
        std_adjustments: Iterable of candidate stds (inner loop).
        fixed_source_type: ``type`` value whose parameters stay fixed.
        adjusted_source_type: ``type`` value whose ranks are reported as the
            "adjusted" metrics.
        fixed_mean: Mean applied to rows of the fixed source type.
        fixed_std: Std applied to rows of the fixed source type.

    Returns:
        tuple of four lists of ``(mean_adjust, std_adjust, value)`` tuples, in
        this order: adjusted mean rank, adjusted top-3 rate, fixed mean rank,
        fixed top-3 rate. ``value`` is nan when no rank was available.
    """
    adjusted_ranking_dps = []
    fixed_ranking_dps = []
    adjusted_top_3_dps = []
    fixed_top_3_dps = []

    # Loop-invariant inputs: raw scores, the fixed-type row mask and the
    # scores of the fixed-type rows never change across the grid.
    raw_scores = hitl_df["coincidence_score"].to_numpy(dtype=float)
    is_fixed = (hitl_df["type"] == fixed_source_type).to_numpy()
    with np.errstate(divide="ignore", invalid="ignore"):
        fixed_scores = (raw_scores - fixed_mean) / fixed_std

    # Wrap both loops with tqdm for progress bars
    for mean_adjust in tqdm(mean_adjustments, desc="Adjusting Infra Mean"):
        for std_adjust in tqdm(
            std_adjustments, desc="Adjusting Infra Std", leave=False
        ):
            # Vectorised replacement for the per-row score computation.
            with np.errstate(divide="ignore", invalid="ignore"):
                adjusted_scores = (raw_scores - mean_adjust) / std_adjust
            hitl_df["custom_coll_score"] = np.where(
                is_fixed, fixed_scores, adjusted_scores
            )

            fixed_type_true_rank = hitl_ground_truth_ranks(
                hitl_df, fixed_source_type, coll_column="custom_coll_score"
            )
            adjusted_type_true_rank = hitl_ground_truth_ranks(
                hitl_df, adjusted_source_type, coll_column="custom_coll_score"
            )

            fixed_mean_rank, fixed_top_3 = _rank_summary(fixed_type_true_rank)
            adjusted_mean_rank, adjusted_top_3 = _rank_summary(
                adjusted_type_true_rank
            )

            # Append one data point per metric for this grid position
            adjusted_ranking_dps.append((mean_adjust, std_adjust, adjusted_mean_rank))
            fixed_ranking_dps.append((mean_adjust, std_adjust, fixed_mean_rank))
            fixed_top_3_dps.append((mean_adjust, std_adjust, fixed_top_3))
            adjusted_top_3_dps.append((mean_adjust, std_adjust, adjusted_top_3))

    return adjusted_ranking_dps, adjusted_top_3_dps, fixed_ranking_dps, fixed_top_3_dps

In [None]:
def search_optimal_rankings_dp(
    fix_ranking_dps, adj_ranking_dps, fix_type="Vessel", adj_type="Infra"
):
    """
    Search for the mean and std that minimise the worse of the two average ranks.

    The two lists are walked in parallel. For each position the larger of the
    two average ranks (the worst case across both source types) is taken, and
    the position with the smallest such value wins; on ties the earliest
    position is kept. Positions where either rank is NaN are ignored.

    The winning parameters and both average ranks at that position are printed.

    Args:
        fix_ranking_dps: Sequence of ``(mean, std, avg_rank)`` for the fixed
            source type.
        adj_ranking_dps: Sequence of ``(mean, std, avg_rank)`` for the adjusted
            source type, aligned index-by-index with ``fix_ranking_dps``. The
            reported mean/std are taken from this sequence.
        fix_type: Label used for the fixed source type in the printout.
        adj_type: Label used for the adjusted source type in the printout.

    Returns:
        tuple: ``(mean, std, minimum)`` where ``minimum`` is the smallest
        worst-case average rank.

    Raises:
        ValueError: If the inputs differ in length or no position has a valid
            (non-NaN) rank for both source types.
    """
    if len(fix_ranking_dps) != len(adj_ranking_dps):
        raise ValueError(
            "fix_ranking_dps and adj_ranking_dps must have the same length"
        )

    # (worst_rank, mean, std, fix_rank, adj_rank) of the best position so far.
    # No numeric sentinel: any finite rank is accepted as a first candidate.
    best = None
    for fix_dp, adj_dp in zip(fix_ranking_dps, adj_ranking_dps):
        fix_rank, adj_rank = fix_dp[2], adj_dp[2]
        # max() with NaN depends on argument order, so drop such points explicitly.
        if np.isnan(fix_rank) or np.isnan(adj_rank):
            continue
        worst_rank = max(adj_rank, fix_rank)
        if best is None or worst_rank < best[0]:
            best = (worst_rank, adj_dp[0], adj_dp[1], fix_rank, adj_rank)

    if best is None:
        raise ValueError(
            "no data point has a valid average ranking for both source types"
        )

    minimum, mean, std, min_fix_ranking, min_adj_ranking = best

    print(
        "IDEAL MEAN AND STD ADJUSTMENT FOUND AT",
        round(mean, 3),
        "MEAN AND",
        round(std, 3),
        "STD",
    )
    print(f"With {fix_type} avg ranking:", min_fix_ranking)
    print(f"With {adj_type} avg ranking:", min_adj_ranking)

    return (mean, std, minimum)

In [None]:
def search_optimal_top_3_dp(
    fix_top_3_dps, adj_top_3_dps, fix_type="Vessel", adj_type="Infra"
):
    """
    Search for the mean and std that maximise the worse of the two top-3 rates.

    The two lists are walked in parallel. For each position the smaller of the
    two top-3 rates (the worst case across both source types) is taken, and
    the position with the largest such value wins; on ties the earliest
    position is kept. Positions where either rate is NaN are ignored.

    The winning parameters and both top-3 rates at that position are printed.

    Args:
        fix_top_3_dps: Sequence of ``(mean, std, top_3_rate)`` for the fixed
            source type.
        adj_top_3_dps: Sequence of ``(mean, std, top_3_rate)`` for the adjusted
            source type, aligned index-by-index with ``fix_top_3_dps``. The
            reported mean/std are taken from this sequence.
        fix_type: Label used for the fixed source type in the printout.
        adj_type: Label used for the adjusted source type in the printout.

    Returns:
        tuple: ``(mean, std, maximum)`` where ``maximum`` is the largest
        worst-case top-3 rate.

    Raises:
        ValueError: If the inputs differ in length or no position has a valid
            (non-NaN) rate for both source types.
    """
    if len(fix_top_3_dps) != len(adj_top_3_dps):
        raise ValueError("fix_top_3_dps and adj_top_3_dps must have the same length")

    # (worst_rate, mean, std, fix_rate, adj_rate) of the best position so far.
    # No numeric sentinel: a rate of 0 is still accepted as a first candidate.
    best = None
    for fix_dp, adj_dp in zip(fix_top_3_dps, adj_top_3_dps):
        fix_rate, adj_rate = fix_dp[2], adj_dp[2]
        # min() with NaN depends on argument order, so drop such points explicitly.
        if np.isnan(fix_rate) or np.isnan(adj_rate):
            continue
        worst_rate = min(adj_rate, fix_rate)
        if best is None or worst_rate > best[0]:
            best = (worst_rate, adj_dp[0], adj_dp[1], fix_rate, adj_rate)

    if best is None:
        raise ValueError(
            "no data point has a valid top 3 rate for both source types"
        )

    maximum, mean, std, min_fix_top_3, min_adj_top_3 = best

    print(
        "IDEAL MEAN AND STD ADJUSTMENT FOUND AT",
        round(mean, 3),
        "MEAN AND",
        round(std, 3),
        "STD",
    )
    print(f"With {fix_type} top 3 rate:", min_fix_top_3)
    print(f"With {adj_type} top 3 rate:", min_adj_top_3)

    return (mean, std, maximum)

In [None]:
def _scatter3d_markers(dps, colorscale, name):
    """Build a Scatter3d marker trace from (x, y, z) points, coloured by z."""
    x, y, z = np.asarray(dps).T
    return go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode="markers",
        marker=dict(
            size=6,
            color=z,  # Use z-values for color
            colorscale=colorscale,  # Gradient scale
            opacity=1.0,
        ),
        name=name,
    )


def plot_dps_3d(
    dps1,
    dps2,
    opt_point=None,
    metric="Ranking",
    source_type1="Vessel",
    source_type2="Infra",
):
    """
    Show two sets of (mean, std, metric) points in one interactive 3D scatter.

    ``dps1`` is drawn with the "Blues" colour scale and labelled with
    ``source_type1``; ``dps2`` is drawn with the "Reds" colour scale and
    labelled with ``source_type2``. The labels are not derived from the data,
    so callers must pass them in the same order as the point sets.

    Args:
        dps1: Sequence of ``(x, y, z)`` points for the first trace.
        dps2: Sequence of ``(x, y, z)`` points for the second trace.
        opt_point: Optional ``(x, y, z)`` point drawn as a larger green marker.
        metric: Metric name used in the legend, z-axis title and figure title.
        source_type1: Legend label prefix for ``dps1``.
        source_type2: Legend label prefix for ``dps2``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()

    # One trace per point set, with matching legend wording
    fig.add_trace(
        _scatter3d_markers(dps1, "Blues", f"{source_type1} Average {metric}")
    )
    fig.add_trace(
        _scatter3d_markers(dps2, "Reds", f"{source_type2} Average {metric}")
    )

    # Optimized Point
    if opt_point is not None:
        opt_x, opt_y, opt_z = opt_point[0], opt_point[1], opt_point[2]
        fig.add_trace(
            go.Scatter3d(
                x=[opt_x],
                y=[opt_y],
                z=[opt_z],
                mode="markers",
                marker=dict(size=10, color="green", opacity=1.0),
                name="Optimized Point",
            )
        )

    # Update layout
    fig.update_layout(
        scene=dict(
            xaxis_title="Mean Adjustment",
            yaxis_title="Std Adjustment",
            zaxis_title=f"Avg {metric}",
            aspectratio=dict(x=1, y=1, z=1.0),
        ),
        title=f"Z (Average {metric}) at various X (mean adjustment) and Y (std adjustment)",
        showlegend=True,
    )

    fig.show()

Load the HITL dataframe from the CSV export.

In [None]:
# Join the path components instead of concatenating strings, so the file is
# found whether or not CV3_FOLDER ends with a path separator.
csv_path = os.path.join(
    cv3_path, "asa_analysis/evaluation/slick_to_source dump 2024-12-31.csv"
)
hitl_df = pd.read_csv(csv_path)

Calculate the mean and standard deviation of the coincidence scores of the verified infra and vessel sources.

In [None]:
# Select the HITL-verified rows of each source type with one combined boolean
# mask (type 2 is treated as infra and type 1 as vessel, per the variable
# names). A single mask avoids indexing the filtered frame with a mask that
# was built on the unfiltered one.
infra_list = hitl_df[(hitl_df["type"] == 2) & hitl_df["hitl_verification"]]
vess_list = hitl_df[(hitl_df["type"] == 1) & hitl_df["hitl_verification"]]

INFRA_COIN = infra_list["coincidence_score"].values
VESS_COIN = vess_list["coincidence_score"].values

# np.std is called with its defaults here (population standard deviation).
NEW_INFRA_MEAN, NEW_INFRA_STD = np.mean(INFRA_COIN), np.std(INFRA_COIN)
NEW_VESSEL_MEAN, NEW_VESSEL_STD = np.mean(VESS_COIN), np.std(VESS_COIN)

In [None]:
# Show each imported constant next to the value recomputed from the HITL data.
for _label, _old, _new in (
    ("INFRA MEAN:", INFRA_MEAN, NEW_INFRA_MEAN),
    ("INFRA STD:", INFRA_STD, NEW_INFRA_STD),
    ("VESSEL_MEAN:", VESSEL_MEAN, NEW_VESSEL_MEAN),
    ("VESSEL_STD:", VESSEL_STD, NEW_VESSEL_STD),
):
    print(_label, _old, "--->", _new)

Calculate the average rank and top-3 rate of each source type over a grid of collation parameters (mean and std).

In [None]:
print("Calculate collated score by adjusting infra distributions")

# Candidate infra parameters. np.linspace gives an exact point count, which a
# float-step np.arange does not guarantee.
#   mean: 0.000, 0.025, ..., 0.875 (36 points)
#   std:  0.01, 0.02, ..., 0.49   (49 points) -- starts above 0 because the
#         std is used as a divisor when the scores are computed.
infra_mean_adjustments = list(np.linspace(0.0, 0.875, 36))
infra_std_adjustments = list(np.linspace(0.01, 0.49, 49))

# Vessel (type 1) parameters stay fixed at the values measured above while the
# infra (type 2) parameters are swept over the grid.
infra_ranking_dps, infra_top_3_dps, vessel_ranking_dps, vessel_top_3_dps = (
    calculate_adj_coll_ranks(
        hitl_df=hitl_df,
        mean_adjustments=infra_mean_adjustments,
        std_adjustments=infra_std_adjustments,
        fixed_source_type=1,
        adjusted_source_type=2,
        fixed_mean=NEW_VESSEL_MEAN,
        fixed_std=NEW_VESSEL_STD,
    )
)

Find the optimal mean and std values and display them on 3D charts of each metric over the mean/std grid.

In [None]:
opt_point = search_optimal_rankings_dp(
    fix_ranking_dps=vessel_ranking_dps, adj_ranking_dps=infra_ranking_dps
)
# The infra points are passed first, so the legend labels are given explicitly
# in the same order; the function defaults would label them the other way round.
plot_dps_3d(
    infra_ranking_dps,
    vessel_ranking_dps,
    opt_point=opt_point,
    source_type1="Infra",
    source_type2="Vessel",
)

In [None]:
opt_point = search_optimal_top_3_dp(
    fix_top_3_dps=vessel_top_3_dps, adj_top_3_dps=infra_top_3_dps
)
# The infra points are passed first, so the legend labels are given explicitly
# in the same order; the function defaults would label them the other way round.
plot_dps_3d(
    infra_top_3_dps,
    vessel_top_3_dps,
    opt_point=opt_point,
    metric="Top 3 Source Rate",
    source_type1="Infra",
    source_type2="Vessel",
)

Verify the absolute mean and std of the collation scores of the verified vessel and infra sources.

In [None]:
# Re-select the verified rows so they include the "custom_coll_score" column,
# using one combined boolean mask per source type (2 = infra, 1 = vessel, per
# the variable names).
# NOTE(review): "custom_coll_score" holds whatever calculate_adj_coll_ranks
# wrote last, i.e. the scores of the final (mean, std) grid point evaluated --
# not necessarily those of the optimum found above.
infra_list = hitl_df[(hitl_df["type"] == 2) & hitl_df["hitl_verification"]]
vess_list = hitl_df[(hitl_df["type"] == 1) & hitl_df["hitl_verification"]]

print(
    "Vessel Collation Score mean and std:",
    round(np.mean(vess_list["custom_coll_score"]), 3),
    round(np.std(vess_list["custom_coll_score"]), 3),
)
print(
    "Infra Collation Score mean and std",
    round(np.mean(infra_list["custom_coll_score"]), 3),
    round(np.std(infra_list["custom_coll_score"]), 3),
)