In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import warnings


In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas / numpy
# in the cells below are hidden as well; narrow it if one needs to be seen.
warnings.filterwarnings(action="ignore")

In [None]:
def vess_infra_true_ranks(df, coll_column = 'collated_score'):
    """Rank the verified source of every slick among that slick's candidates.

    Rows are grouped by ``slick`` and ordered by ``coll_column`` from highest
    to lowest. The 1-based position of the row whose ``hitl_verification`` is
    True is that slick's "true rank". Slicks that do not have exactly one
    verified row (none, or several) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``slick``, ``hitl_verification``, ``type``
        and ``coll_column``. The frame is not modified.
    coll_column : str
        Name of the score column to rank by (higher score = better rank).

    Returns
    -------
    tuple of (list, list)
        ``vessel_true_rank``: ranks of verified sources whose ``type`` is 1.
        ``infra_true_rank``: ranks of verified sources of any other ``type``.
    """
    vessel_true_rank = []
    infra_true_rank = []
    for _, group in df.groupby('slick'):
        # Sort into a new frame rather than mutating the groupby slice in place.
        ranked = group.sort_values(by=coll_column, ascending=False)
        ranked = ranked.assign(calculated_rank=list(range(1, len(ranked) + 1)))

        true_source = ranked[ranked['hitl_verification'] == True]
        if len(true_source) != 1:
            # The rank is only well defined with exactly one verified source.
            continue

        calc_rank = true_source['calculated_rank'].values[0]
        if true_source['type'].values[0] == 1:
            vessel_true_rank.append(calc_rank)
        else:
            infra_true_rank.append(calc_rank)
    return vessel_true_rank, infra_true_rank

In [None]:
# --- Disabled: earlier load of the 2024-12-20 dump ---------------------------
# The commented-out block below read the older dump, dropped user/bookkeeping
# columns, added zeroed rank/score columns, removed rows whose hitl_notes were
# 'multisource' / 'Multisource' / 'infrastructure', and then dropped rows whose
# coincidence_score equals their collated_score. It is kept for reference only.
# csv_path = r'C:\Users\ebeva\SkyTruth\cv3\slick_to_source dump 2024-12-20.csv'
# df = pd.read_csv(csv_path)
# df = df.drop(columns=['create_time',
#         'hitl_user', 'hitl_time', 'active', 'git_hash', 'id-2', 'name',
#        'email', 'emailVerified', 'image', 'role', 'id-3'])
# df['calculated_rank'] = 0
# df['custom_rank'] = 0
# df['calc_coll_score'] = 0
# df['custom_coll_score'] = 0
# df = df[df['hitl_notes'] != 'multisource']
# df = df[df['hitl_notes'] != 'Multisource']
# df = df[df['hitl_notes'] != 'infrastructure']


# not_noise = []
# for source in df.iloc:
#     if source['coincidence_score'] == source['collated_score']:
#         not_noise.append(False)
#     else:
#         not_noise.append(True)
# df = df[not_noise]

# Active load: the 2024-12-31 slick-to-source dump, used unfiltered.
# Later cells read its 'slick', 'hitl_verification', 'type', 'st_name',
# 'coincidence_score' and 'collated_score' columns.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
csv_path = r'C:\Users\ebeva\SkyTruth\cv3\slick_to_source dump 2024-12-31.csv'
true_slick_df = pd.read_csv(csv_path)

In [None]:
# Re-run vessel candidates. Later cells read its 'slick_id', 'st_name',
# 'overlap_score', 'temporal_score', 'distance_score', 'coll_score' and
# 'coincidence_score' columns.
# NOTE(review): absolute, machine-specific path, same caveat as the dump above.
rerun_df = pd.read_csv(r'C:\Users\ebeva\SkyTruth\cv3\asa_analysis\reattempt_rerun_with_weighted_8hour.csv')

Build a dataframe that combines infra sources with new vessel sources (increased time track + weighted curve)

In [None]:
columns = [
    'source', 'type', 'hitl_verification','slick', 'coincidence_score', 'collated_score', 'custom_coin_score', 'custom_coll_score', 'calculated_rank'
]

# Weights of the temporal / overlap / distance components in the weighted
# average that forms a re-run vessel's coincidence score.
temp_w = 2.0
over_w = 1.0
dist_w = 2.0

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating one row at a time is quadratic and leaves object dtypes behind.
records = []
for slick_id, group in true_slick_df.groupby('slick'):
    true_source = group[group['hitl_verification'] == True]
    if len(true_source) == 0:
        # Slicks without a verified source cannot be ranked.
        continue
    # Only the first verified source of a slick is used to label vessels.
    true_name = true_source['st_name'].values[0]

    # Explicit copy so the new score column is not written onto a slice of rerun_df.
    new_vess_sources = rerun_df[rerun_df['slick_id'] == slick_id].copy()
    new_vess_sources['coincidence_score'] = (
        over_w * new_vess_sources['overlap_score']
        + temp_w * new_vess_sources['temporal_score']
        + dist_w * new_vess_sources['distance_score']
    ) / (over_w + temp_w + dist_w)
    new_vess_sources = new_vess_sources.sort_values(by='coincidence_score', ascending=False)

    # Vessel candidates (type 1) come from the re-run; a vessel is "verified"
    # when its name matches the verified source of the original dump.
    for _, vess in new_vess_sources.iterrows():
        records.append({
            'source': vess['st_name'],
            'type': 1,
            'hitl_verification': vess['st_name'] == true_name,
            'slick': slick_id,
            'coincidence_score': vess['coincidence_score'],
            'collated_score': vess['coll_score'],
            'custom_coin_score': 0, 'custom_coll_score': 0, 'calculated_rank': 0
        })

    # Infrastructure candidates (type 2) are carried over unchanged from the dump.
    for _, infra in group[group['type'] == 2].iterrows():
        records.append({
            'source': infra['st_name'],
            'type': 2,
            'hitl_verification': infra['hitl_verification'],
            'slick': slick_id,
            'coincidence_score': infra['coincidence_score'],
            'collated_score': infra['collated_score'],
            'custom_coin_score': 0, 'custom_coll_score': 0, 'calculated_rank': 0
        })

df = pd.DataFrame(records, columns=columns)

In [None]:
# Previous mean / std values for infra and vessel sources. In this notebook
# they are only printed next to the recomputed NEW_* statistics for comparison.
# NOTE(review): presumably the distribution parameters currently used to
# collate coincidence scores; confirm where these numbers come from.
INFRA_MEAN = 0.58
INFRA_STD = 0.19
VESSEL_MEAN = 0.64
VESSEL_STD = 0.122

In [None]:
# Verified ("true") sources of each type. One combined mask is used because
# df[a][b] applies a full-length mask to an already filtered frame, which
# pandas has to re-align (and warns about).
is_verified = df['hitl_verification'] == True
infra_list = df[(df['type'] == 2) & is_verified]
vess_list = df[(df['type'] == 1) & is_verified]

# Coincidence scores of the verified sources and their mean / std per type.
INFRA_COIN = infra_list['coincidence_score'].values
VESS_COIN = vess_list['coincidence_score'].values
NEW_INFRA_MEAN, NEW_INFRA_STD = np.mean(INFRA_COIN), np.std(INFRA_COIN)
NEW_VESSEL_MEAN, NEW_VESSEL_STD = np.mean(VESS_COIN), np.std(VESS_COIN)

In [None]:
# Number of verified vessel sources behind NEW_VESSEL_MEAN / NEW_VESSEL_STD.
len(vess_list)

In [None]:
# Show each previous statistic next to its recomputed value.
for stat_label, previous_value, recomputed_value in (
    ("INFRA MEAN:", INFRA_MEAN, NEW_INFRA_MEAN),
    ("INFRA STD:", INFRA_STD, NEW_INFRA_STD),
    ("VESSEL_MEAN:", VESSEL_MEAN, NEW_VESSEL_MEAN),
    ("VESSEL_STD:", VESSEL_STD, NEW_VESSEL_STD),
):
    print(stat_label, previous_value, "--->", recomputed_value)

(Optional) Analyze only slicks with multiple source types

In [None]:
# mixed_source_examples = []
# non_mixed_source = []
# for slick,group in df.groupby('slick'):
#     # print(np.unique(group['type'].values))
#     unique_source_types = np.unique(group['type'].values)
#     if len(unique_source_types)==2:
#         mixed_source_examples.append(slick)
#     else:
#         non_mixed_source.append(unique_source_types[0])
# df = df[df["slick"].isin(mixed_source_examples)]

Calculate collation score with fixed vessel mean/std and variable infra mean/std

In [None]:
from tqdm import tqdm

print("Calculate collated score by adjusting infra distributions")

# Grid of candidate infra mean / std values.
infra_mean_adjustments = list(np.arange(0, .9, 0.025))
# std == 0 is dropped from the grid: the z-score below divides by the std, so
# that point only produced +/-inf and NaN scores.
infra_std_adjustments = [s for s in np.arange(0.0, 0.5, 0.01) if s > 0]

# One (mean, std, value) tuple per grid point.
infra_ranking_dps = []
vess_ranking_dps = []
vess_top_3_dps = []
infra_top_3_dps = []

# Loop-invariant inputs: raw scores, the vessel mask, and the vessel z-scores
# (vessels always use the fixed NEW_VESSEL_* distribution).
coin_scores = df['coincidence_score'].astype(float).to_numpy()
is_vessel = (df['type'] == 1).to_numpy()
vessel_z = (coin_scores - NEW_VESSEL_MEAN) / NEW_VESSEL_STD

for infra_mean_adjust in tqdm(infra_mean_adjustments, desc="Adjusting Infra Mean"):
    for infra_std_adjust in tqdm(infra_std_adjustments, desc="Adjusting Infra Std", leave=False):
        # Every non-vessel row is z-scored against the candidate infra distribution.
        infra_z = (coin_scores - infra_mean_adjust) / infra_std_adjust
        df['custom_coll_score'] = np.where(is_vessel, vessel_z, infra_z)

        vessel_true_rank, infra_true_rank = vess_infra_true_ranks(df, coll_column='custom_coll_score')

        # Average rank of the verified source, per source type.
        infra_ranking_dps.append((infra_mean_adjust, infra_std_adjust, np.mean(infra_true_rank)))
        vess_ranking_dps.append((infra_mean_adjust, infra_std_adjust, np.mean(vessel_true_rank)))

        # Share of slicks whose verified source is ranked in the top 3.
        vess_top_3_dps.append((infra_mean_adjust, infra_std_adjust, np.sum(np.array(vessel_true_rank)<=3) / len(vessel_true_rank)))
        infra_top_3_dps.append((infra_mean_adjust, infra_std_adjust, np.sum(np.array(infra_true_rank)<=3) / len(infra_true_rank)))




In [None]:
# Verified sources of each type, selected with one combined mask (df[a][b]
# would apply a full-length mask to an already filtered frame).
is_verified = df['hitl_verification'] == True
infra_list = df[(df['type'] == 2) & is_verified]
vess_list = df[(df['type'] == 1) & is_verified]

# Mean / std of whatever is currently stored in 'custom_coll_score'
# for the verified sources of each type.
print("vess mean and std:",round(np.mean(vess_list['custom_coll_score']),3), round(np.std(vess_list['custom_coll_score']),3))
print("infra mean and std",round(np.mean(infra_list['custom_coll_score']),3), round(np.std(infra_list['custom_coll_score']),3))

In [None]:

# For every grid point keep the worse (larger) of the infra and vessel average
# ranks, so that the search below balances both source types.
max_ranking_dps = [
    [infra_dp[0], infra_dp[1], max(infra_dp[2], vess_dp[2])]
    for infra_dp, vess_dp in zip(infra_ranking_dps, vess_ranking_dps)
]

# Start from +inf instead of a hard-coded rank of 5, so a best point is found
# even when every worst-case average rank is 5 or more.
minimum = float('inf')
for i, (_, _, rank) in enumerate(max_ranking_dps):
    if rank < minimum:
        minimum = rank
        mean = max_ranking_dps[i][0]
        std = max_ranking_dps[i][1]
        min_vess_ranking = vess_ranking_dps[i][2]
        min_infra_ranking = infra_ranking_dps[i][2]

if minimum == float('inf'):
    # Nothing was selected (empty grid, or every average rank is NaN).
    raise ValueError("no grid point with a valid average ranking was found")

print("IDEAL MEAN AND STD ADJUSTMENT FOUND AT", round(mean,3),"MEAN AND", round(std,3), "STD")
print("With vess avg ranking:", min_vess_ranking)
print("With infra avg ranking:", min_infra_ranking)


In [None]:
# IDEAL MEAN AND STD ADJUSTMENT FOUND AT 0.3 MEAN AND 0.22 STD
# With vess avg ranking: 1.3655913978494623
# With infra avg ranking: 1.3650793650793651

In [None]:
# First set of points
x1, y1, z1 = np.array(infra_ranking_dps).transpose(1,0)

# Second set of points
x2, y2, z2 = np.array(vess_ranking_dps).transpose(1,0)

# Create the figure and add traces
fig = go.Figure()

# Infra Avg Ranking
fig.add_trace(go.Scatter3d(
    x=x1, y=y1, z=z1,
    mode='markers',
    marker=dict(
        size=6,
        color=z1,  # Use z-values for color
        colorscale='Blues',  # Gradient scale
        opacity=1.0
    ),
    name='Infra Avg Ranking'
))

# Vessel Avg Ranking
fig.add_trace(go.Scatter3d(
    x=x2, y=y2, z=z2,
    mode='markers',
    marker=dict(
        size=6,
        color=z2,  # Use z-values for color
        colorscale='Reds',
        opacity=1.0
    ),
    name='Vessel Avg Ranking'
))

# Optimized Point
fig.add_trace(go.Scatter3d(
    x=[mean], y=[std], z=[minimum],
    mode='markers',
    marker=dict(
        size=10,
        color='green',
        opacity=1.0
    ),
    name='Optimized Point'
))

# Update layout
fig.update_layout(
    scene=dict(
        xaxis_title='Mean Adjustment',
        yaxis_title='Std Adjustment',
        zaxis_title='Avg Ranking',
        aspectratio=dict(x=1, y=1, z=1.0)
    ),
    title='Z (avg ranking) at various X (mean adjustment) and Y (std adjustmentment)',
    showlegend=True
)

fig.show()



In [None]:
# For every grid point keep the worse (smaller) of the infra and vessel top-3
# rates, so that the search below balances both source types.
min_top_3_dps = [
    [infra_dp[0], infra_dp[1], min(infra_dp[2], vess_dp[2])]
    for infra_dp, vess_dp in zip(infra_top_3_dps, vess_top_3_dps)
]

# Start from -inf instead of 0: with 0, a grid whose worst-case rates are all
# zero selects nothing, and `mean` / `std` silently keep whatever an earlier
# cell assigned to them.
maximum = float('-inf')
for i, (_, _, bot) in enumerate(min_top_3_dps):
    if bot > maximum:
        maximum = bot
        mean = min_top_3_dps[i][0]
        std = min_top_3_dps[i][1]
        min_vess_top_3 = vess_top_3_dps[i][2]
        min_infra_top_3 = infra_top_3_dps[i][2]

if maximum == float('-inf'):
    # Nothing was selected (empty grid, or every top-3 rate is NaN).
    raise ValueError("no grid point with a valid top 3 rate was found")

print("IDEAL MEAN AND STD ADJUSTMENT FOUND AT", round(mean,3),"MEAN AND", round(std,3), "STD")
print("With vess top 3 rate:", min_vess_top_3)
print("With infra top 3 rate:", min_infra_top_3)

In [None]:
# IDEAL MEAN AND STD ADJUSTMENT FOUND AT 0.35 MEAN AND 0.26 STD
# With vess top 3 rate: 0.967741935483871
# With infra top 3 rate: 0.9682539682539683

In [None]:
# First set of points
x1, y1, z1 = np.array(infra_top_3_dps).transpose(1,0)

# Second set of points
x2, y2, z2 = np.array(vess_top_3_dps).transpose(1,0)

# Create the figure and add traces
fig = go.Figure()

# Infra Avg Ranking
fig.add_trace(go.Scatter3d(
    x=x1, y=y1, z=z1,
    mode='markers',
    marker=dict(
        size=6,
        color=z1,  # Use z-values for color
        colorscale='Blues',  # Gradient scale
        opacity=1.0
    ),
    name='Infra Top 3 Rate'
))

# Vessel Avg Ranking
fig.add_trace(go.Scatter3d(
    x=x2, y=y2, z=z2,
    mode='markers',
    marker=dict(
        size=6,
        color=z2,  # Use z-values for color
        colorscale='Reds',
        opacity=1.0
    ),
    name='Vessel Top 3 Rate'
))

# Optimized Point
fig.add_trace(go.Scatter3d(
    x=[mean], y=[std], z=[maximum],
    mode='markers',
    marker=dict(
        size=10,
        color='green',
        opacity=1.0
    ),
    name='Optimized Point'
))

# Update layout
fig.update_layout(
    scene=dict(
        xaxis_title='Mean Adjustment',
        yaxis_title='Std Adjustment',
        zaxis_title='Top 3 Rate',
        aspectratio=dict(x=1, y=1, z=1.0)
    ),
    title='Z (top 3 rate) at various X (mean adjustment) and Y (std adjustmentment)',
    showlegend=True
)

fig.show()



Investigate absolute ranking

In [None]:
# Rank of the verified vessel among same-type candidates, by raw score:
# `ranks` comes from the combined frame (df), `ranks_old` from the original
# dump (true_slick_df). Slicks whose rank got worse are printed.
ranks = []
ranks_old = []
sort_by = 'coincidence_score'
# sort_by = 'coll_score'
# sort_by = 'collated_score'
for slick, g in df.groupby('slick'):
    g_old = true_slick_df[true_slick_df['slick'] == slick]
    true_source = g[g['hitl_verification'] == True]
    true_source_old = g_old[g_old['hitl_verification'] == True]

    # Only slicks whose first verified source in df is a vessel are compared.
    if len(true_source) == 0:
        continue
    new_true_type = true_source['type'].values[0]
    if new_true_type != 1:
        continue

    # Keep candidates of the verified source's type, best score first.
    g = g[g['type'] == new_true_type].sort_values(by=sort_by, ascending=False)
    old_true_type = true_source_old['type'].values[0]
    g_old = g_old[g_old['type'] == old_true_type].sort_values(by=sort_by, ascending=False)

    # 1-based position in the sorted order.
    g = g.assign(calculated_rank=list(range(1, len(g) + 1)))
    g_old = g_old.assign(calculated_rank=list(range(1, len(g_old) + 1)))

    g_rank = g.loc[g['hitl_verification'] == True, 'calculated_rank'].values[0]
    g_rank_old = g_old.loc[g_old['hitl_verification'] == True, 'calculated_rank'].values[0]
    ranks.append(g_rank)
    ranks_old.append(g_rank_old)

    # Report slicks where the verified vessel dropped in rank.
    if g_rank > g_rank_old:
        print(slick)
        print(g_rank_old, "--->", g_rank)

        

In [None]:
# Average rank of the verified vessel in the original dump (true_slick_df).
np.mean(ranks_old)

In [None]:
# Average rank of the verified vessel in the combined frame (df).
np.mean(ranks)

Investigate temporal weighting adjustment

In [None]:
columns = [
    'source',
    'type',
    'hitl_verification',
    'slick',
    'coincidence_score',
    'collated_score',
    'custom_coin_score',
    'custom_coll_score',
    # The comma after 'calculated_rank' was missing, which fused it with the
    # next entry into a bogus 'calculated_rankoverlap_score' column.
    'calculated_rank',
    'overlap_score',
    'temporal_score',
    'distance_score'
]

# Vessel candidates only, with the individual score components kept so the
# temporal weight can be varied afterwards. Rows are collected in a list and
# the frame is built once instead of concatenating row by row.
records = []
for slick_id, group in true_slick_df.groupby('slick'):
    true_source = group[group['hitl_verification'] == True]
    if len(true_source) == 0:
        # Slicks without a verified source cannot be ranked.
        continue
    # Only the first verified source of a slick is used to label vessels.
    true_name = true_source['st_name'].values[0]

    # Here the vessels are ordered by rerun_df's own 'coincidence_score' column.
    new_vess_sources = rerun_df[rerun_df['slick_id'] == slick_id]
    new_vess_sources = new_vess_sources.sort_values(by='coincidence_score', ascending=False)
    for _, vess in new_vess_sources.iterrows():
        records.append({
            'source': vess['st_name'],
            'type': 1,
            'hitl_verification': vess['st_name'] == true_name,
            'slick': slick_id,
            'coincidence_score': vess['coincidence_score'],
            'collated_score': vess['coll_score'],
            'overlap_score': vess['overlap_score'],
            'temporal_score': vess['temporal_score'],
            'distance_score': vess['distance_score'],
            'custom_coin_score': 0, 'custom_coll_score': 0, 'calculated_rank': 0
        })

df_temps = pd.DataFrame(records, columns=columns)

In [None]:
# Candidate weights for the temporal component; the distance and overlap
# weights stay fixed at 2 and 1.
w_temps = list(np.arange(0.0, 10, 0.25))

# NOTE: these two names are also used by the infra grid-search cells above and
# are overwritten here with one plain value per temporal weight.
vess_ranking_dps = []
vess_top_3_dps = []

# Loop-invariant score components, pulled out once as float arrays.
distance_scores = df_temps['distance_score'].astype(float).to_numpy()
overlap_scores = df_temps['overlap_score'].astype(float).to_numpy()
temporal_scores = df_temps['temporal_score'].astype(float).to_numpy()

for w_temp in w_temps:
    # Weighted average of the three components, computed for all rows at once
    # and stored in the column that the ranking helper sorts by.
    df_temps['custom_coll_score'] = (
        2*distance_scores + 1*overlap_scores + w_temp*temporal_scores
    )/(3 + w_temp)

    # df_temps holds only vessel rows, so the infra ranks are not needed.
    vessel_true_rank, _ = vess_infra_true_ranks(df_temps, coll_column='custom_coll_score')

    # Average rank of the verified vessel and share ranked in the top 3.
    vess_ranking_dps.append(np.mean(vessel_true_rank))
    vess_top_3_dps.append(np.sum(np.array(vessel_true_rank)<=3) / len(vessel_true_rank))


In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels sharing the temporal-weight x axis.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True)

# (y values, legend label and panel title, y-axis label, line colour)
panel_specs = (
    (vess_ranking_dps, "Average Vessel Rank", "Average Rank", 'blue'),
    (vess_top_3_dps, "Top 3 Source Rate", "Source Rate", 'green'),
)
for ax, (y_values, panel_name, y_label, line_color) in zip(axes, panel_specs):
    ax.plot(w_temps, y_values, label=panel_name, color=line_color)
    ax.set_title(panel_name)
    ax.set_xlabel("Temporal Weighting Adjustment")
    ax.set_ylabel(y_label)
    ax.legend()

fig.suptitle("Comparison of Vessel Ranking Metrics with 8 Hour AIS track", fontsize=16)

# Leave the top 5% of the figure free for the overall title.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
