In [None]:
import pandas as pd
# Location of the raw input, relative to the notebook's working directory.
file_path = '../data.json'

# Parse the JSON file into a DataFrame; later cells read the `uuid`,
# `deadweight` and `positions` fields of each row.
df = pd.read_json(path_or_buf=file_path)

In [None]:
from datetime import datetime

# Column order of the flattened ping table. Passing it to the DataFrame
# constructor keeps the schema stable even when no row has any position
# records (an empty record list would otherwise yield a frame without a
# 'timestamp' column and the conversion below would raise KeyError).
ping_fields = [
    'ship_id', 'deadweight', 'lat', 'lon', 'speed',
    'course', 'heading', 'destination', 'timestamp',
]

# One record per (ship, position ping). Rows whose `positions` value is not a
# list contribute no records.
flattened_data = [
    {
        'ship_id': row.uuid,
        'deadweight': row.deadweight,
        'lat': ping.get('lat'),
        'lon': ping.get('lon'),
        'speed': ping.get('speed'),
        'course': ping.get('course'),
        'heading': ping.get('heading'),
        'destination': ping.get('destination'),
        'timestamp': ping.get('last_position_UTC')
    }
    for row in df.itertuples()
    for ping in (row.positions if isinstance(row.positions, list) else [])
]

pings = pd.DataFrame(flattened_data, columns=ping_fields)

# Numeric timestamps are interpreted as epoch seconds; anything else is handed
# to pandas' general datetime parser.
if pd.api.types.is_numeric_dtype(pings['timestamp']):
    pings['timestamp'] = pd.to_datetime(pings['timestamp'], unit='s')
else:
    pings['timestamp'] = pd.to_datetime(pings['timestamp'])

# Index by calendar day, exact time and ship; assignment (rather than
# inplace=True) keeps the cell a plain chain of re-bindings.
pings['date'] = pings['timestamp'].dt.date
pings = pings.set_index(['date', 'timestamp', 'ship_id'])
target_columns = ['lon', 'lat', 'destination', 'speed', 'course', 'heading', 'deadweight']
available_cols = [c for c in target_columns if c in pings.columns]
pings = pings[available_cols]

print(f"Created DataFrame with {len(pings)} rows.")
display(pings.head(15))

In [None]:
import folium

# Only the first 5000 pings are drawn. Pings without coordinates are dropped
# because a marker needs a concrete [lat, lon] location.
subset = pings.head(5000).reset_index().dropna(subset=['lat', 'lon'])
map_center = [subset['lat'].mean(), subset['lon'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

for _, row in subset.iterrows():
    folium.CircleMarker(
        location=[row['lat'], row['lon']],
        radius=3,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        tooltip=f"Ship: {row['ship_id']}<br>Time: {row['timestamp']}"
    ).add_to(m)

# Render once, after every marker has been added. Displaying inside the loop
# would emit a separate copy of the map for every ping.
display(m)

In [None]:
import numpy as np

# Each ping is assigned to a zone purely by longitude. The bands are half-open
# intervals [lower, upper), so every non-null longitude matches exactly one.
lon = pings['lon']
conditions = [
    lon < 53.5,
    (lon >= 53.5) & (lon < 58.0),
    (lon >= 58.0) & (lon < 64.0),
    (lon >= 64.0) & (lon < 94.0),
    (lon >= 94.0) & (lon < 102.5),
    lon >= 102.5,
]

# Zone labels, in the same order as `conditions`.
choices = [
    '1_Ras_Tanura',
    '2_Hormuz',
    '3_Ras_al_Hadd',
    '4_Sri_Lanka_South',
    '5_Malacca_Strait',
    '6_Singapore'
]

# A missing longitude satisfies none of the comparisons and falls through to
# the 'Unknown' default.
pings['region'] = np.select(conditions, choices, default='Unknown')

# Spot-check a few assignments. The sample size is capped at the number of
# available rows because `sample` raises when asked for more rows than exist.
preview_size = min(10, len(pings))
display(pings[['lon', 'lat', 'region']].sample(preview_size).sort_values('lon'))

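Count pings per date and region, with the region columns sorted by name (the numeric prefixes put them in order of increasing longitude).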

In [None]:
# Daily ping count per region: one row per date, one column per region,
# zero where a region has no pings on a given day. Columns are sorted by name.
summary_df = (
    pings
    .groupby(['date', 'region'])
    .size()
    .unstack(fill_value=0)
    .sort_index(axis=1)
)
display(summary_df.head(10))


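Cleaning bad data: daily counts at or below a threshold are treated as missing and filled by linear interpolation from neighbouring days.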

In [None]:
# Daily counts at or below this value are treated as bad data and replaced.
THRESHOLD = 25

# Keep only counts above the threshold (everything else becomes NaN), then
# fill the gaps linearly from neighbouring days, extending to both ends.
summary_df_patched = summary_df.where(summary_df > THRESHOLD)
summary_df_patched = summary_df_patched.interpolate(method='linear', limit_direction='both')

# A column with no count above the threshold has nothing to interpolate from
# and stays all-NaN, which would make the integer cast below raise. Fall back
# to the raw counts for any cell that is still missing.
summary_df_patched = summary_df_patched.fillna(summary_df)
summary_df_patched = summary_df_patched.round().astype(int)

# Downstream cells read the cleaned table under the original name.
summary_df = summary_df_patched

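Changepoint detection: for each region, flag days where the most recent week of ping counts differs significantly (Welch's t-test) from the weeks before it.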

In [None]:
from scipy import stats
def online_detector(df, column, short_win=7, long_win=30, threshold=0.01):
    """Flag positions where a recent window differs from the window before it.

    For every position ``i`` the last ``short_win`` observations (ending at
    ``i``) are compared with the ``long_win - short_win`` observations that
    immediately precede them, using a two-sample t-test with unequal
    variances (``equal_var=False``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the series to scan.
    column : hashable
        Column of ``df`` to analyse.
    short_win : int
        Number of most recent observations forming the "now" sample.
    long_win : int
        Total look-back length; positions below this value are not tested.
    threshold : float
        p-values strictly below this mark a changepoint.

    Returns
    -------
    (pandas.Series, pandas.Index)
        The p-value for every position (indexed like ``df``) and the index
        labels whose p-value is below ``threshold``. Untested positions get
        a p-value of 1.0; a NaN p-value is never flagged because the
        comparison with ``threshold`` is False.
    """
    series = df[column]

    def get_p_value(i):
        # Positions without a full look-back are reported as "no evidence".
        if i < long_win:
            return 1.0
        sample_now = series.iloc[i - short_win + 1 : i + 1]
        sample_past = series.iloc[i - long_win + 1 : i - short_win + 1]
        _, p = stats.ttest_ind(sample_now, sample_past, equal_var=False)
        return p

    # Explicit float dtype keeps the threshold comparison well-defined even
    # when the input series is empty.
    p_values = pd.Series(
        [get_p_value(i) for i in range(len(series))],
        index=series.index,
        dtype=float,
    )
    changepoints = p_values[p_values < threshold].index

    return p_values, changepoints
# Run the detector on every region column and remember the flagged dates.
results = {}
for col in summary_df.columns:
    p_vals, cp_dates = online_detector(df=summary_df, column=col)
    results[col] = cp_dates
    print(f"Detected {len(results[col])} potential shift days in {col}")

Plot the daily ping counts for each zone and mark the detected changepoints.

In [None]:
import matplotlib.pyplot as plt

def coalesce_changepoints(dates, gap_days=14):
    """Thin out changepoint dates so that kept dates are spaced apart.

    The dates are sorted, the earliest is always kept, and each later date
    is kept only if it lies more than ``gap_days`` days after the most
    recently kept date.

    ``dates`` may be any collection of date-like values whose differences
    expose ``.days``; objects with an ``.empty`` attribute (e.g. a pandas
    Index) are checked through it. Returns a list, empty for empty input.
    """
    nothing_to_do = dates.empty if hasattr(dates, 'empty') else not dates
    if nothing_to_do:
        return []

    ordered = sorted(list(dates))
    kept = ordered[:1]
    for candidate in ordered[1:]:
        # Distance is measured from the last *kept* date, not the previous one.
        if (candidate - kept[-1]).days > gap_days:
            kept.append(candidate)

    return kept

# Collapse runs of flagged days into single events, per zone.
changepoint_dict = {
    col: coalesce_changepoints(results[col], gap_days=14)
    for col in summary_df.columns
}

# One panel per zone actually present in the table. The panel count follows
# the data instead of being fixed, so an extra column (e.g. 'Unknown') or a
# missing zone no longer breaks the axes indexing. The figure height keeps the
# original proportion of 22 inches for six panels.
zone_names = list(summary_df.columns)
n_panels = max(len(zone_names), 1)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(15, 22 * n_panels / 6), sharex=True)
# With a single panel `subplots` returns a bare Axes; normalise to an array.
axes = np.atleast_1d(axes)

for i, (ax, col) in enumerate(zip(axes, zone_names)):
    ax.plot(summary_df.index, summary_df[col], label='Pings', color='#1f77b4', lw=1.5)
    ax.fill_between(summary_df.index, summary_df[col], color='#1f77b4', alpha=0.1)
    events = changepoint_dict[col]
    for j, cp in enumerate(events):
        # Only the first line carries a label so the legend shows one entry.
        ax.axvline(x=cp, color='red', linestyle='--', alpha=0.8, label='Shift Detected' if j == 0 else "")

    ax.set_title(f"Zone: {col} | {len(events)} Major Shifts Detected", loc='left', fontsize=12, fontweight='bold')
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    if i == 0: ax.legend(loc='upper right')

# The x axis is shared, so only the bottom panel is labelled.
axes[-1].set_xlabel("Timeline")
plt.tight_layout()
plt.show()

for zone, dates in changepoint_dict.items():
    print(f"{zone}: {len(dates)} events found.")

In [None]:
from datetime import timedelta
# Flatten the per-zone changepoints into one chronologically ordered list of
# {'date', 'zone'} events.
all_events = sorted(
    (
        {'date': d, 'zone': zone}
        for zone, dates in changepoint_dict.items()
        for d in dates
    ),
    key=lambda event: event['date'],
)

# Bucket the events: an event joins the current bucket while it lies within
# 7 days of that bucket's FIRST event; otherwise it opens a new bucket.
event_groups = []
for event in all_events:
    if event_groups and (event['date'] - event_groups[-1][0]['date']).days <= 7:
        event_groups[-1].append(event)
    else:
        event_groups.append([event])

# A bucket counts as a strong changepoint when at least two different zones
# appear in it; it is represented by the date of its first event.
strong_changepoints = [
    group[0]['date']
    for group in event_groups
    if len({member['zone'] for member in group}) >= 2
]

if not strong_changepoints:
    print("No global events detected with the current 7-day threshold.")

In [None]:
import pandas as pd

def generate_prediction_df(df, changepoints, short_window=3, long_window=21, noise_threshold=50,
                           supply_col='2_Hormuz', demand_col='6_Singapore'):
    """Turn changepoint dates into directional signals from two zone series.

    For each changepoint the mean of ``supply_col`` and ``demand_col`` over
    the trailing ``short_window`` days is compared with the mean over the
    trailing ``long_window`` days (both windows end on, and include, the
    changepoint date). The signal is based on
    ``net_flow = demand_delta - supply_delta``.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily table indexed by date, containing ``supply_col`` and
        ``demand_col``. Label slicing is used, so the index must support it.
    changepoints : iterable
        Dates to evaluate; must support subtraction of a ``pd.Timedelta``.
    short_window, long_window : int
        Look-back lengths in days.
    noise_threshold : float
        ``net_flow`` must exceed this in absolute value to produce a signal.
    supply_col, demand_col : hashable
        Columns used for the two momentum terms.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` with columns ``Supply_Momentum``,
        ``Demand_Momentum``, ``Net_Flow`` (rounded to 2 decimals) and
        ``Signal`` (1 or -1). Changepoints without a full long window of
        history, and those with a zero signal, are omitted. The columns are
        present even when no row remains.
    """
    earliest = df.index.min()
    long_span = pd.Timedelta(days=long_window)
    short_span = pd.Timedelta(days=short_window)

    prediction_data = []
    for date in changepoints:
        long_start = date - long_span
        # Skip changepoints that do not have a full long window behind them.
        if long_start < earliest:
            continue

        long_data = df.loc[long_start:date]
        short_data = df.loc[date - short_span:date]

        supply_delta = short_data[supply_col].mean() - long_data[supply_col].mean()
        demand_delta = short_data[demand_col].mean() - long_data[demand_col].mean()
        net_flow = demand_delta - supply_delta

        if net_flow > noise_threshold:
            pred = 1
        elif net_flow < -noise_threshold:
            pred = -1
        else:
            pred = 0

        prediction_data.append({
            'Date': date,
            'Supply_Momentum': round(supply_delta, 2),
            'Demand_Momentum': round(demand_delta, 2),
            'Net_Flow': round(net_flow, 2),
            'Signal': pred
        })

    # Naming the columns up front gives an empty result the same schema as a
    # populated one.
    pred_df = pd.DataFrame(
        prediction_data,
        columns=['Date', 'Supply_Momentum', 'Demand_Momentum', 'Net_Flow', 'Signal'],
    ).set_index('Date')

    return pred_df[pred_df['Signal'] != 0]

# Score every multi-zone changepoint against the cleaned daily counts.
df_predictions = generate_prediction_df(
    df=summary_df,
    changepoints=strong_changepoints,
    noise_threshold=50,
)




In [None]:
# --- CONFIGURATION ---
STOP_LOSS_PCT = 0.02             # open loss (fraction of entry price) that closes the trade
TRAILING_PROPORTION = 0.6        # share of the best run-up used as the trailing floor
TRAILING_LIMIT_PCT = 0.03        # once the run-up reaches this, the floor is at least this level
TRAILING_ACTIVATION_PCT = 0.005  # run-up that must be exceeded before a trailing exit can fire

def simulate_trades(price_df, signals_df, stop_loss=STOP_LOSS_PCT, trailing_prop=TRAILING_PROPORTION,
                    trailing_limit=TRAILING_LIMIT_PCT, trailing_activation=TRAILING_ACTIVATION_PCT):
    """Simulate one trade per signal on a series of closing prices.

    Each signal opens a position at the ``Close`` of its date and is then
    walked forward bar by bar, starting with the bar after entry, until one of
    the exit rules fires or the price data ends. Trades are evaluated
    independently of each other.

    Exit rules, checked in this order on every bar:
      1. Stop loss: current P&L is at or below ``-stop_loss``.
      2. Trailing exit: the best run-up so far exceeds
         ``trailing_activation`` and current P&L is at or below the floor,
         where the floor is ``run-up * trailing_prop`` and, once the run-up
         has reached ``trailing_limit``, at least ``trailing_limit``.

    Parameters
    ----------
    price_df : pandas.DataFrame
        Price table with a ``Close`` column, indexed by date.
    signals_df : pandas.DataFrame
        Table with a ``Signal`` column, indexed by date; ``1`` is treated as
        long, any other value as short. Dates missing from ``price_df`` and
        dates with no later price bar are skipped.
    stop_loss, trailing_prop, trailing_limit, trailing_activation : float
        Rule parameters, expressed as fractions of the entry price.

    Returns
    -------
    pandas.DataFrame
        One row per simulated trade with columns ``Entry Date``, ``Signal``
        ('Long'/'Short'), ``Entry Price``, ``Exit Date``, ``Exit Price``,
        ``Max RunUp``, ``Return`` and ``Reason``. Empty when no trade ran.
    """
    trades = []
    for date, row in signals_df.sort_index().iterrows():
        if date not in price_df.index:
            continue
        entry_price = price_df.loc[date, 'Close']
        signal = row['Signal']
        direction = 1 if signal == 1 else -1

        # Bars strictly after the entry bar.
        future_prices = price_df.loc[date:, 'Close'].iloc[1:]
        if future_prices.empty:
            continue

        max_pnl_pct = 0.0
        for curr_date, curr_price in future_prices.items():
            curr_pnl = direction * (curr_price - entry_price) / entry_price
            if curr_pnl > max_pnl_pct:
                max_pnl_pct = curr_pnl

            # NOTE(review): on a stop or trailing exit the recorded Return is
            # the stop/floor level while Exit Price is this bar's close, so
            # the two can disagree when the close overshoots the level.
            # Confirm this fill assumption is intended.
            if curr_pnl <= -stop_loss:
                exit_reason = f"Stop Loss (-{stop_loss:.1%})"
                pnl_pct = -stop_loss
                break

            floor = -1.0
            if max_pnl_pct > 0:
                floor = max(floor, max_pnl_pct * trailing_prop)
            if max_pnl_pct >= trailing_limit:
                floor = max(floor, trailing_limit)
            if max_pnl_pct > trailing_activation and curr_pnl <= floor:
                exit_reason = "Trailing Exit (Prop/Limit Rule)"
                pnl_pct = floor
                break
        else:
            # No rule fired: mark to the last available close.
            exit_reason = "End of Data"
            pnl_pct = curr_pnl

        # Whether the loop broke or ran out, the last visited bar is the exit.
        exit_date, exit_price = curr_date, curr_price

        trades.append({
            'Entry Date': date,
            'Signal': 'Long' if signal == 1 else 'Short',
            'Entry Price': entry_price,
            'Exit Date': exit_date,
            'Exit Price': exit_price,
            'Max RunUp': max_pnl_pct,
            'Return': pnl_pct,
            'Reason': exit_reason
        })

    return pd.DataFrame(trades)


# NOTE(review): `uso_df` and `valid_signals` are not assigned in any cell
# visible in this notebook - confirm where they are created before relying on
# a fresh "Restart & Run All".
trade_results = simulate_trades(uso_df, valid_signals)

if trade_results.empty:
    print("No trades generated.")
else:
    trade_returns = trade_results['Return']

    # Headline statistics over all simulated trades.
    win_rate = int((trade_returns > 0).sum()) / len(trade_results)
    avg_return = trade_returns.mean()
    total_return_simple = trade_returns.sum()

    # Growth of 1 unit when the trades are compounded in table order.
    trade_results['Equity_Curve'] = trade_returns.add(1).cumprod()
    compounded_total = trade_results['Equity_Curve'].iloc[-1] - 1

    print("--- PERFORMANCE SUMMARY ---")
    print(f"Total Trades: {len(trade_results)}")
    print(f"Win Rate: {win_rate:.1%}")
    print(f"Average Return per Trade: {avg_return:.2%}")
    print(f"Simple Total Return (Sum): {total_return_simple:.2%}")
    print(f"Compounded Total Return: {compounded_total:.2%}")

    print("\n--- RECENT TRADES ---")
    print(trade_results[['Entry Date', 'Signal', 'Return', 'Reason']].tail())

In [None]:
import matplotlib.dates as mdates

if trade_results.empty:
    # The date range and curves below need at least one trade.
    print("No trades to plot.")
else:
    # Daily calendar from the first entry to the last exit.
    full_idx = pd.date_range(start=trade_results['Entry Date'].min(), end=trade_results['Exit Date'].max(), freq='D')
    strategy_curve = pd.Series(index=full_idx, data=np.nan)
    current_equity = 1.0
    strategy_curve.iloc[0] = current_equity

    sorted_trades = trade_results.sort_values('Exit Date')

    # Equity steps only on a trade's exit date; the days in between carry the
    # last value forward.
    for _, row in sorted_trades.iterrows():
        current_equity = current_equity * (1 + row['Return'])
        if row['Exit Date'] in strategy_curve.index:
            strategy_curve.loc[row['Exit Date']] = current_equity

    strategy_curve = strategy_curve.ffill()

    # Buy-and-hold benchmark over the same period, rebased to 1.
    benchmark_curve = uso_df.loc[strategy_curve.index[0]:strategy_curve.index[-1], 'Close']
    benchmark_curve = benchmark_curve / benchmark_curve.iloc[0]

    # Drawdown relative to the running peak of the strategy curve.
    rolling_max = strategy_curve.cummax()
    drawdown = (strategy_curve - rolling_max) / rolling_max

    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['grid.alpha'] = 0.3

    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 12),
                                   gridspec_kw={'height_ratios': [3, 1]}, sharex=True)
    plt.subplots_adjust(hspace=0.05)
    ax1.plot(strategy_curve.index, strategy_curve, color='#00a65a', linewidth=2.5, label='Strategy (Shipping Signals)')
    ax1.fill_between(strategy_curve.index, strategy_curve, 1, color='#00a65a', alpha=0.1) # Subtle fill for aesthetic

    ax1.plot(benchmark_curve.index, benchmark_curve, color='#555555', linestyle='--', linewidth=1.5, alpha=0.7, label='USO Buy & Hold')

    final_ret = strategy_curve.iloc[-1] - 1
    # The '+' format flag prints the real sign; a literal "+" prefix would
    # label a loss as "+-5.0%".
    ax1.text(strategy_curve.index[-1], strategy_curve.iloc[-1], f" {final_ret:+.1%}",
             color='#00a65a', fontweight='bold', fontsize=12, va='center')

    # `win_rate` and `avg_return` come from the performance-summary cell.
    stats_text = (
        f"Total Return: {final_ret:.1%}\n"
        f"Win Rate:     {win_rate:.1%}\n"
        f"Trades:       {len(trade_results)}\n"
        f"Avg Return:   {avg_return:.2%}"
    )
    props = dict(boxstyle='round', facecolor='white', alpha=0.9, edgecolor='lightgray')
    ax1.text(0.02, 0.95, stats_text, transform=ax1.transAxes, fontsize=11,
             verticalalignment='top', bbox=props)

    ax1.set_ylabel("Account Value (Growth of $1)", fontsize=12)
    ax1.set_title("Strategy Performance vs. Benchmark", fontsize=16, fontweight='bold', pad=15)
    ax1.legend(loc='upper left', bbox_to_anchor=(0.02, 0.75), frameon=True)
    ax1.grid(True)
    ax2.plot(drawdown.index, drawdown, color='#d9534f', linewidth=1)
    ax2.fill_between(drawdown.index, drawdown, 0, color='#d9534f', alpha=0.3)
    ax2.set_ylabel("Drawdown", fontsize=12)
    ax2.set_xlabel("Date", fontsize=12)
    ax2.grid(True)
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%b'))
    ax2.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    ax2.spines['top'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)

    plt.show()

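Monte Carlo simulation: resample the trade returns with replacement to estimate the range of possible equity outcomes.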

In [None]:
def run_monte_carlo_simulation(df_trades, num_simulations=2000, initial_equity=1.0, seed=None):
    """Bootstrap the trade returns, plot the simulated equity paths and print a summary.

    Each simulation draws ``len(df_trades)`` returns with replacement from
    ``df_trades['Return']`` and compounds them from ``initial_equity``.

    Parameters
    ----------
    df_trades : pandas.DataFrame or None
        Trade table with a ``Return`` column. ``None`` or an empty frame
        prints an error message and returns without simulating.
    num_simulations : int
        Number of resampled equity paths.
    initial_equity : float
        Starting value of every path.
    seed : int or None
        When given, a dedicated generator seeded with this value is used so
        the run is repeatable. When ``None`` the global ``np.random`` state
        is used.

    Returns
    -------
    None
        Results are shown as a matplotlib figure and printed text.
    """
    if df_trades is None or df_trades.empty:
        print(" Error: Trade DataFrame is empty or None.")
        return

    returns = df_trades['Return'].values
    n_trades = len(returns)
    rng = np.random if seed is None else np.random.default_rng(seed)
    simulated_equity = _bootstrap_equity_curves(returns, num_simulations, initial_equity, rng)

    # Summary statistics of the final equity across simulations.
    final_values = simulated_equity[:, -1]
    prob_loss = np.sum(final_values < initial_equity) / num_simulations
    var_95 = np.percentile(final_values, 5)
    worst_case = np.min(final_values)

    trade_axis = range(n_trades)
    plt.figure(figsize=(14, 8))
    # Only the first 100 paths are drawn individually.
    plt.plot(trade_axis, simulated_equity[:100, :].T, color='yellow', alpha=0.1)

    mean_curve = np.mean(simulated_equity, axis=0)
    plt.plot(trade_axis, mean_curve, color='#1f77b4', linewidth=2.5, label=f'Mean Expectation ({mean_curve[-1]:.2f}x)')
    p95_curve = np.percentile(simulated_equity, 95, axis=0)
    plt.plot(trade_axis, p95_curve, color='green', linestyle='--', linewidth=2, label='95th Percentile (Lucky Run)')
    p05_curve = np.percentile(simulated_equity, 5, axis=0)
    plt.plot(trade_axis, p05_curve, color='red', linestyle='--', linewidth=2, label=f'5th Percentile (Unlucky Run: {p05_curve[-1]:.2f}x)')
    plt.title(f"Monte Carlo Simulation: {num_simulations} Possible Futures", fontsize=16, fontweight='bold')
    plt.xlabel("Trade Count", fontsize=12)
    plt.ylabel("Equity Growth (Start = $1.0)", fontsize=12)
    plt.axhline(initial_equity, color='black', linestyle=':', linewidth=1)
    plt.legend(loc='upper left', fontsize=11, frameon=True, fancybox=True, framealpha=0.9)
    plt.grid(True, linestyle='--', alpha=0.4)
    plt.show()

    print(f"\n--- MONTE CARLO RESULTS ({num_simulations} Runs) ---")
    print(f"Base Trades Analyzed:   {n_trades}")
    print(f"Win Rate (Original):    {np.sum(returns > 0) / n_trades:.1%}")
    print("-" * 40)
    print(f"Probability of Profit:  {1 - prob_loss:.1%} (Chance you make money)")
    print(f"Probability of Loss:    {prob_loss:.1%} (Chance you lose money)")
    print("-" * 40)
    print(f"Expected Multiplier:    {np.mean(final_values):.2f}x  (Avg Outcome)")
    print(f"Worst Case (Simulated): {worst_case:.2f}x")
    print(f"Risk Floor (95% Conf):  {var_95:.2f}x (You beat this 95% of the time)")


def _bootstrap_equity_curves(returns, num_simulations, initial_equity, rng):
    """Return a (num_simulations, len(returns)) array of compounded, resampled equity paths."""
    sampled = rng.choice(returns, size=(num_simulations, len(returns)), replace=True)
    return np.cumprod(1 + sampled, axis=1) * initial_equity

# Choose the trade table to resample. Prefer a non-empty result that already
# exists in the kernel; otherwise try to rebuild the trades from the price
# data and whichever signal table is available.
namespace = globals()
target_df = None

for candidate_name in ('final_trades', 'trade_results'):
    candidate = namespace.get(candidate_name)
    if candidate is not None and not candidate.empty:
        target_df = candidate
        print(f" Found '{candidate_name}'. Running simulation...")
        break
else:
    print("Could not find trade data variable. Regenerating trades now...")
    if 'uso_df' in namespace:
        for signals_name in ('df_signals', 'valid_signals'):
            if signals_name in namespace:
                # Use the configured rule parameters so regenerated trades
                # follow the same rules as the main simulation (the trailing
                # limit was previously hard-coded to a different value here).
                target_df = simulate_trades(uso_df, namespace[signals_name],
                                            stop_loss=STOP_LOSS_PCT,
                                            trailing_prop=TRAILING_PROPORTION,
                                            trailing_limit=TRAILING_LIMIT_PCT)
                break

if target_df is not None:
    run_monte_carlo_simulation(target_df)
else:
    print("Critical Error: Could not find inputs (uso_df, signals) to regenerate trades.")