In [119]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import plotly.io as pio

pd.options.plotting.backend = 'plotly'
pio.templates.default = 'plotly_dark'
pd.set_option('display.float_format', '{:.3f}'.format)

### Fubon Real Transaction Statistics

In [None]:
data = pd.read_pickle('../data/fubon_transactions_real.pkl')

In [None]:
df = pd.DataFrame(data)
# Export transaction data to `.csv`
df['buy_val'] = df['buy'] * df['price']
df['sell_val'] = df['sell'] * df['price']
df['net_flow'] = df['buy_val'] - df['sell_val']
(df.groupby('date')[['net_flow']]
 .sum()
 .reset_index()
 .rename(columns={
     'date': 'Date',
    'net_flow': 'primary_flow_vnd'
 })
 .to_csv('../data/transaction_data.csv', index=False))

In [122]:
df.groupby('date')['ticker'].count().plot(title='Number of traded stock - Daily')

In [123]:
df.groupby('date')['value_million'].sum().mul(1_000_000).plot(title='Total Transaction Value - Daily')

In [124]:
net_flow = df.groupby('date').apply(lambda df: (df['buy']*df['price']).sum() - (df['sell']*df['price']).sum())
net_flow.plot(title='Net Flow - Daily')

In [125]:
daily_stock_traded = df.groupby('date')['ticker'].count()

In [126]:
daily_stock_traded.hist()

In [129]:
daily_stock_traded.describe()

count   335.000
mean      9.543
std       3.926
min       1.000
25%       6.000
50%      11.000
75%      13.000
max      24.000
Name: ticker, dtype: float64

In [130]:
net_flow.hist()

In [131]:
dates = [
    "2024-09-04",
    "2024-09-05",
    "2024-09-06",
    "2024-09-09",
    "2024-09-10",
    "2024-09-11",
    "2024-09-12",
    "2024-09-13",
    "2024-09-16",
    "2024-09-17",
    "2024-09-18",
    "2024-09-19",
    "2024-09-20",
    "2024-09-23",
    "2024-09-24",
    "2024-09-25",
    "2024-09-26",
    "2024-09-27",
    "2024-09-30",
    "2025-07-28"
]

### Primary Flow Summary

In [132]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

def compute_numlot_summary(similarities=[10, 15, 20, 25, 30, 35, 40], hpd_threshold=0.75):
    records = []

    for sim_thresh in similarities:
        lot_df = (
            dd.read_csv(f'../data/lot_data/based_on_real_transaction/{sim_thresh}/*.csv')
            .drop_duplicates(subset=['ts', 'stock', 'side', 'num_lot'])
            .reset_index(drop=True)
            .compute()
        )

        q = lot_df['num_lot'].quantile([0.001, 0.999])
        q001, q999 = float(q.loc[0.001]), float(q.loc[0.999])
        print(f'Confidence Interval [0.1%, 99.9%] for {sim_thresh}% similarity: ({q001:.6g}, {q999:.6g})')

        num_lot = lot_df.loc[lot_df['num_lot'].between(q001, q999), 'num_lot']
        min_lot, max_lot = float(num_lot.min()), float(num_lot.max())
        bins = np.logspace(np.log10(min_lot), np.log10(max_lot), 10)
        num_lot_log_bin = pd.cut(num_lot, bins)
        vc = num_lot_log_bin.value_counts().sort_index()

        print('Numlot Log-binned num_lot')
        print(vc.to_string())

        total = vc.sum()
        best_range = (None, None)
        best_count = 0
        min_width = np.inf

        for i in range(len(vc)):
            for j in range(i, len(vc)):
                subset_count = vc.iloc[i:j+1].sum()
                portion = subset_count / total
                if portion >= hpd_threshold:
                    width = bins[j+1] - bins[i]
                    if width < min_width:
                        min_width = width
                        best_range = (bins[i], bins[j+1])
                        best_count = subset_count
                    break

        if best_range[0] is not None:
            lower, upper = best_range
            print(
                f'Range of bins containing >={hpd_threshold*100:.0f}% of data: '
                f'({lower:.6g}, {upper:.6g}) '
                f'with {best_count:,} observations '
                f'(~{best_count/total*100:.2f}% of total)'
            )
            records.append({
                "similarity": sim_thresh,
                "lower_bound": lower,
                "upper_bound": upper,
                "obs_in_range": int(best_count),
                "total_obs": int(total)
            })
        else:
            print(f'No {hpd_threshold*100:.0f}% cumulative range found.')
            records.append({
                "similarity": sim_thresh,
                "lower_bound": np.nan,
                "upper_bound": np.nan,
                "obs_in_range": 0,
                "total_obs": int(total)
            })

        print(f'Total observations in distribution: {total:,}\n')

    summary_df = pd.DataFrame(records)
    return summary_df


In [133]:
summary_df = compute_numlot_summary(hpd_threshold=0.8)

Confidence Interval [0.1%, 99.9%] for 10% similarity: (0.038713, 3.48627)
Numlot Log-binned num_lot
num_lot
(0.042, 0.0686]     222
(0.0686, 0.112]     635
(0.112, 0.183]      548
(0.183, 0.299]     1118
(0.299, 0.489]     1036
(0.489, 0.799]      660
(0.799, 1.305]      287
(1.305, 2.133]      105
(2.133, 3.484]       33
Range of bins containing >=80% of data: (0.0686209, 0.798902) with 3,997 observations (~86.07% of total)
Total observations in distribution: 4,644

Confidence Interval [0.1%, 99.9%] for 15% similarity: (0.0305, 3.88918)
Numlot Log-binned num_lot
num_lot
(0.0305, 0.0521]      21
(0.0521, 0.0889]    1553
(0.0889, 0.152]     2175
(0.152, 0.259]       930
(0.259, 0.443]       974
(0.443, 0.756]       827
(0.756, 1.291]       362
(1.291, 2.204]       119
(2.204, 3.763]        35
Range of bins containing >=80% of data: (0.0520787, 0.442693) with 5,632 observations (~80.50% of total)
Total observations in distribution: 6,996

Confidence Interval [0.1%, 99.9%] for 20% similar

In [134]:
summary_df

Unnamed: 0,similarity,lower_bound,upper_bound,obs_in_range,total_obs
0,10,0.069,0.799,3997,4644
1,15,0.052,0.443,5632,6996
2,20,0.042,0.364,9244,11327
3,25,0.048,0.569,17905,21416
4,30,0.047,0.621,27754,33917
5,35,0.046,0.701,42265,50586
6,40,0.027,0.454,55078,68349


In [135]:
summary_df[['lower_bound', 'upper_bound']].agg({
    "lower_bound": "min",
    "upper_bound": "max",
}
)

lower_bound   0.027
upper_bound   0.799
dtype: float64

In [138]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import plotly.express as px

def compute_numlot_summary_and_plot(similarities=[10, 15, 20, 25, 30, 35, 40], hpd_threshold=0.75):
    records = []
    plot_records = []

    for sim_thresh in similarities:
        lot_df = (
            dd.read_csv(f'../data/lot_data/based_on_real_transaction/{sim_thresh}/*.csv')
            .drop_duplicates(subset=['ts', 'stock', 'side', 'num_lot'])
            .reset_index(drop=True)
            .compute()
        )

        q = lot_df['num_lot'].quantile([0.001, 0.999])
        q001, q999 = float(q.loc[0.001]), float(q.loc[0.999])
        num_lot = lot_df.loc[lot_df['num_lot'].between(q001, q999), 'num_lot']

        min_lot, max_lot = float(num_lot.min()), float(num_lot.max())
        bins = np.logspace(np.log10(min_lot), np.log10(max_lot), 10)
        num_lot_log_bin = pd.cut(num_lot, bins)
        vc = num_lot_log_bin.value_counts().sort_index()

        # Lưu dữ liệu cho biểu đồ
        for interval, count in vc.items():
            plot_records.append({
                'similarity': sim_thresh,
                'bin_left': interval.left,
                'bin_right': interval.right,
                'count': count,
                'bin_label': f"({interval.left:.3g}, {interval.right:.3g}]"
            })

        # Phần summary (HPD range)
        total = vc.sum()
        best_range = (None, None)
        best_count = 0
        min_width = np.inf

        for i in range(len(vc)):
            for j in range(i, len(vc)):
                subset_count = vc.iloc[i:j+1].sum()
                portion = subset_count / total
                if portion >= hpd_threshold:
                    width = bins[j+1] - bins[i]
                    if width < min_width:
                        min_width = width
                        best_range = (bins[i], bins[j+1])
                        best_count = subset_count
                    break

        if best_range[0] is not None:
            lower, upper = best_range
            records.append({
                "similarity": sim_thresh,
                "lower_bound": lower,
                "upper_bound": upper,
                "obs_in_range": int(best_count),
                "total_obs": int(total)
            })

    summary_df = pd.DataFrame(records)
    plot_df = pd.DataFrame(plot_records)

    # Vẽ plotly: biểu đồ cột cho từng similarity
    fig = px.bar(
        plot_df,
        x="bin_label",
        y="count",
        color="similarity",
        barmode="group",
        title="Distribution of num_lot (log-binned) by Similarity",
        labels={"bin_label": "num_lot bin (log scale)", "count": "Count"}
    )
    fig.update_xaxes(tickangle=45)
    fig.show()

    return summary_df, plot_df


In [141]:
summary_df, plot_df = compute_numlot_summary_and_plot()

### Tuning Results Visualization

In [145]:
import pandas as pd
import os

def compare_fubon_flows(folder_name):
    # Base paths
    base_path = "../results"
    actual_data_path = "../data/transaction_data.csv"
    result_folder = os.path.join(base_path, folder_name)

    # Fixed date list
    dates = [ "2024-09-04", "2024-09-05", "2024-09-06", "2024-09-09", "2024-09-10", "2024-09-11", "2024-09-12", "2024-09-13", "2024-09-16", "2024-09-18", "2024-09-19", "2024-09-20", "2024-09-23", "2024-09-24", "2024-09-25", "2024-09-26", "2024-09-27", "2024-09-30", "2025-07-28"]

    # Load actual transaction data
    df_actual = pd.read_csv(actual_data_path)
    records = []

    for date in dates:
        try:
            file_path = os.path.join(result_folder, f"{date}.csv")

            # Check if result file exists
            if not os.path.exists(file_path):
                print(f"⚠️ Missing file for date: {date}")
                records.append({'Date': date, 'predicted': None, 'true': None})
                continue

            # Load predicted data
            tmp = pd.read_csv(file_path)
            predicted_value = tmp['arbit_value'].sum() - tmp['unwind_value'].sum()

            # Extract actual data
            actual_rows = df_actual.query(f'Date == "{date}"')
            actual_value = actual_rows['primary_flow_vnd'].iloc[0] if not actual_rows.empty else None

            records.append({'Date': date, 'predicted': predicted_value, 'true': actual_value})

        except Exception as e:
            print(f"⚠️ {date}: {e}")
            records.append({'Date': date, 'predicted': None, 'true': None})

    df_result = pd.DataFrame(records)
    df_clean = df_result.dropna()

    # Display and plot
    display(df_clean)
    plot = df_clean.set_index('Date').plot(title=f'Fubon Flow Detection - True vs. Predicted ({folder_name})')

    return df_clean, plot


df_result, plot = compare_fubon_flows("ver3")
plot.show()

Unnamed: 0,Date,predicted,true
0,2024-09-04,-82518870000.0,-34391885051.0
1,2024-09-05,-70422695000.0,-104535149830.0
2,2024-09-06,-10121365000.0,-17501589923.0
3,2024-09-09,-87224330000.0,-85565450065.0
5,2024-09-11,-1491150000.0,-40393990139.0
6,2024-09-12,45171620000.0,-29149839929.0
7,2024-09-13,-8553230000.0,-24180755064.0
8,2024-09-16,-6638535000.0,-9390455041.0
9,2024-09-18,-30815925000.0,-215812299827.0
11,2024-09-20,-637340000.0,30945295200.0


In [146]:
df_results, plot = compare_fubon_flows("two_stage_optimize")
plot.show()
(df_result['predicted'] - df_result['true']).dropna().abs().mean()

Unnamed: 0,Date,predicted,true
0,2024-09-04,-89674375000.0,-34391885051.0
1,2024-09-05,-116086565000.0,-104535149830.0
2,2024-09-06,9070380000.0,-17501589923.0
3,2024-09-09,-107651815000.0,-85565450065.0
5,2024-09-11,-20809690000.0,-40393990139.0
6,2024-09-12,50180870000.0,-29149839929.0
7,2024-09-13,-3872100000.0,-24180755064.0
8,2024-09-16,-13921335000.0,-9390455041.0
9,2024-09-18,-39447715000.0,-215812299827.0
11,2024-09-20,-14906255000.0,30945295200.0


np.float64(49089336666.61539)

In [147]:
from IPython.display import clear_output, display
import time

while True:
    df_results, fig = compare_fubon_flows("test_new_bucket")
    
    clear_output(wait=True) 
    display(fig) 
    display(df_results)
        
    if len(df_results) >= 13:
        print("✅ Done! len(df_results) =", len(df_results))
        break
    
    time.sleep(3)


Unnamed: 0,Date,predicted,true
0,2024-09-04,-106317925000.0,-34391885051.0
1,2024-09-05,-116526635000.0,-104535149830.0
2,2024-09-06,1693710000.0,-17501589923.0
3,2024-09-09,-93947055000.0,-85565450065.0
5,2024-09-11,-26777570000.0,-40393990139.0
6,2024-09-12,48690555000.0,-29149839929.0
7,2024-09-13,-2082975000.0,-24180755064.0
8,2024-09-16,-15861220000.0,-9390455041.0
9,2024-09-18,-43555680000.0,-215812299827.0
11,2024-09-20,-544070000.0,30945295200.0


✅ Done! len(df_results) = 13


In [148]:
from IPython.display import clear_output, display
import time

while True:
    df_results, fig = compare_fubon_flows("test_new_bucket_trial_1")
    
    clear_output(wait=True) 
    display(fig) 
    display(df_results)
        
    if len(df_results) >= 13:
        print("✅ Done! len(df_results) =", len(df_results))
        break
    
    time.sleep(3)


Unnamed: 0,Date,predicted,true
0,2024-09-04,-87078510000.0,-34391885051.0
1,2024-09-05,-117932600000.0,-104535149830.0
2,2024-09-06,18551960000.0,-17501589923.0
3,2024-09-09,-79969975000.0,-85565450065.0
5,2024-09-11,-13328245000.0,-40393990139.0
6,2024-09-12,34407040000.0,-29149839929.0
7,2024-09-13,-17334195000.0,-24180755064.0
8,2024-09-16,-28484675000.0,-9390455041.0
9,2024-09-18,-40472415000.0,-215812299827.0
11,2024-09-20,5308935000.0,30945295200.0


✅ Done! len(df_results) = 13


### Bucket Distribution Function Modeling

#### No first_center

In [53]:
import numpy as np
from typing import Dict, List

def generate_non_overlap_distribution(
    start: float = 0.5,
    stop: float = 100.0,
    num: int = 40,
    step: float = 0.5,
    decimals: int = 3,
    growth: float = 1.0,
    int_threshold: float = 3.0,
) -> Dict[float, List[float]]:
    if start <= 0 or stop <= 0 or stop <= start:
        raise ValueError("start and stop must be positive and stop > start.")
    if num < 2:
        raise ValueError("num must be >= 2.")
    if growth <= 0:
        raise ValueError("growth must be > 0.")

    t = np.linspace(0.0, 1.0, num)
    w = t ** growth
    log_start, log_stop = np.log(start), np.log(stop)
    raw = np.exp(log_start + (log_stop - log_start) * w)
    rounded = np.round(raw / step) * step
    rounded = np.clip(rounded, start, stop)
    lots = np.unique(rounded.astype(float))
    lots[-1] = float(round(stop, decimals))

    snapped = []
    for v in lots:
        if v >= int_threshold:
            v = round(v)
        snapped.append(float(v))
    lots = np.array(snapped, dtype=float)

    eps = 1e-9
    filtered = []
    last = None
    for v in lots:
        if last is None or v > last + eps:
            filtered.append(v)
            last = v
    if filtered[-1] < stop - eps:
        filtered.append(stop)

    lots = [round(float(x), decimals) for x in filtered]
    lots[-1] = round(stop, decimals)

    distribution: Dict[float, List[float]] = {}
    for i, lot in enumerate(lots):
        if i == 0:
            lower = round(start, decimals)
            upper = round((lots[i] + lots[i+1]) / 2, decimals)
        elif i == len(lots) - 1:
            lower = round((lots[i-1] + lots[i]) / 2, decimals)
            upper = round(stop, decimals)
        else:
            lower = round((lots[i-1] + lots[i]) / 2, decimals)
            upper = round((lots[i] + lots[i+1]) / 2, decimals)
        distribution[lot] = [lower, upper]
    return distribution

generate_non_overlap_distribution(
    start=0.1,
    stop=100,
    num=22,
    step=0.2,
    decimals=3,
    growth=1.1167,
    int_threshold=7
)

{0.2: [0.1, 0.3],
 0.4: [0.3, 0.5],
 0.6: [0.5, 0.7],
 0.8: [0.7, 0.9],
 1.0: [0.9, 1.2],
 1.4: [1.2, 1.7],
 2.0: [1.7, 2.4],
 2.8: [2.4, 3.4],
 4.0: [3.4, 4.9],
 5.8: [4.9, 6.9],
 8.0: [6.9, 9.5],
 11.0: [9.5, 13.5],
 16.0: [13.5, 19.5],
 23.0: [19.5, 28.5],
 34.0: [28.5, 41.0],
 48.0: [41.0, 58.5],
 69.0: [58.5, 84.5],
 100: [84.5, 100]}

#### Add first_center

In [109]:
import numpy as np
from typing import Dict, List

def generate_non_overlap_distribution(
    start: float = 0.5,
    stop: float = 100.0,
    num: int = 40,
    step: float = 0.5,
    decimals: int = 3,
    growth: float = 1.0,
    int_threshold: float = 3.0,
    first_center: float = None,
) -> Dict[float, List[float]]:
    if start <= 0 or stop <= 0 or stop <= start:
        raise ValueError("start and stop must be positive and stop > start.")
    if num < 2:
        raise ValueError("num must be >= 2.")
    if growth <= 0:
        raise ValueError("growth must be > 0.")

    t = np.linspace(0.0, 1.0, num)
    w = t ** growth
    log_start, log_stop = np.log(start), np.log(stop)
    raw = np.exp(log_start + (log_stop - log_start) * w)
    rounded = np.round(raw / step) * step
    rounded = np.clip(rounded, start, stop)
    lots = np.unique(rounded.astype(float))
    lots[-1] = float(round(stop, decimals))

    snapped = []
    for v in lots:
        if v >= int_threshold:
            v = round(v)
        snapped.append(float(v))
    lots = np.array(snapped, dtype=float)

    eps = 1e-9
    filtered = []
    last = None
    for v in lots:
        if last is None or v > last + eps:
            filtered.append(v)
            last = v
    if filtered[-1] < stop - eps:
        filtered.append(stop)

    if first_center is not None:
        fc = float(max(first_center, start))
        filtered = [x for x in filtered if x >= fc - eps]
        if not filtered or filtered[0] > fc + eps:
            filtered.insert(0, fc)
        else:
            filtered[0] = fc

    lots = [round(float(x), decimals) for x in filtered]
    lots[-1] = round(stop, decimals)

    clean = []
    last = None
    for v in lots:
        if last is None or v > last + eps:
            clean.append(v)
            last = v
    lots = clean

    distribution: Dict[float, List[float]] = {}
    for i, lot in enumerate(lots):
        if i == 0:
            lower = round(start, decimals)
            upper = round((lots[i] + lots[i+1]) / 2, decimals)
        elif i == len(lots) - 1:
            lower = round((lots[i-1] + lots[i]) / 2, decimals)
            upper = round(stop, decimals)
        else:
            lower = round((lots[i-1] + lots[i]) / 2, decimals)
            upper = round((lots[i] + lots[i+1]) / 2, decimals)
        distribution[lot] = [lower, upper]
    return distribution


generate_non_overlap_distribution(
    start=0.02,
    stop=64,
    num=56,
    step=0.45,
    decimals=3,
    growth=1.4014,
    int_threshold=5,
    first_center=0.25
)

{0.25: [0.02, 0.35],
 0.45: [0.35, 0.675],
 0.9: [0.675, 1.125],
 1.35: [1.125, 1.575],
 1.8: [1.575, 2.025],
 2.25: [2.025, 2.7],
 3.15: [2.7, 3.375],
 3.6: [3.375, 3.825],
 4.05: [3.825, 4.5],
 4.95: [4.5, 5.475],
 6.0: [5.475, 6.5],
 7.0: [6.5, 8.0],
 9.0: [8.0, 10.0],
 11.0: [10.0, 12.0],
 13.0: [12.0, 14.5],
 16.0: [14.5, 17.5],
 19.0: [17.5, 21.0],
 23.0: [21.0, 25.5],
 28.0: [25.5, 31.5],
 35.0: [31.5, 39.0],
 43.0: [39.0, 47.5],
 52.0: [47.5, 58.0],
 64: [58.0, 64]}