In [1]:
# ====================================================
# Cell 1: Setup & Imports
# ====================================================
import os, glob
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import mlflow




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\conda\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\conda\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\conda\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "d:\conda\Lib\site-packages\tornado\platform\asyncio.py", line 211, in start
    self.a

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [None]:
# Enable MLflow autologging; this runs before any model is built or trained further down.
mlflow.autolog()  # Enable MLflow autologging

In [None]:
# Paths
# NOTE(review): both locations are absolute, machine-specific Windows paths; anyone running
# the notebook elsewhere has to edit them here.
BASE_PATH = r"D:\hack\impostor_detection\impostor_detection\data\raw\sapimouse\sapimouse"
PROCESSED_PATH = r"D:\hack\impostor_detection\impostor_detection\data\processed"

os.makedirs(PROCESSED_PATH, exist_ok=True)

# Collect user directories: only real sub-directories whose name starts with "user".
# Without the isdir() check a stray file such as "users.csv" would be treated as a user folder.
# The sort is lexicographic, so "user10" comes before "user2".
user_dirs = sorted(
    entry
    for entry in os.listdir(BASE_PATH)
    if entry.lower().startswith("user") and os.path.isdir(os.path.join(BASE_PATH, entry))
)
print(f"Found {len(user_dirs)} users")

In [2]:
# ====================================================
# Cell 2: Utility Functions
# ====================================================
def read_session_csv(path, user_id_from_folder):
    """Read one session CSV and return it with a fixed column layout.

    Three layouts are recognised:

    * a combined ``"client timestamp"`` column whose cells are either
      ``"<client> <timestamp>"`` or just ``"<timestamp>"`` (decided per row);
    * a file that already has a ``client`` column;
    * anything else: the first five columns are taken positionally as
      timestamp, button, state, x, y.

    Whenever the file does not supply a client per row, ``user_id_from_folder`` is used.

    Parameters
    ----------
    path : path of the CSV file (passed to ``pandas.read_csv``).
    user_id_from_folder : value written to the ``client`` column when the file has none.

    Returns
    -------
    DataFrame with exactly the columns ``client, timestamp, button, state, x, y``.
    ``timestamp``, ``x`` and ``y`` are numeric; unparsable cells become NaN.

    Raises
    ------
    KeyError
        If one of the six expected columns cannot be derived from the file.
    """
    try:
        df = pd.read_csv(path, skipinitialspace=True, engine="python")
    except Exception:
        # Best-effort retry without a header row.
        df = pd.read_csv(path, header=None, engine="python")

    if "client timestamp" in df.columns:
        parts = (
            df["client timestamp"].astype(str).str.strip().str.split(r"\s+", n=1, expand=True)
        )
        if parts.shape[1] == 2:
            # Decide per row: a second token means "<client> <timestamp>", otherwise the
            # single token is the timestamp and the folder name supplies the client.
            has_client = parts[1].notna()
            df["client"] = parts[0].where(has_client, user_id_from_folder)
            df["timestamp"] = parts[1].where(has_client, parts[0])
        else:
            df["client"] = user_id_from_folder
            df["timestamp"] = parts[0]
        df = df.drop(columns=["client timestamp"])
    else:
        if "client" not in df.columns:
            # fallback: assume order: ts, button, state, x, y
            positional = ["timestamp", "button", "state", "x", "y"]
            df = df.rename(columns=dict(zip(df.columns[:5], positional)))
        df["client"] = user_id_from_folder

    required = ["client", "timestamp", "button", "state", "x", "y"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing expected column(s) {missing}")

    # Numeric columns are coerced in every layout so downstream sorting/diffing is well defined.
    for col in ("timestamp", "x", "y"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df[required]

def extract_features(df, user_id, idle_threshold_ms=500):
    """Summarise one session's mouse events as a flat feature dict.

    Parameters
    ----------
    df : DataFrame with at least ``timestamp``, ``state``, ``x`` and ``y`` columns.
        Events are sorted by ``timestamp`` before anything is computed.
    user_id : stored unchanged under ``"user_id"``.
    idle_threshold_ms : a gap between consecutive events strictly larger than this counts
        as idle time. Expressed in timestamp units (the feature names assume milliseconds
        -- TODO confirm against the dataset description).

    Returns
    -------
    dict with event counts (``n_events``, ``total_moves``, ``clicks`` = pressed + released,
    ``total_drags``), ``avg_speed_px_per_ms``, ``path_length_px``, ``idle_fraction`` and
    min/max/mean/std of ``x``, ``y`` and per-event speed.
    """
    df = df.sort_values("timestamp").reset_index(drop=True)

    # Counts. astype(str) keeps this working when the state column is not string-typed
    # (e.g. entirely NaN); such cells simply match none of the states.
    state = df["state"].astype(str).str.strip().str.lower()
    total_events = len(df)
    total_moves = (state == "move").sum()
    total_pressed = (state == "pressed").sum()
    total_released = (state == "released").sum()
    total_drags = (state == "drag").sum()
    clicks = total_pressed + total_released

    # Movement dynamics
    dx = df["x"].diff().fillna(0).astype(float)
    dy = df["y"].diff().fillna(0).astype(float)
    dt = df["timestamp"].diff().fillna(1).astype(float)
    dt = dt.mask(dt == 0, 1.0)  # simultaneous events: avoid dividing by zero
    dist = np.sqrt(dx**2 + dy**2)
    speed = dist / dt

    # Base stats
    avg_speed = speed.replace([np.inf, -np.inf], np.nan).fillna(0).mean()
    path_length = dist.sum()
    idle_time = dt[dt > idle_threshold_ms].sum()
    # max - min ignores NaN timestamps (which sort last); reading the last row directly
    # would give NaN and silently force idle_fraction to 0.
    timestamps = df["timestamp"]
    total_time = timestamps.max() - timestamps.min() if len(df) > 1 else 0
    idle_fraction = idle_time / total_time if total_time > 0 else 0

    # min/max/mean/std for coordinates and speed
    stats = {}
    for col, series in {"x": df["x"], "y": df["y"], "speed": speed}.items():
        stats[f"{col}_min"] = series.min()
        stats[f"{col}_max"] = series.max()
        stats[f"{col}_mean"] = series.mean()
        stats[f"{col}_std"] = series.std()

    return {
        "user_id": user_id,
        "n_events": total_events,
        "total_moves": total_moves,
        "clicks": clicks,
        "total_drags": total_drags,
        "avg_speed_px_per_ms": avg_speed,
        "path_length_px": path_length,
        "idle_fraction": idle_fraction,
        **stats
    }


In [3]:
# ====================================================
# Cell 3: Build Feature Tables
# ====================================================
def build_feature_table(session_list, label="1min"):
    """Build one feature row per session from ``(user, csv_path)`` pairs.

    Every file is read with ``read_session_csv`` and summarised with ``extract_features``;
    the file's base name is added under ``"session_file"``. ``label`` is only used in the
    progress-bar caption. Returns a DataFrame with one row per entry of ``session_list``.
    """
    def featurize(user, path):
        session = read_session_csv(path, user)
        return {**extract_features(session, user), "session_file": os.path.basename(path)}

    progress = tqdm(session_list, desc=f"{label} sessions")
    return pd.DataFrame([featurize(user, path) for user, path in progress])

# Collect files
# A session's length class is decided from the FILE NAME only; matching against the full
# path would misclassify every file if any parent directory contained "1min" or "3min".
all_1min, all_3min = [], []
for user in user_dirs:
    user_folder = os.path.join(BASE_PATH, user)
    # sorted(): glob order depends on the filesystem; sorting keeps row order reproducible.
    for fpath in sorted(glob.glob(os.path.join(user_folder, "*.csv"))):
        fname = os.path.basename(fpath).lower()
        if "1min" in fname:
            all_1min.append((user, fpath))
        elif "3min" in fname:
            all_3min.append((user, fpath))

df_1min = build_feature_table(all_1min, "1min")
df_3min = build_feature_table(all_3min, "3min")

print("Shapes:", df_1min.shape, df_3min.shape)
display(df_1min.head())


1min sessions: 100%|██████████| 122/122 [00:04<00:00, 27.33it/s]
3min sessions: 100%|██████████| 122/122 [00:09<00:00, 12.45it/s]

Shapes: (122, 21) (122, 21)





Unnamed: 0,user_id,n_events,total_moves,clicks,total_drags,avg_speed_px_per_ms,path_length_px,idle_fraction,x_min,x_max,...,x_std,y_min,y_max,y_mean,y_std,speed_min,speed_max,speed_mean,speed_std,session_file
0,user1,2055,1591,102,362,1.12443,38617.726413,0.0,128,1908,...,495.830753,69,880,447.439416,196.864942,0.0,12.846753,1.12443,1.890289,session_2020_05_14_1min.csv
1,user10,1760,1333,118,309,1.280491,37444.983442,0.093976,125,1808,...,496.700163,65,837,488.676705,208.288394,0.0,16.271911,1.280491,2.135903,session_2020_05_14_1min.csv
2,user100,2512,2119,168,225,1.293277,54045.074824,0.0,12,1410,...,329.411795,115,700,407.976911,144.142586,0.0,13.151499,1.293277,1.786868,session_2020_03_31_1min.csv
3,user101,1691,1417,67,207,0.616227,16718.778541,0.260148,53,1258,...,317.500771,67,563,363.34181,108.996939,0.0,12.547687,0.616227,1.140616,session_2020_06_08_1min.csv
4,user102,2236,1742,168,326,1.120358,41592.445983,0.03715,89,1507,...,357.827756,122,729,450.050984,149.782142,0.0,17.804494,1.120358,1.741587,session_2020_03_31_1min.csv


In [4]:
# ====================================================
# Cell 4: Standardization & Saving
# ====================================================
# Columns that get standardized: everything except the two identifier columns.
NUMERIC_COLS = list(df_1min.columns.drop(["user_id", "session_file"], errors="ignore"))

def save_versioned_csv(df, basename, out_dir=None):
    """Write ``df`` to ``<basename>_v<N>.csv`` without overwriting earlier versions.

    ``N`` is the smallest positive integer for which the file does not exist yet, so a
    gap left by a deleted version is reused before a new highest number is taken.

    Parameters
    ----------
    df : DataFrame to write (without its index).
    basename : file name stem; ``_v<N>.csv`` is appended.
    out_dir : target directory. Defaults to the notebook-level ``PROCESSED_PATH``.
        It is created if it does not exist.

    Returns
    -------
    pathlib.Path of the file that was written.
    """
    target_dir = Path(PROCESSED_PATH if out_dir is None else out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    v = 1
    out_path = target_dir / f"{basename}_v{v}.csv"
    while out_path.exists():
        v += 1
        out_path = target_dir / f"{basename}_v{v}.csv"

    df.to_csv(out_path, index=False)
    print(f"Saved {out_path}")
    return out_path

# Standardize and save
# One scaler per table: re-fitting a single shared scaler would keep only the statistics of
# the table fitted last, so the other table's scaling could not be reproduced later.
# NaN feature values are replaced by 0 before fitting.
# NOTE(review): each scaler is fitted on ALL users of its table, including the users that a
# later cell holds out as the test range.
scaler_1min = StandardScaler()
scaler_3min = StandardScaler()
scaler = scaler_3min  # legacy name; previously the one shared scaler, last fitted on the 3min table

if not df_1min.empty:
    df1_scaled = df_1min.copy()
    df1_scaled[NUMERIC_COLS] = scaler_1min.fit_transform(df1_scaled[NUMERIC_COLS].fillna(0))
    save_versioned_csv(df1_scaled, "sapimouse_1min_features")

if not df_3min.empty:
    df3_scaled = df_3min.copy()
    df3_scaled[NUMERIC_COLS] = scaler_3min.fit_transform(df3_scaled[NUMERIC_COLS].fillna(0))
    save_versioned_csv(df3_scaled, "sapimouse_3min_features")


Saved D:\hack\impostor_detection\impostor_detection\data\processed\sapimouse_1min_features_v1.csv
Saved D:\hack\impostor_detection\impostor_detection\data\processed\sapimouse_3min_features_v1.csv


In [5]:
# ====================================================
# Cell 5: Sanity Check
# ====================================================
# Show the transposed describe() summary of both raw (unscaled) feature tables.
for heading, table in (("1min summary stats:", df_1min), ("3min summary stats:", df_3min)):
    print(heading)
    display(table.describe().T)


1min summary stats:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_events,122.0,2354.442623,923.276263,865.0,1930.0,2213.5,2523.25,5808.0
total_moves,122.0,1909.631148,797.53312,667.0,1543.25,1767.0,2016.5,5400.0
clicks,122.0,137.786885,36.564723,50.0,108.5,140.0,166.0,222.0
total_drags,122.0,307.02459,152.13296,64.0,222.5,289.0,356.75,977.0
avg_speed_px_per_ms,122.0,1.095874,0.304653,0.414363,0.887291,1.04728,1.25163,2.128289
path_length_px,122.0,39419.440032,14422.598707,14944.85804,28740.046122,38636.934884,46635.116058,82994.256614
idle_fraction,122.0,0.122888,0.173674,0.0,0.010812,0.050778,0.161491,0.84926
x_min,122.0,70.172131,41.350686,10.0,36.5,71.5,95.0,208.0
x_max,122.0,1531.762295,323.528686,665.0,1294.25,1465.0,1810.5,2548.0
x_mean,122.0,800.23436,167.708419,379.099626,683.633998,761.453589,918.529241,1364.257671


3min summary stats:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_events,122.0,7277.819672,3091.477835,3104.0,5825.0,6725.0,7554.0,25451.0
total_moves,122.0,5872.95082,2753.075521,2388.0,4557.25,5364.5,6194.75,23131.0
clicks,122.0,435.057377,101.278954,208.0,370.5,437.0,496.0,709.0
total_drags,122.0,969.811475,359.061111,341.0,759.5,914.0,1052.5,2516.0
avg_speed_px_per_ms,122.0,1.132832,0.291427,0.460173,0.936857,1.064352,1.289891,1.987522
path_length_px,122.0,123852.711318,41070.02359,54508.931488,93878.809781,116973.651892,149806.524186,225044.997682
idle_fraction,122.0,0.073599,0.10855,0.0,0.012972,0.031526,0.093595,0.568507
x_min,122.0,39.803279,27.534993,9.0,12.0,35.0,65.75,107.0
x_max,122.0,1556.737705,322.506142,738.0,1314.0,1461.0,1867.0,2549.0
x_mean,122.0,794.846732,158.96971,383.586298,682.748732,752.898801,917.37488,1278.241384


In [None]:
% mlflow ui --host 0.0.0.0 --port 5000


In [None]:
# ====================================================
# Cell X: Train/Test split by user range (100-120)
# ====================================================
# This cell assumes `df_1min` and `df_3min` are already built in previous cells.
# Numeric user ids in this range (both ends included) form the held-out test set.
TEST_USER_START, TEST_USER_END = 100, 120
# user ids in the data are strings like 'user100' or numeric; handle both
def normalize_user(u):
    """Convert a user identifier such as ``'user100'``, ``' User7 '`` or ``100`` to an ``int``.

    The value is stringified, lower-cased and stripped, and every occurrence of ``'user'``
    is removed before parsing.

    Returns ``None`` for missing values (``None``/NaN) and for anything that is not an
    integer after that clean-up. Missing values map to ``None`` rather than NaN so this
    definition agrees with the one in the pairwise-dataset cell.
    """
    if pd.isna(u):
        return None
    try:
        return int(str(u).lower().strip().replace('user', ''))
    except (TypeError, ValueError):
        # not an integer once the prefix is removed
        return None

# prepare a helper to split any dataframe by user_id column
def split_by_user_range(df, start, end, user_col='user_id'):
    """Split ``df`` into ``(train, test)`` by numeric user id.

    Rows whose normalised user id lies in ``[start, end]`` (both ends included) go to the
    test frame; all other rows, including those whose id cannot be normalised, go to the
    train frame. The input is not modified and both results get a fresh 0..n-1 index.
    """
    work = df.copy()
    # temporary numeric id column, removed again before returning
    work['_user_num'] = work[user_col].apply(normalize_user)
    in_test = work['_user_num'].between(start, end, inclusive='both')

    parts = []
    for selector in (~in_test, in_test):
        parts.append(work[selector].drop(columns=['_user_num']).reset_index(drop=True))
    train_part, test_part = parts
    return train_part, test_part

# Ensure df_1min/df_3min exist; if not, try to load latest processed files
def load_latest_processed(prefix):
    """Load the most recently modified ``<prefix>_v*.csv`` found in ``PROCESSED_PATH``.

    "Latest" is decided by file modification time, not by the version number in the name.
    Returns the file as a DataFrame, or ``None`` when no file matches.
    """
    matches = glob.glob(str(Path(PROCESSED_PATH) / f"{prefix}_v*.csv"))
    if not matches:
        return None
    matches.sort(key=os.path.getmtime)
    return pd.read_csv(matches[-1])

# Use the in-memory feature tables when present; otherwise fall back to the newest processed CSV.
# NOTE(review): the CSVs written by the standardization cell contain scaled values, whereas
# the in-memory tables are unscaled, so the two sources are not interchangeable.
if globals().get('df_1min') is None:
    df_1min = load_latest_processed('sapimouse_1min_features')
    if df_1min is None:
        raise RuntimeError('df_1min not found in memory and no processed file available')

if globals().get('df_3min') is None:
    df_3min = load_latest_processed('sapimouse_3min_features')
    if df_3min is None:
        raise RuntimeError('df_3min not found in memory and no processed file available')

# perform splits (the user column is the default 'user_id')
df1_train, df1_test = split_by_user_range(df_1min, TEST_USER_START, TEST_USER_END)
df3_train, df3_test = split_by_user_range(df_3min, TEST_USER_START, TEST_USER_END)

# Save with versioning using existing helper if available; fallback to simple save
# def save_split(df, basename):
#     try:
#         save_versioned_csv(df, basename)
#     except Exception:
#         out = Path(PROCESSED_PATH) / f"{basename}.csv"
#         df.to_csv(out, index=False)
#         print(f'Saved fallback {out}')

# save_split(df1_train, 'sapimouse_1min_train')
# save_split(df1_test, 'sapimouse_1min_test')
# save_split(df3_train, 'sapimouse_3min_train')
# save_split(df3_test, 'sapimouse_3min_test')

# Print summaries
# Per table: shape and number of distinct users of each split, plus up to ten test user ids.
for tag, train_part, test_part in (('1min', df1_train, df1_test), ('3min', df3_train, df3_test)):
    print(f'{tag} - train shape, unique users:', train_part.shape, train_part['user_id'].nunique())
    print(f'{tag} - test shape, unique users:', test_part.shape, test_part['user_id'].nunique())
    print(f'Test users in {tag} test (sample):', sorted(test_part['user_id'].unique())[:10])
    if tag == '1min':
        print()

# quick asserts to ensure test split contains only users in range
def assert_users_in_range(df, start, end, user_col='user_id'):
    """Check that every numeric user id in ``df`` lies within ``[start, end]``.

    Ids that cannot be normalised are ignored. Prints a warning and returns when no
    numeric id is left, prints a confirmation when all ids are in range, and raises
    ``AssertionError`` (listing up to five offenders) otherwise.
    """
    nums = df[user_col].apply(normalize_user).dropna().unique()
    if len(nums) == 0:
        print('Warning: no numeric users found in dataframe')
        return

    bad = [u for u in nums if not (start <= u <= end)]
    if not bad:
        print(f'All {len(nums)} numeric users in test are within {start}-{end}')
        return
    raise AssertionError(f'Found users outside expected range in test: {bad[:5]}...')

# Both test splits must contain only users from the held-out range.
for held_out in (df1_test, df3_test):
    assert_users_in_range(held_out, TEST_USER_START, TEST_USER_END)

# expose variables to notebook namespace
globals().update(df1_train=df1_train, df1_test=df1_test, df3_train=df3_train, df3_test=df3_test)
print('Done: created df1_train, df1_test, df3_train, df3_test')


In [None]:
# ====================================================
# Cell Y: Create pairwise (X1, X2, y) datasets for siamese/CMS training
# ====================================================
# This cell creates pair combinations (DF1-DF1, DF3-DF3, DF1-DF3) for both train and test
# and returns X1, X2 and y where y=1 means same user and y=0 means different users.
import itertools
from collections import defaultdict

def normalize_user(u):
    """Map a user identifier (``'user100'``, ``' User7 '``, ``100``) to an ``int``.

    Missing values and identifiers that are not an integer once ``'user'`` is removed
    yield ``None``.

    NOTE: this re-defines (and therefore shadows) the ``normalize_user`` from the
    train/test split cell.
    """
    if not pd.isna(u):
        try:
            return int(str(u).lower().strip().replace('user', ''))
        except Exception:
            pass
    return None

def prepare_pairwise_dataset(df_a, df_b, user_col='user_id', max_pos_per_user=50, neg_ratio=1, allow_same_df=False, random_state=42):
    """Build labelled row pairs between ``df_a`` and ``df_b``.

    A pair is positive (label 1) when both rows belong to the same user and negative
    (label 0) otherwise. Users are compared after ``normalize_user``; rows whose user id
    cannot be normalised are never used as a pair member.

    Pairing modes
    -------------
    * within-frame -- only when ``allow_same_df`` is true AND ``df_a is df_b``: positives
      are unordered pairs of two distinct rows of the same user.
    * cross-frame -- otherwise: positives are all (row of a, row of b) combinations for
      users present in both frames.

    Parameters
    ----------
    df_a, df_b : frames supplying the left / right member of each pair.
    user_col : column holding the user identifier.
    max_pos_per_user : cap on positives per user; above it, a random subset is drawn
        without replacement.
    neg_ratio : negatives generated per positive (at least 1): the same left row paired
        with randomly chosen right rows of other users.
    allow_same_df : see "Pairing modes".
    random_state : seed of the private ``RandomState`` used for all sampling.

    If no pair could be produced at all, up to ``min(1000, len(a) * len(b))`` random
    (left, right) draws are made and those with two different known users are kept as
    negatives (duplicates possible).

    Returns
    -------
    ``(X1, X2, y)`` shuffled with one common permutation. ``X1`` / ``X2`` hold all columns
    of the respective input plus ``_orig_idx_a`` / ``_orig_idx_b`` (the input's index);
    ``y`` is an int Series named ``same_user``.
    """
    rng = np.random.RandomState(random_state)
    # reset_index always creates new frames, so identity has to be checked on the inputs.
    within_frame = bool(allow_same_df) and df_a is df_b

    a = df_a.reset_index(drop=False).rename(columns={'index': '_orig_idx_a'})
    b = df_b.reset_index(drop=False).rename(columns={'index': '_orig_idx_b'})
    a['_user_num'] = a[user_col].apply(normalize_user)
    b['_user_num'] = b[user_col].apply(normalize_user)
    users_a = a['_user_num'].tolist()
    users_b = b['_user_num'].tolist()

    def group_by_user(users):
        """Map each known user to the row positions it occupies."""
        groups = defaultdict(list)
        for pos, user in enumerate(users):
            # pd.isna, not `is None`: apply() may have turned None into NaN
            if not pd.isna(user):
                groups[user].append(pos)
        return groups

    def cap(candidates):
        """Return all candidates, or a random subset of max_pos_per_user of them."""
        if len(candidates) <= max_pos_per_user:
            return candidates
        picked = rng.choice(len(candidates), size=max_pos_per_user, replace=False)
        return [candidates[k] for k in picked]

    a_by_user = group_by_user(users_a)
    b_by_user = group_by_user(users_b)

    # ---- positives -------------------------------------------------------------------
    positives = []
    if within_frame:
        for idxs in a_by_user.values():
            if len(idxs) < 2:
                continue
            positives.extend(cap(list(itertools.combinations(idxs, 2))))
    else:
        common_users = set(a_by_user.keys()).intersection(b_by_user.keys())
        for u in common_users:
            positives.extend(cap(list(itertools.product(a_by_user[u], b_by_user[u]))))

    pairs = list(positives)
    labels = [1] * len(positives)

    # ---- negatives: one batch per positive, reusing its left row -----------------------
    known_b = [j for j, user in enumerate(users_b) if not pd.isna(user)]
    neg_count = max(1, int(neg_ratio))
    for ia, _ in positives:
        same_user_rows = set(b_by_user.get(users_a[ia], []))
        eligible = [j for j in known_b if j not in same_user_rows]
        if not eligible:
            continue
        chosen = rng.choice(eligible, size=min(len(eligible), neg_count), replace=False)
        for nb in np.atleast_1d(chosen):
            pairs.append((ia, int(nb)))
            labels.append(0)

    # ---- fallback: nothing produced (e.g. no user has two rows / no shared users) ------
    if not pairs:
        n_a, n_b = len(a), len(b)
        if n_a and n_b:
            for _ in range(min(1000, n_a * n_b)):
                ia = rng.randint(0, n_a)
                ib = rng.randint(0, n_b)
                user_a, user_b = users_a[ia], users_b[ib]
                if pd.isna(user_a) or pd.isna(user_b):
                    continue  # unknown user: cannot be labelled as "different"
                if user_a != user_b:
                    pairs.append((ia, ib))
                    labels.append(0)
                if len(pairs) >= 1000:
                    break

    # ---- assemble X1 (rows of a), X2 (rows of b) and y, then shuffle jointly -----------
    left_rows = [i for i, _ in pairs]
    right_rows = [j for _, j in pairs]
    X1 = a.iloc[left_rows].drop(columns=['_user_num']).reset_index(drop=True)
    X2 = b.iloc[right_rows].drop(columns=['_user_num']).reset_index(drop=True)
    y = pd.Series(labels, name='same_user').astype(int).reset_index(drop=True)

    perm = rng.permutation(len(y))
    X1 = X1.iloc[perm].reset_index(drop=True)
    X2 = X2.iloc[perm].reset_index(drop=True)
    y = y.iloc[perm].reset_index(drop=True)
    return X1, X2, y

def build_combined_pairs(df1, df3, same_df_max_pos=200, cross_df_max_pos=500, neg_ratio=1, random_state=42):
    """Build one shuffled pair dataset from df1-df1, df3-df3 and df1-df3 pairings.

    The two within-frame pairings use ``same_df_max_pos`` as positive cap, the cross
    pairing uses ``cross_df_max_pos``. Each pairing gets its own seed
    (``random_state`` + 0 / 1 / 2) and the final shuffle uses ``random_state + 99``.
    Returns ``(X1, X2, y)`` as produced by ``prepare_pairwise_dataset``, concatenated.
    """
    # (left frame, right frame, positive cap, within-frame pairing?, seed offset)
    recipes = (
        (df1, df1, same_df_max_pos, True, 0),
        (df3, df3, same_df_max_pos, True, 1),
        (df1, df3, cross_df_max_pos, False, 2),
    )

    lefts, rights, targets = [], [], []
    for left, right, pos_cap, within, offset in recipes:
        x_left, x_right, target = prepare_pairwise_dataset(
            left, right,
            max_pos_per_user=pos_cap,
            neg_ratio=neg_ratio,
            allow_same_df=within,
            random_state=random_state + offset,
        )
        lefts.append(x_left)
        rights.append(x_right)
        targets.append(target)

    X1 = pd.concat(lefts, ignore_index=True, sort=False)
    X2 = pd.concat(rights, ignore_index=True, sort=False)
    y = pd.concat(targets, ignore_index=True, sort=False)

    # final shuffle, applied identically to all three parts
    order = np.random.RandomState(random_state + 99).permutation(len(y))
    X1, X2, y = (part.iloc[order].reset_index(drop=True) for part in (X1, X2, y))
    return X1, X2, y

# Build the pair datasets from the user-disjoint train/test splits created earlier.
# Train and test use different seeds (42 vs 2025); all other settings are identical.
print('Building train pairwise dataset...')
X1_train, X2_train, y_train = build_combined_pairs(
    df1_train, df3_train,
    same_df_max_pos=200, cross_df_max_pos=500, neg_ratio=1, random_state=42,
)
print('Building test pairwise dataset...')
X1_test, X2_test, y_test = build_combined_pairs(
    df1_test, df3_test,
    same_df_max_pos=200, cross_df_max_pos=500, neg_ratio=1, random_state=2025,
)

# quick summaries: shapes and absolute label counts per split
for split_name, left, right, target in (
    ('TRAIN', X1_train, X2_train, y_train),
    ('TEST', X1_test, X2_test, y_test),
):
    print(f'{split_name} pairs:', left.shape, right.shape, 'y:', target.shape)
    print(f'{split_name} class balance:', target.value_counts().to_dict())




In [None]:
# ====================================================
# Siamese Network Architecture
# ====================================================
import tensorflow as tf
from tensorflow.keras import layers, Model, Input

def build_encoder(input_shape, embedding_dim=64):
    """Build the encoder used for both branches of the Siamese model.

    Layers: Dense(128, relu) -> BatchNormalization -> Dense(embedding_dim, relu).
    Returns a Keras ``Model`` named ``"encoder"``.
    """
    features_in = Input(shape=input_shape)
    hidden = layers.Dense(128, activation='relu')(features_in)
    hidden = layers.BatchNormalization()(hidden)
    embedding = layers.Dense(embedding_dim, activation='relu')(hidden)
    return Model(features_in, embedding, name="encoder")

def build_siamese_model(input_shape):
    """Build the Siamese classifier: one encoder applied to two inputs, plus a sigmoid head.

    The same encoder instance (hence the same weights) embeds both inputs. The element-wise
    absolute difference of the two embeddings feeds Dense(64, relu) -> Dropout(0.3) ->
    Dense(1, sigmoid). Returns a Keras ``Model`` named ``"SiameseNetwork"`` taking
    ``[input_a, input_b]``.
    """
    shared_encoder = build_encoder(input_shape)

    left_in = Input(shape=input_shape)
    right_in = Input(shape=input_shape)

    left_emb = shared_encoder(left_in)
    right_emb = shared_encoder(right_in)

    # Merge embeddings: |left - right|, element-wise
    abs_diff = layers.Lambda(lambda pair: tf.abs(pair[0] - pair[1]))([left_emb, right_emb])

    # Classification head
    head = layers.Dense(64, activation='relu')(abs_diff)
    head = layers.Dropout(0.3)(head)
    same_user_prob = layers.Dense(1, activation='sigmoid')(head)

    return Model(inputs=[left_in, right_in], outputs=same_user_prob, name="SiameseNetwork")

# Example initialization
# One network input per feature column, i.e. every column except the two identifiers.
# The identifier column produced by extract_features is 'user_id'; there is no 'user'
# column, so dropping 'user' would raise a KeyError.
input_shape = (df_1min.drop(columns=['session_file', 'user_id']).shape[1],)
model = build_siamese_model(input_shape)
model.summary()


In [None]:
# ====================================================
# Compile Siamese Network
# ====================================================
# Configure training: Adam optimizer with learning rate 1e-3, binary cross-entropy loss,
# accuracy as the reported metric. Requires `model` from the architecture cell.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)


In [None]:
# ====================================================
# Train Siamese Network
# ====================================================

# The pair frames still carry identifiers and bookkeeping columns; the network only takes
# the numeric feature columns.
_NON_FEATURE_COLS = ['user_id', 'session_file', '_orig_idx_a', '_orig_idx_b']

def _pair_features(frame):
    """Return the feature columns of a pair frame as a float32 array (NaN replaced by 0)."""
    return frame.drop(columns=_NON_FEATURE_COLS, errors='ignore').fillna(0).to_numpy(dtype='float32')

# No separate validation pair set is built anywhere in the notebook, so a fraction of the
# (already shuffled) training pairs is held out for validation.
# NOTE(review): validation pairs therefore involve the same users as the training pairs, and
# the features are fed in unscaled.
history = model.fit(
    [_pair_features(X1_train), _pair_features(X2_train)],
    y_train.to_numpy(),
    validation_split=0.2,
    epochs=30,
    batch_size=64,
    verbose=1
)


In [None]:
# ====================================================
# Training Curves
# ====================================================
import matplotlib.pyplot as plt

# Two panels side by side: loss on the left, accuracy on the right, one point per epoch.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss curve
loss_ax.plot(history.history['loss'], label='train_loss')
loss_ax.plot(history.history['val_loss'], label='val_loss')
loss_ax.set_title('Loss Curve')
loss_ax.legend()

# Accuracy curve
acc_ax.plot(history.history['accuracy'], label='train_acc')
acc_ax.plot(history.history['val_accuracy'], label='val_acc')
acc_ax.set_title('Accuracy Curve')
acc_ax.legend()

plt.show()


In [None]:
# ====================================================
# Model Evaluation
# ====================================================
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Predictions
# Predictions
# Feed only the numeric feature columns (identifiers and bookkeeping columns removed,
# NaN replaced by 0), then threshold the sigmoid output at 0.5. ravel() turns the (n, 1)
# prediction column into the 1-D label vector the metric functions expect.
non_feature_cols = ['user_id', 'session_file', '_orig_idx_a', '_orig_idx_b']
test_inputs = [
    frame.drop(columns=non_feature_cols, errors='ignore').fillna(0).to_numpy(dtype='float32')
    for frame in (X1_test, X2_test)
]
y_pred = (model.predict(test_inputs) > 0.5).astype(int).ravel()

# Metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Test Accuracy:", acc)
print("F1 Score:", f1)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
