# Week 1 
## Load Data & Check

In [1]:
import numpy as np
import pandas as pd
import os

COMP_DIR = "/kaggle/input"  # 기본
for dirname, _, filenames in os.walk(COMP_DIR):
    for filename in filenames:
        if filename.endswith(".csv"):
            print(os.path.join(dirname, filename))


/kaggle/input/directional-forecasting-in-cryptocurrencies/sample_submission.csv
/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv
/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv


In [2]:
import pandas as pd

train_path = "/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv"
test_path  = "/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv"
sub_path   = "/kaggle/input/directional-forecasting-in-cryptocurrencies/sample_submission.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)
sub   = pd.read_csv(sub_path)

print("train:", train.shape)
print("test :", test.shape)
print("sub  :", sub.shape)

print("\ntrain cols:\n", train.columns.tolist())
print("\ntest cols:\n", test.columns.tolist())
print("\nsub cols:\n", sub.columns.tolist())

train.head()


train: (2122438, 11)
test : (909617, 11)
sub  : (50, 2)

train cols:
 ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume', 'taker_buy_quote_volume', 'target']

test cols:
 ['row_id', 'timestamp', 'open', 'high', 'low', 'close', 'volume', 'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume', 'taker_buy_quote_volume']

sub cols:
 ['row_id', 'target']


Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,target
0,1525471260,0.9012,0.9013,0.9012,0.9013,134.98,121.646459,4.0,125.08,112.723589,1.0
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,0.0
2,1525471380,0.9014,0.9014,0.90139,0.90139,2293.06,2066.963991,5.0,0.0,0.0,0.0
3,1525471440,0.90139,0.9014,0.90138,0.90139,6850.59,6175.000909,19.0,1786.3,1610.149485,0.0
4,1525471500,0.90139,0.90139,0.9013,0.9013,832.3,750.222624,3.0,784.82,707.4289,0.0


## Timestamp Parsing, Sorting, and Basic Validation

In [3]:
# Parse timestamp (UTC)
train["timestamp"] = pd.to_datetime(train["timestamp"], errors="coerce", utc=True)
test["timestamp"]  = pd.to_datetime(test["timestamp"],  errors="coerce", utc=True)

# Check parsing failures / duplicates
print("timestamp parse NA (train):", train["timestamp"].isna().sum())
print("timestamp duplicates (train):", train["timestamp"].duplicated().sum())

print("timestamp parse NA (test):", test["timestamp"].isna().sum())
print("timestamp duplicates (test):", test["timestamp"].duplicated().sum())

# Sort by time
train = train.sort_values("timestamp").reset_index(drop=True)
test  = test.sort_values("timestamp").reset_index(drop=True)

# Time range check
print("\ntrain time range:", train["timestamp"].min(), "->", train["timestamp"].max())
print("test  time range:", test["timestamp"].min(),  "->", test["timestamp"].max())

# Confirm monotonicity
print("\ntrain is monotonic increasing:", train["timestamp"].is_monotonic_increasing)
print("test  is monotonic increasing:", test["timestamp"].is_monotonic_increasing)


timestamp parse NA (train): 0
timestamp duplicates (train): 0
timestamp parse NA (test): 0
timestamp duplicates (test): 0

train time range: 1970-01-01 00:00:01.525471260+00:00 -> 1970-01-01 00:00:01.652817480+00:00
test  time range: 1970-01-01 00:00:01.652817480+00:00 -> 1970-01-01 00:00:01.707394440+00:00

train is monotonic increasing: True
test  is monotonic increasing: True


## Basic Lag and Rolling Features

In [4]:
def add_basic_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # returns
    out["ret_1"] = out["close"].pct_change(1)
    out["ret_5"] = out["close"].pct_change(5)

    # lag features
    out["close_lag1"] = out["close"].shift(1)
    out["volume_lag1"] = out["volume"].shift(1)

    # rolling features (past only)
    out["ret_1_roll_mean_5"] = out["ret_1"].rolling(5).mean()
    out["ret_1_roll_std_5"]  = out["ret_1"].rolling(5).std()

    # spreads
    out["hl_spread"] = (out["high"] - out["low"]) / out["close"]
    out["oc_spread"] = (out["close"] - out["open"]) / out["open"]

    return out

feature_cols = [
    "ret_1","ret_5","close_lag1","volume_lag1",
    "ret_1_roll_mean_5","ret_1_roll_std_5",
    "hl_spread","oc_spread"
]

train_fe = add_basic_features(train)
test_fe  = add_basic_features(test)

train_fe[feature_cols + ["target"]].head(10)


Unnamed: 0,ret_1,ret_5,close_lag1,volume_lag1,ret_1_roll_mean_5,ret_1_roll_std_5,hl_spread,oc_spread,target
0,,,,,,,0.000111,0.000111,1.0
1,0.000721,,0.9013,134.98,,,0.000111,0.000111,0.0
2,-0.000621,,0.90195,1070.54,,,1.1e-05,-1.1e-05,0.0
3,0.0,,0.90139,2293.06,,,2.2e-05,0.0,0.0
4,-0.0001,,0.90139,6850.59,,,0.0001,-0.0001,0.0
5,-0.001431,-0.001431,0.9013,832.3,-0.000286,0.000799,0.001533,-0.001409,0.0
6,-0.002144,-0.004291,0.90001,23797.63,-0.000859,0.000915,0.002294,-0.002144,1.0
7,0.002138,-0.001542,0.89808,6978.25,-0.000308,0.00164,0.002144,0.002127,0.0
8,0.0,-0.001542,0.9,11902.24,-0.000308,0.00164,0.001833,-1.1e-05,0.0
9,-0.000233,-0.001675,0.9,5591.1,-0.000334,0.001637,0.001656,-0.000233,0.0


## Remove NaNs and Create Time-Based Split

In [5]:
# Drop rows with NaNs created by shift/rolling
train_fe = train_fe.dropna(subset=feature_cols + ["target"]).reset_index(drop=True)
test_fe  = test_fe.dropna(subset=feature_cols).reset_index(drop=True)

print("train_fe:", train_fe.shape)
print("test_fe :", test_fe.shape)

# Time-based split: last 20% as validation
n = len(train_fe)
split = int(n * 0.8)

X_train = train_fe.loc[:split-1, feature_cols]
y_train = train_fe.loc[:split-1, "target"].astype(int)

X_val   = train_fe.loc[split:, feature_cols]
y_val   = train_fe.loc[split:, "target"].astype(int)

print("\nX_train:", X_train.shape, "X_val:", X_val.shape)
print("y_train positive rate:", y_train.mean())
print("y_val   positive rate:", y_val.mean())


train_fe: (2122433, 19)
test_fe : (909612, 19)

X_train: (1697946, 8) X_val: (424487, 8)
y_train positive rate: 0.47751989757035856
y_val   positive rate: 0.46884828039492377


## Baseline Model (Logistic Regression) and Macro-F1

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Baseline pipeline
baseline_model = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=200))
])

# Train
baseline_model.fit(X_train, y_train)

# Validation prediction
val_pred = baseline_model.predict(X_val)

# Macro F1
macro_f1 = f1_score(y_val, val_pred, average="macro")

print("Baseline Macro-F1:", macro_f1)
print("Validation predicted positive rate:", val_pred.mean())


Baseline Macro-F1: 0.38701975004099903
Validation predicted positive rate: 0.04560092535224872


## LightGBM Baseline Model

In [7]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

lgb_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

lgb_model.fit(X_train, y_train)

val_pred_lgb = lgb_model.predict(X_val)

macro_f1_lgb = f1_score(y_val, val_pred_lgb, average="macro")

print("LightGBM Macro-F1:", macro_f1_lgb)
print("Validation predicted positive rate:", val_pred_lgb.mean())


[LightGBM] [Info] Number of positive: 810803, number of negative: 887143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 1697946, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477520 -> initscore=-0.089981
[LightGBM] [Info] Start training from score -0.089981
LightGBM Macro-F1: 0.41886188195718194
Validation predicted positive rate: 0.09018886326318591


## Generate Test Predictions and Create Submission File

In [9]:
# Sanity check lengths
print("test_fe rows:", len(test_fe))
print("test_pred len:", len(test_pred))
print("test rows (raw):", len(test))

# Create submission aligned to test_fe (after dropping NaNs)
submission = pd.DataFrame({
    "row_id": test_fe["row_id"].astype(int),
    "target": test_pred.astype(int)
})

print(submission.head())
print("\nPrediction distribution:")
print(submission["target"].value_counts(normalize=True))

submission_path = "submission_week1_baseline_lgb.csv"
submission.to_csv(submission_path, index=False)
print("\nSaved:", submission_path)


test_fe rows: 909612
test_pred len: 909612
test rows (raw): 909617
   row_id  target
0       5       0
1       6       1
2       7       0
3       8       0
4       9       0

Prediction distribution:
target
0    0.758641
1    0.241359
Name: proportion, dtype: float64

Saved: submission_week1_baseline_lgb.csv


## Extended Lag & Rolling Feature Engineering (v2)

In [14]:
import pandas as pd
import numpy as np

def add_features_v2(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # multi-horizon return lags
    for l in [1, 5, 10, 30, 60]:
        out[f"ret_{l}"] = out["close"].pct_change(l)

    # rolling stats on ret_1
    out["ret_1"] = out["close"].pct_change(1)  # ensure exists
    for w in [5, 15, 30, 60]:
        out[f"ret_1_roll_mean_{w}"] = out["ret_1"].rolling(w).mean()
        out[f"ret_1_roll_std_{w}"]  = out["ret_1"].rolling(w).std()

    # volume momentum
    out["volume_change_5"]  = out["volume"].pct_change(5)
    out["volume_change_30"] = out["volume"].pct_change(30)

    return out

train_v2 = add_features_v2(train)
test_v2  = add_features_v2(test)

train_v2.head()


Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,...,ret_1_roll_mean_5,ret_1_roll_std_5,ret_1_roll_mean_15,ret_1_roll_std_15,ret_1_roll_mean_30,ret_1_roll_std_30,ret_1_roll_mean_60,ret_1_roll_std_60,volume_change_5,volume_change_30
0,1970-01-01 00:00:01.525471260+00:00,0.9012,0.9013,0.9012,0.9013,134.98,121.646459,4.0,125.08,112.723589,...,,,,,,,,,,
1,1970-01-01 00:00:01.525471320+00:00,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,...,,,,,,,,,,
2,1970-01-01 00:00:01.525471380+00:00,0.9014,0.9014,0.90139,0.90139,2293.06,2066.963991,5.0,0.0,0.0,...,,,,,,,,,,
3,1970-01-01 00:00:01.525471440+00:00,0.90139,0.9014,0.90138,0.90139,6850.59,6175.000909,19.0,1786.3,1610.149485,...,,,,,,,,,,
4,1970-01-01 00:00:01.525471500+00:00,0.90139,0.90139,0.9013,0.9013,832.3,750.222624,3.0,784.82,707.4289,...,,,,,,,,,,


## Define FEATURES_V2 (Explicit List)

In [15]:
FEATURES_V2 = [
    "ret_1","ret_5","ret_10","ret_30","ret_60",
    "ret_1_roll_mean_5","ret_1_roll_std_5",
    "ret_1_roll_mean_15","ret_1_roll_std_15",
    "ret_1_roll_mean_30","ret_1_roll_std_30",
    "ret_1_roll_mean_60","ret_1_roll_std_60",
    "volume_change_5","volume_change_30"
]

print("Num features:", len(FEATURES_V2))
print(FEATURES_V2)


Num features: 15
['ret_1', 'ret_5', 'ret_10', 'ret_30', 'ret_60', 'ret_1_roll_mean_5', 'ret_1_roll_std_5', 'ret_1_roll_mean_15', 'ret_1_roll_std_15', 'ret_1_roll_mean_30', 'ret_1_roll_std_30', 'ret_1_roll_mean_60', 'ret_1_roll_std_60', 'volume_change_5', 'volume_change_30']


## Prepare Training Data (Drop NaNs + Time Split)

In [16]:
# Drop NaNs from lag/rolling generation
train_v2 = train_v2.dropna(subset=FEATURES_V2 + ["target"]).reset_index(drop=True)
test_v2  = test_v2.dropna(subset=FEATURES_V2).reset_index(drop=True)

print("train_v2:", train_v2.shape)
print("test_v2 :", test_v2.shape)

# Time-based split (80/20)
n = len(train_v2)
split = int(n * 0.8)

X_train_v2 = train_v2.loc[:split-1, FEATURES_V2]
y_train_v2 = train_v2.loc[:split-1, "target"].astype(int)

X_val_v2   = train_v2.loc[split:, FEATURES_V2]
y_val_v2   = train_v2.loc[split:, "target"].astype(int)

print("X_train:", X_train_v2.shape, "X_val:", X_val_v2.shape)
print("y_train pos rate:", y_train_v2.mean(), "y_val pos rate:", y_val_v2.mean())


train_v2: (2121671, 26)
test_v2 : (909490, 26)
X_train: (1697336, 15) X_val: (424335, 15)
y_train pos rate: 0.47760019230134754 y_val pos rate: 0.46885126138546196


## Train LightGBM (MODEL_V2) and Evaluate Macro-F1

In [17]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

MODEL_V2 = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    num_leaves=63,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

MODEL_V2.fit(X_train_v2, y_train_v2)

val_pred_v2 = MODEL_V2.predict(X_val_v2)
macro_f1_v2 = f1_score(y_val_v2, val_pred_v2, average="macro")

print("V2 Macro-F1 (default thr=0.5):", macro_f1_v2)
print("Validation predicted positive rate:", val_pred_v2.mean())


[LightGBM] [Info] Number of positive: 810648, number of negative: 886688
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 1697336, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477600 -> initscore=-0.089659
[LightGBM] [Info] Start training from score -0.089659
V2 Macro-F1 (default thr=0.5): 0.49745205306101314
Validation predicted positive rate: 0.30119127576089644


## Threshold Tuning for Macro-F1 (V2)

In [18]:
import numpy as np
from sklearn.metrics import f1_score

val_proba_v2 = MODEL_V2.predict_proba(X_val_v2)[:, 1]

best_thr = 0.5
best_f1 = 0

for thr in np.linspace(0.1, 0.9, 81):
    pred = (val_proba_v2 >= thr).astype(int)
    f1 = f1_score(y_val_v2, pred, average="macro")
    if f1 > best_f1:
        best_f1 = f1
        best_thr = float(thr)

BEST_THR_V2 = best_thr
print("BEST_THR_V2:", BEST_THR_V2)
print("Best Macro-F1:", best_f1)


BEST_THR_V2: 0.48
Best Macro-F1: 0.5116151614632223


## Create Full-Length Submission (V2 + Tuned Threshold)

In [19]:
import numpy as np
import pandas as pd

# Predict probabilities for rows where features exist (test_v2)
test_proba_v2 = MODEL_V2.predict_proba(test_v2[FEATURES_V2])[:, 1]
test_pred_v2 = (test_proba_v2 >= BEST_THR_V2).astype(int)

print("raw test rows:", len(test))
print("test_v2 rows (feature-ready):", len(test_v2))
print("pred rows:", len(test_pred_v2))

# Full-length submission over ALL test row_id
submission_v2 = pd.DataFrame({
    "row_id": test["row_id"].astype(int),
    "target": np.zeros(len(test), dtype=int)
})

# Fill predictions for available rows (aligned by row_id)
submission_v2.loc[test_v2["row_id"].astype(int).values, "target"] = test_pred_v2

print("\nHead:")
print(submission_v2.head(10))

print("\nPrediction distribution (full):")
print(submission_v2["target"].value_counts(normalize=True))

# Save
submission_path = "submission_v2_lgb_thr048_full.csv"
submission_v2.to_csv(submission_path, index=False)
print("\nSaved:", submission_path)


raw test rows: 909617
test_v2 rows (feature-ready): 909490
pred rows: 909490

Head:
   row_id  target
0       0       0
1       1       0
2       2       0
3       3       0
4       4       0
5       5       0
6       6       0
7       7       0
8       8       0
9       9       0

Prediction distribution (full):
target
0    0.550033
1    0.449967
Name: proportion, dtype: float64

Saved: submission_v2_lgb_thr048_full.csv


# Week 2
## Create v3 Microstructure Features (no leakage)

In [20]:
import numpy as np
import pandas as pd

def add_features_v3(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    eps = 1e-9  # avoid division by zero

    # --- microstructure ratios at time t (allowed) ---
    out["taker_buy_ratio_base"] = out["taker_buy_base_volume"] / (out["volume"] + eps)
    out["taker_buy_ratio_quote"] = out["taker_buy_quote_volume"] / (out["quote_asset_volume"] + eps)

    # trade intensity (how many trades per unit volume)
    out["trade_intensity"] = out["number_of_trades"] / (out["volume"] + eps)

    # quote per trade (avg trade size in USDT)
    out["quote_per_trade"] = out["quote_asset_volume"] / (out["number_of_trades"] + eps)

    # --- add rolling stats (past only) ---
    for w in [5, 15, 30, 60]:
        out[f"taker_buy_ratio_base_mean_{w}"] = out["taker_buy_ratio_base"].rolling(w).mean()
        out[f"taker_buy_ratio_base_std_{w}"]  = out["taker_buy_ratio_base"].rolling(w).std()

        out[f"trade_intensity_mean_{w}"] = out["trade_intensity"].rolling(w).mean()
        out[f"trade_intensity_std_{w}"]  = out["trade_intensity"].rolling(w).std()

    return out

train_v3 = add_features_v3(train_v2)  # build on v2 (already has return/rolling/volume_change)
test_v3  = add_features_v3(test_v2)

train_v3[[
    "taker_buy_ratio_base","taker_buy_ratio_quote","trade_intensity","quote_per_trade"
]].head(10)


Unnamed: 0,taker_buy_ratio_base,taker_buy_ratio_quote,trade_intensity,quote_per_trade
0,0.987392,0.987404,0.001406,638.325302
1,0.973248,0.973259,0.001673,536.052752
2,0.900953,0.901042,0.003265,274.761869
3,0.222389,0.222516,0.003336,268.822841
4,0.248953,0.249193,0.001372,652.848231
5,0.725272,0.725272,0.002015,444.818646
6,0.954729,0.954773,0.00256,350.157559
7,0.0,0.0,0.002355,380.24111
8,0.000359,0.000359,0.00072,1243.721483
9,1.0,1.0,0.010526,85.0782


## Define FEATURES_V3 and Prepare Training Data

In [21]:
# v3 = 기존 v2 features + 새 microstructure features
FEATURES_V3 = FEATURES_V2 + [
    "taker_buy_ratio_base",
    "taker_buy_ratio_quote",
    "trade_intensity",
    "quote_per_trade",
    "taker_buy_ratio_base_mean_5",
    "taker_buy_ratio_base_std_5",
    "taker_buy_ratio_base_mean_15",
    "taker_buy_ratio_base_std_15",
    "taker_buy_ratio_base_mean_30",
    "taker_buy_ratio_base_std_30",
    "taker_buy_ratio_base_mean_60",
    "taker_buy_ratio_base_std_60",
    "trade_intensity_mean_5",
    "trade_intensity_std_5",
    "trade_intensity_mean_15",
    "trade_intensity_std_15",
    "trade_intensity_mean_30",
    "trade_intensity_std_30",
    "trade_intensity_mean_60",
    "trade_intensity_std_60"
]

print("Num FEATURES_V3:", len(FEATURES_V3))

# drop NaNs (rolling 때문에)
train_v3 = train_v3.dropna(subset=FEATURES_V3 + ["target"]).reset_index(drop=True)
test_v3  = test_v3.dropna(subset=FEATURES_V3).reset_index(drop=True)

print("train_v3:", train_v3.shape)
print("test_v3 :", test_v3.shape)

# time split
n = len(train_v3)
split = int(n * 0.8)

X_train_v3 = train_v3.loc[:split-1, FEATURES_V3]
y_train_v3 = train_v3.loc[:split-1, "target"].astype(int)

X_val_v3   = train_v3.loc[split:, FEATURES_V3]
y_val_v3   = train_v3.loc[split:, "target"].astype(int)

print("X_train:", X_train_v3.shape, "X_val:", X_val_v3.shape)


Num FEATURES_V3: 35
train_v3: (2121612, 46)
test_v3 : (909431, 46)
X_train: (1697289, 35) X_val: (424323, 35)


## Advanced LightGBM

In [22]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

MODEL_V3 = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.02,
    num_leaves=127,
    min_child_samples=100,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

MODEL_V3.fit(X_train_v3, y_train_v3)

val_pred_v3 = MODEL_V3.predict(X_val_v3)
macro_f1_v3 = f1_score(y_val_v3, val_pred_v3, average="macro")

print("V3 Macro-F1 (default thr=0.5):", macro_f1_v3)
print("Validation predicted positive rate:", val_pred_v3.mean())


[LightGBM] [Info] Number of positive: 810636, number of negative: 886653
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.301352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8925
[LightGBM] [Info] Number of data points in the train set: 1697289, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477606 -> initscore=-0.089635
[LightGBM] [Info] Start training from score -0.089635
V3 Macro-F1 (default thr=0.5): 0.49244899226980376
Validation predicted positive rate: 0.26761452949757614


## Threshold tuning (V3)

In [24]:
import numpy as np
from sklearn.metrics import f1_score

val_proba_v3 = MODEL_V3.predict_proba(X_val_v3)[:, 1]

best_thr_v3 = 0.5
best_f1_v3 = 0

for thr in np.linspace(0.1, 0.9, 81):
    pred = (val_proba_v3 >= thr).astype(int)
    f1 = f1_score(y_val_v3, pred, average="macro")
    if f1 > best_f1_v3:
        best_f1_v3 = f1
        best_thr_v3 = float(thr)

print("BEST_THR_V3:", best_thr_v3)
print("Best Macro-F1 (V3):", best_f1_v3)


BEST_THR_V3: 0.48
Best Macro-F1 (V3): 0.5141046047716057


## Add Microstructure Lag Features (v3.1)

In [25]:
import pandas as pd
import numpy as np

def add_features_v3_1(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # lag-1 for key microstructure signals (past only)
    out["taker_buy_ratio_base_lag1"] = out["taker_buy_ratio_base"].shift(1)
    out["taker_buy_ratio_quote_lag1"] = out["taker_buy_ratio_quote"].shift(1)
    out["trade_intensity_lag1"] = out["trade_intensity"].shift(1)
    out["quote_per_trade_lag1"] = out["quote_per_trade"].shift(1)

    return out

train_v3_1 = add_features_v3_1(train_v3)
test_v3_1  = add_features_v3_1(test_v3)

train_v3_1[[
    "taker_buy_ratio_base","taker_buy_ratio_base_lag1",
    "trade_intensity","trade_intensity_lag1"
]].head(10)


Unnamed: 0,taker_buy_ratio_base,taker_buy_ratio_base_lag1,trade_intensity,trade_intensity_lag1
0,0.024385,,0.00304,
1,0.179277,0.024385,0.053137,0.00304
2,0.018882,0.179277,0.01101,0.053137
3,0.971483,0.018882,0.001706,0.01101
4,0.992151,0.971483,0.001026,0.001706
5,0.999924,0.992151,0.000704,0.001026
6,0.994012,0.999924,0.000779,0.000704
7,0.678494,0.994012,0.004326,0.000779
8,0.161772,0.678494,0.011821,0.004326
9,0.9475,0.161772,0.001359,0.011821


## Define FEATURES_V3_1 + Prepare Data

In [26]:
# v3.1 = v3 + microstructure lag1 features
FEATURES_V3_1 = FEATURES_V3 + [
    "taker_buy_ratio_base_lag1",
    "taker_buy_ratio_quote_lag1",
    "trade_intensity_lag1",
    "quote_per_trade_lag1"
]

print("Num FEATURES_V3_1:", len(FEATURES_V3_1))

# drop NaNs (새 lag1 때문에 초반 1행 추가로 빠짐)
train_v3_1 = train_v3_1.dropna(subset=FEATURES_V3_1 + ["target"]).reset_index(drop=True)
test_v3_1  = test_v3_1.dropna(subset=FEATURES_V3_1).reset_index(drop=True)

print("train_v3_1:", train_v3_1.shape)
print("test_v3_1 :", test_v3_1.shape)

# time split
n = len(train_v3_1)
split = int(n * 0.8)

X_train_v3_1 = train_v3_1.loc[:split-1, FEATURES_V3_1]
y_train_v3_1 = train_v3_1.loc[:split-1, "target"].astype(int)

X_val_v3_1   = train_v3_1.loc[split:, FEATURES_V3_1]
y_val_v3_1   = train_v3_1.loc[split:, "target"].astype(int)

print("X_train:", X_train_v3_1.shape, "X_val:", X_val_v3_1.shape)


Num FEATURES_V3_1: 39
train_v3_1: (2121611, 50)
test_v3_1 : (909430, 50)
X_train: (1697288, 39) X_val: (424323, 39)


## Train LightGBM (V3.1) and Evaluate Macro-F1

In [27]:
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

MODEL_V3_1 = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.02,
    num_leaves=127,
    min_child_samples=100,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

MODEL_V3_1.fit(X_train_v3_1, y_train_v3_1)

val_pred_v3_1 = MODEL_V3_1.predict(X_val_v3_1)
macro_f1_v3_1 = f1_score(y_val_v3_1, val_pred_v3_1, average="macro")

print("V3.1 Macro-F1 (default thr=0.5):", macro_f1_v3_1)
print("Validation predicted positive rate:", val_pred_v3_1.mean())


[LightGBM] [Info] Number of positive: 810636, number of negative: 886652
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.338303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9945
[LightGBM] [Info] Number of data points in the train set: 1697288, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477607 -> initscore=-0.089633
[LightGBM] [Info] Start training from score -0.089633
V3.1 Macro-F1 (default thr=0.5): 0.49347038808849797
Validation predicted positive rate: 0.2708054948706528


## Threshold Tuning (V3.1)

In [28]:
import numpy as np
from sklearn.metrics import f1_score

val_proba_v3_1 = MODEL_V3_1.predict_proba(X_val_v3_1)[:, 1]

best_thr_v3_1 = 0.5
best_f1_v3_1 = 0

for thr in np.linspace(0.1, 0.9, 81):
    pred = (val_proba_v3_1 >= thr).astype(int)
    f1 = f1_score(y_val_v3_1, pred, average="macro")
    if f1 > best_f1_v3_1:
        best_f1_v3_1 = f1
        best_thr_v3_1 = float(thr)

print("BEST_THR_V3_1:", best_thr_v3_1)
print("Best Macro-F1 (V3.1):", best_f1_v3_1)


BEST_THR_V3_1: 0.48
Best Macro-F1 (V3.1): 0.5145422050773429


## Final Comparison Table (v2 vs v3 vs v3.1)

In [29]:
import pandas as pd

final_results = pd.DataFrame([
    {"version":"v2",   "n_features":len(FEATURES_V2),   "best_thr":0.48, "best_macro_f1":0.5116151614632223},
    {"version":"v3",   "n_features":len(FEATURES_V3),   "best_thr":0.48, "best_macro_f1":0.5141046047716057},
    {"version":"v3.1", "n_features":len(FEATURES_V3_1), "best_thr":0.48, "best_macro_f1":0.5145422050773429},
]).sort_values("best_macro_f1", ascending=False)

final_results.to_csv("final_results_summary.csv", index=False)
print(final_results)
print("Saved: final_results_summary.csv")


  version  n_features  best_thr  best_macro_f1
2    v3.1          39      0.48       0.514542
1      v3          35      0.48       0.514105
0      v2          15      0.48       0.511615
Saved: final_results_summary.csv


## Final Submission (pick best = v3.1)

In [30]:
import numpy as np
import pandas as pd

BEST_THR_FINAL = 0.48

test_proba_final = MODEL_V3_1.predict_proba(test_v3_1[FEATURES_V3_1])[:, 1]
test_pred_final = (test_proba_final >= BEST_THR_FINAL).astype(int)

submission_final = pd.DataFrame({
    "row_id": test["row_id"].astype(int),
    "target": np.zeros(len(test), dtype=int)
})

submission_final.loc[test_v3_1["row_id"].astype(int).values, "target"] = test_pred_final

print("Prediction distribution (full):")
print(submission_final["target"].value_counts(normalize=True))

submission_path = "submission_final_v3_1_lgb_thr048_full.csv"
submission_final.to_csv(submission_path, index=False)
print("Saved:", submission_path)


Prediction distribution (full):
target
0    0.541931
1    0.458069
Name: proportion, dtype: float64
Saved: submission_final_v3_1_lgb_thr048_full.csv
