In [1]:
import os
import gc
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

# Central configuration: CatBoost hyper-parameters plus one post-processing knob.
CFG = dict(
    iterations=2000,         # upper bound on boosting rounds (early stopping may end sooner)
    learning_rate=0.03,
    depth=6,
    loss_function='RMSE',
    l2_leaf_reg=5.0,
    shrinkage_factor=0.15,   # multiplier applied to centered predictions; not passed to CatBoost
    random_seed=42,
)

def load_data(input_root='/kaggle/input'):
    """Locate and read ``train.csv`` and ``test.csv`` somewhere under ``input_root``.

    The directory tree is walked recursively. If a file name occurs in more
    than one directory, the one visited last by ``os.walk`` is used.

    Parameters
    ----------
    input_root : str, default '/kaggle/input'
        Directory that is searched recursively for the two CSV files.

    Returns
    -------
    train : pandas.DataFrame
        Contents of ``train.csv``.
    test : pandas.DataFrame
        Contents of ``test.csv``.
    submission_template : pandas.DataFrame
        Independent copy of the ``Id`` column of ``test``, in file order.

    Raises
    ------
    FileNotFoundError
        If ``train.csv`` or ``test.csv`` cannot be found under ``input_root``.
    KeyError
        If ``test.csv`` has no ``Id`` column.
    """
    print("Loading data...")
    wanted = ('train.csv', 'test.csv')
    found = {}
    for dirname, _, filenames in os.walk(input_root):
        for name in wanted:
            if name in filenames:
                # A later directory overrides an earlier match.
                found[name] = os.path.join(dirname, name)

    # Fail with a readable message instead of handing an empty path to pandas.
    missing = [name for name in wanted if name not in found]
    if missing:
        raise FileNotFoundError(
            f"Could not find {', '.join(missing)} under {input_root!r}"
        )

    train = pd.read_csv(found['train.csv'])
    test = pd.read_csv(found['test.csv'])

    # Copy so later edits to the template never touch `test`.
    submission_template = test[['Id']].copy()

    return train, test, submission_template

# --- 2. CROSS-SECTIONAL RANK FEATURE ENGINEERING (centered percentile ranks; no Gaussian mapping is applied) ---
def process_data(df):
    """Build cross-sectional rank features and a volatility feature.

    Rows are sorted by ``(date_id, time_id, symbol_id)`` and the index is
    reset, so the returned frame is NOT in the caller's row order. The input
    frame itself is left unmodified.

    For every column whose name starts with ``'f'``, a ``<col>_rank`` column is
    added: the percentile rank of the value within its ``(date_id, time_id)``
    group (ties broken by order of appearance), shifted by -0.5 so values lie
    in ``(-0.5, 0.5]``. No inverse-normal ("Gauss") mapping is applied.

    ``market_vol`` is the within-group standard deviation of the first such
    feature column (NaN for single-row groups, as pandas uses ddof=1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_id``, ``time_id``, ``symbol_id`` and at least one
        column whose name starts with ``'f'``.

    Returns
    -------
    df : pandas.DataFrame
        Sorted copy of the input with the new float32 feature columns appended.
    keep_cols : list of str
        Names of the generated columns: every ``<col>_rank`` in feature-column
        order, followed by ``'market_vol'``.

    Raises
    ------
    ValueError
        If no column name starts with ``'f'``.
    """
    group_keys = ['date_id', 'time_id']

    # Sort for time consistency; reset_index makes the new order positional.
    df = df.sort_values(group_keys + ['symbol_id']).reset_index(drop=True)

    f_cols = [c for c in df.columns if c.startswith('f')]
    if not f_cols:
        raise ValueError(
            "process_data found no feature columns (names starting with 'f')"
        )

    by_time = df.groupby(group_keys)

    # A. Cross-sectional percentile ranks.
    # GroupBy.rank handles all feature columns in one pass instead of calling
    # a Python lambda per group per column.
    pct_ranks = by_time[f_cols].rank(pct=True, method='first')

    # B. Volatility proxy, computed before any new columns are attached.
    market_vol = by_time[f_cols[0]].transform('std')

    rank_cols = []
    for col in f_cols:
        rank_name = f'{col}_rank'
        # Center around zero; float32 halves the memory footprint.
        df[rank_name] = (pct_ranks[col] - 0.5).astype(np.float32)
        rank_cols.append(rank_name)

    df['market_vol'] = market_vol.astype(np.float32)

    # List exactly the columns generated here, so a pre-existing column that
    # merely contains '_rank' in its name is never picked up as a feature.
    keep_cols = rank_cols + ['market_vol']

    return df, keep_cols

# --- 3. EXECUTION ---
train, test, submission_template = load_data()

print("Applying Rank-Gauss Transform...")
train, features = process_data(train)
test, _ = process_data(test)

# Winsorize the target at its 0.5% / 99.5% quantiles to tame extreme values.
# NOTE(review): the quantiles are computed on all of `train`, so the rows that
# become the validation set below are clipped with the same bounds.
y_min = train['y'].quantile(0.005)
y_max = train['y'].quantile(0.995)
train['y'] = train['y'].clip(y_min, y_max)

# Chronological hold-out: dates up to and including the one at the 90% position
# of the unique date list are used for fitting, later dates for validation.
dates = train['date_id'].unique()
split_date = dates[int(len(dates) * 0.9)]

fit_rows = train['date_id'] <= split_date
val_rows = train['date_id'] > split_date

X_train = train.loc[fit_rows, features]
y_train = train.loc[fit_rows, 'y']
X_val = train.loc[val_rows, features]
y_val = train.loc[val_rows, 'y']

print(f"Training CatBoost on {len(features)} Rank-Features...")

# --- 4. MODELING (CatBoost) ---

# Hyper-parameters taken from CFG; 'shrinkage_factor' is deliberately left out
# because it is a post-processing knob, not a CatBoost argument.
catboost_params = {
    key: CFG[key]
    for key in (
        'iterations',
        'learning_rate',
        'depth',
        'loss_function',
        'l2_leaf_reg',
        'random_seed',
    )
}
# Fixed settings that are not exposed through CFG.
catboost_params.update(
    boosting_type='Ordered',
    verbose=100,
    allow_writing_files=False,
    task_type="CPU",
)

model = CatBoostRegressor(**catboost_params)

# Validate on the held-out dates; stop after 50 rounds without improvement and
# keep the best iteration.
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    use_best_model=True,
)

# 5. PREDICTION & POST-PROCESSING
print("Predicting...")
raw_preds = np.asarray(model.predict(test[features]))

# A. ZERO-SUM CENTERING
# Subtract the mean prediction of each (date_id, time_id) cross-section so the
# predictions within one timestamp sum to zero.
test['raw_pred'] = raw_preds
market_means = test.groupby(['date_id', 'time_id'])['raw_pred'].transform('mean')
# Work on plain arrays from here on so nothing is silently aligned by index.
preds = raw_preds - market_means.to_numpy()

# B. GLOBAL SHRINKAGE
preds = preds * CFG['shrinkage_factor']

# Clip to the same bounds that were applied to the training target.
preds = np.clip(preds, y_min, y_max)

# `test` was re-sorted (and re-indexed) by process_data, while
# `submission_template` still has the original file order. Assigning by
# position or by index label would attach predictions to the wrong rows, so
# look each prediction up by its Id instead.
if not test['Id'].is_unique:
    raise ValueError("Duplicate Id values in test; cannot map predictions to rows")
pred_by_id = pd.Series(preds, index=test['Id'].to_numpy())
submission_template['y'] = submission_template['Id'].map(pred_by_id)

submission_template.to_csv('submission.csv', index=False)
print("SUCCESS. Submission Generated.")

Loading data...
Applying Rank-Gauss Transform...
Training CatBoost on 27 Rank-Features...
0:	learn: 0.0018618	test: 0.0017012	best: 0.0017012 (0)	total: 863ms	remaining: 28m 45s
100:	learn: 0.0018570	test: 0.0016973	best: 0.0016973 (100)	total: 1m 10s	remaining: 22m 8s
200:	learn: 0.0018563	test: 0.0016971	best: 0.0016971 (200)	total: 2m 20s	remaining: 20m 58s
300:	learn: 0.0018559	test: 0.0016971	best: 0.0016971 (296)	total: 3m 32s	remaining: 19m 58s
400:	learn: 0.0018555	test: 0.0016971	best: 0.0016971 (381)	total: 4m 43s	remaining: 18m 51s
500:	learn: 0.0018552	test: 0.0016970	best: 0.0016970 (500)	total: 5m 55s	remaining: 17m 44s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.001697020168
bestIteration = 500

Shrink model to first 501 iterations.
Predicting...
SUCCESS. Submission Generated.
