# Imports & Config

In [None]:
import numpy as np
import pandas as pd
import glob
import os
import hashlib

from sklearn.linear_model import RidgeCV, Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Column read from the training CSV and from every OOF / submission CSV,
# and the column written into the final submission.
TARGET = "exam_score"
# Seed passed to the shuffled KFold splitter used for the meta-learner CV.
RANDOM_STATE = 42
# Root directory searched recursively for "*_oof.csv" prediction files.
DATA_DIR = "/kaggle/input/s6e1-models"

# Discover Models

In [None]:
# Locate every out-of-fold prediction file under DATA_DIR (searched
# recursively); sorting keeps the model order deterministic between runs.
oof_files = sorted(glob.glob(f"{DATA_DIR}/**/*_oof.csv", recursive=True))

# Swap only the trailing suffix. A plain str.replace would also rewrite an
# "_oof.csv" that happens to occur earlier in the path (e.g. a folder name).
OOF_SUFFIX = "_oof.csv"
SUB_SUFFIX = "_sub.csv"
sub_files = [f[: -len(OOF_SUFFIX)] + SUB_SUFFIX for f in oof_files]
model_names = [os.path.basename(f)[: -len(OOF_SUFFIX)] for f in oof_files]

print(f"Models discovered: {len(model_names)}")

if not oof_files:
    raise RuntimeError("No OOF files found in s6e1-models")

# Fail fast, listing every model whose submission file is absent, instead of
# stopping at the first missing path when the CSVs are read later on.
missing_subs = [f for f in sub_files if not os.path.isfile(f)]
if missing_subs:
    raise FileNotFoundError(
        "Missing submission file(s) for discovered OOF models: "
        + ", ".join(missing_subs)
    )


# Deduplicate Identical Predictions

In [None]:
# Maps the MD5 digest of a submission's TARGET column to the first model that
# produced it; keep_idx collects the positions of those first occurrences.
unique_hashes = {}
keep_idx = []

for idx, (name, sub_path) in enumerate(zip(model_names, sub_files)):
    # Hash the raw bytes of the predictions so only exact copies collide.
    column_bytes = pd.read_csv(sub_path)[TARGET].values.tobytes()
    digest = hashlib.md5(column_bytes).hexdigest()

    first_seen = unique_hashes.get(digest)
    if first_seen is not None:
        print(f"Removed duplicate: {name} == {first_seen}")
        continue

    unique_hashes[digest] = name
    keep_idx.append(idx)


In [None]:
# Restrict the three parallel lists to the positions recorded in keep_idx,
# so they stay aligned with one another.
oof_files, sub_files, model_names = (
    [entries[i] for i in keep_idx]
    for entries in (oof_files, sub_files, model_names)
)

print(f"Unique models kept: {len(model_names)}")


# Load Ground Truth + Predictions

In [None]:
# Ground-truth target values from the competition training set.
train_df = pd.read_csv("/kaggle/input/playground-series-s6e1/train.csv")
y = train_df[TARGET].values

# One column per model: read each file's TARGET column, then stack the
# columns side by side into a (rows, models) matrix.
oof_columns = [pd.read_csv(path)[TARGET].values for path in oof_files]
oofs = np.stack(oof_columns, axis=1)

sub_columns = [pd.read_csv(path)[TARGET].values for path in sub_files]
subs = np.stack(sub_columns, axis=1)

print("OOF:", oofs.shape)
print("SUB:", subs.shape)

# Clip Predictions

In [None]:
# Bounds are the smallest and largest target values observed in training.
LOW = y.min()
HIGH = y.max()

# Limit every model's predictions to the observed target range.
oofs = oofs.clip(LOW, HIGH)
subs = subs.clip(LOW, HIGH)

# Correlation-Based Feature Filtering

In [None]:
# Pearson correlation between each model's OOF predictions and the target.
cors = np.array([np.corrcoef(column, y)[0, 1] for column in oofs.T])

THRESH = 0.885   # sweet spot for this competition
# A NaN correlation (e.g. constant predictions) compares False here, so such
# a model is dropped together with the weakly correlated ones.
keep = np.where(cors > THRESH)[0]

# Stop with an explicit message rather than handing zero-column matrices to
# the meta-learner, which would fail with an unrelated-looking error.
if keep.size == 0:
    raise RuntimeError(
        f"No model passed the correlation filter (threshold {THRESH}); "
        f"OOF correlations: {np.round(cors, 4).tolist()}"
    )

# Apply the same column selection to both matrices and to the name list so
# they remain aligned.
oofs = oofs[:, keep]
subs = subs[:, keep]
model_names = [model_names[i] for i in keep]

print(f"Models after correlation filter: {len(model_names)}")

# RidgeCV Meta Learner (Alpha Selection)

In [None]:
# Candidate regularization strengths: 30 values log-spaced from 1e-4 to 1e3.
alphas = np.logspace(-4, 3, 30)

# An integer `cv` would give contiguous, unshuffled folds. Use shuffled,
# seeded folds instead so alpha is chosen under the same split scheme as the
# meta-learner evaluation and is not sensitive to row order in train.csv.
alpha_folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

ridge_cv = RidgeCV(
    alphas=alphas,
    fit_intercept=True,
    cv=alpha_folds,
)

ridge_cv.fit(oofs, y)
best_alpha = ridge_cv.alpha_
print("Best alpha:", best_alpha)

# 1. Raw Meta Ridge CV

In [None]:
# Shuffled, seeded 5-fold splitter for the meta-learner.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# meta_oof receives each row's prediction from the fold where it was held
# out; meta_sub accumulates the average of the per-fold test predictions.
meta_oof = np.zeros(len(y))
meta_sub = np.zeros(subs.shape[0])

for train_idx, valid_idx in kf.split(oofs):
    fold_model = Ridge(alpha=best_alpha).fit(oofs[train_idx], y[train_idx])
    meta_oof[valid_idx] = fold_model.predict(oofs[valid_idx])
    # Each fold contributes an equal 1/n_splits share to the test prediction.
    meta_sub += fold_model.predict(subs) / kf.n_splits

meta_mse = mean_squared_error(y, meta_oof)
meta_rmse = np.sqrt(meta_mse)
print(f"Meta Ridge CV RMSE: {meta_rmse:.6f}")


# 2. Rank Normalization

In [None]:
def rank_normalize(x):
    """Return the column-wise percentile ranks of ``x`` as a NumPy array.

    ``x`` is wrapped in a DataFrame and each column is ranked on its own.
    Ties share the average of their ranks, and ranks are expressed as a
    fraction of the column's row count (``pct=True``).
    """
    frame = pd.DataFrame(x)
    pct_ranks = frame.rank(method="average", pct=True)
    return pct_ranks.to_numpy()

# Percentile-rank every model column. The OOF matrix and the submission
# matrix are ranked in separate calls, so each is ranked only against its
# own rows.
# NOTE(review): test ranks are therefore relative to the test set rather
# than to the training distribution — confirm this is intended.
oofs_rank = rank_normalize(oofs)
subs_rank = rank_normalize(subs)

# 3. Rank Meta Ridge CV

In [None]:
# Ridge's L2 penalty depends on feature scale. best_alpha was tuned on the
# raw predictions (target scale), whereas the rank features lie in (0, 1],
# so the regularization strength is selected again on the rank features
# using the same candidate grid and the same folds.
rank_alpha_search = RidgeCV(alphas=alphas, fit_intercept=True, cv=kf)
rank_alpha_search.fit(oofs_rank, y)
rank_alpha = rank_alpha_search.alpha_
print("Best alpha (rank features):", rank_alpha)

# rank_oof receives each row's held-out prediction; rank_sub accumulates the
# average of the per-fold predictions on the ranked submission matrix.
rank_oof = np.zeros(len(y))
rank_sub = np.zeros(subs.shape[0])

for tr, val in kf.split(oofs_rank):
    model = Ridge(alpha=rank_alpha)
    model.fit(oofs_rank[tr], y[tr])
    rank_oof[val] = model.predict(oofs_rank[val])
    rank_sub += model.predict(subs_rank) / kf.n_splits

rank_rmse = np.sqrt(mean_squared_error(y, rank_oof))
print(f"Rank Meta Ridge CV RMSE: {rank_rmse:.6f}")

# 4. Blend Search (Raw + Rank)

In [None]:
# Grid of rank-model weights from 0.00 to 1.00 in steps of 0.05; the raw
# meta model receives the complementary weight.
candidate_weights = np.arange(0.0, 1.01, 0.05)
best_rmse, best_w = 1e9, 0.0

for rank_weight in candidate_weights:
    raw_weight = 1 - rank_weight
    candidate = raw_weight * meta_oof + rank_weight * rank_oof
    score = np.sqrt(mean_squared_error(y, candidate))
    print(f"Blend weight {rank_weight:.2f} -> RMSE {score:.6f}")

    # Strict comparison: on a tie the earliest (smallest) weight is kept.
    if score < best_rmse:
        best_rmse, best_w = score, rank_weight

print("\nBEST BLEND")
print(f"Rank weight: {best_w}")
print(f"Best CV RMSE: {best_rmse:.6f}")

# 5. Final Submission

In [None]:
# Combine the two test-set predictions with the selected blend weight, then
# limit the result to the target range observed in training.
raw_weight = 1 - best_w
final_sub = np.clip(raw_weight * meta_sub + best_w * rank_sub, LOW, HIGH)

# Write the blended predictions into the sample submission's TARGET column.
sample = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")
sample[TARGET] = final_sub

# The output filename records the best blended OOF RMSE.
fname = f"submission_blended_{best_rmse:.6f}.csv"
sample.to_csv(fname, index=False)

print("Saved:", fname)