In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import TargetEncoder
from sklearn.linear_model import LassoCV, ElasticNetCV
from category_encoders import TargetEncoder
import warnings
# Session setup: fix NumPy's legacy global seed and silence every warning
# raised from here on (the "ignore" filter is global, not library-specific).
np.random.seed(42)
warnings.filterwarnings("ignore")

# Competition files: train (has the target column), test, and the sample
# submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s6e1/train.csv",
        "/kaggle/input/playground-series-s6e1/test.csv",
        "/kaggle/input/playground-series-s6e1/sample_submission.csv",
    )
)

# External datasets. NOTE(review): original_df2 is loaded here but is not
# referenced by any other cell visible in this notebook.
original_df, original_df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv",
        "/kaggle/input/student-performance-prediction/student_performance_dataset.csv",
    )
)

In [6]:
# Report (rows, columns) of the three frames used for modelling, one per line.
shape_lines = [
    f"Train shape:    {train_df.shape}",
    f"Test shape:     {test_df.shape}",
    f"Original shape: {original_df.shape}",
]
print("\n".join(shape_lines))

Train shape:    (630000, 13)
Test shape:     (270000, 12)
Original shape: (20000, 13)


In [7]:
# Column roles.
TARGET = "exam_score"
ID_COL = "id"

# Base features = every training column that is neither the label nor the row id.
non_feature_cols = (TARGET, ID_COL)
base_features = [name for name in train_df.columns if name not in non_feature_cols]

# Categorical features = the object-dtype columns of the training frame.
CATS = list(train_df.select_dtypes("object").columns)

print(
    f"\nBase features: {len(base_features)}",
    f"Categorical features: {CATS}",
    sep="\n",
)


Base features: 11
Categorical features: ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [8]:
def preprocess_optimized(df, bin_edges=None):
    """
    Generate high-value features INCLUDING binned features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy. It must
        contain the raw columns read below (``study_hours``,
        ``class_attendance``, ``sleep_hours``, ``age``, ``sleep_quality``,
        ``facility_rating``, ``exam_difficulty``) and every column named in
        the module-level ``base_features`` list, which is used to build the
        returned frame.
    bin_edges : dict, optional
        Maps a raw column name (``'study_hours'``, ``'class_attendance'``,
        ``'sleep_hours'``, ``'age'``) to an increasing sequence of bin edges.
        For a column present in the mapping, values are clipped into
        ``[edges[0], edges[-1]]`` and binned on those fixed edges, so the
        resulting codes mean the same thing in every frame processed with
        the same edges. For a column that is absent (or when ``bin_edges``
        is None, the default) the column is cut into 5 equal-width bins
        spanning *this frame's own* min..max -- codes from separate calls
        on different frames are then only comparable if the frames share
        the same range.

    Returns
    -------
    (DataFrame with selected features, list of numeric feature names)
        The frame holds ``base_features`` followed by the 34 engineered
        columns, in that order.

    Notes
    -----
    Missing values in a binned column are not handled: the bin codes are
    cast to int, which is expected to fail on NaN.
    """
    df_temp = df.copy()
    eps = 1e-5  # keeps the ratio denominators away from exactly zero

    # Polynomials (2nd order only)
    for col in ('study_hours', 'class_attendance', 'sleep_hours', 'age'):
        df_temp[f'{col}_squared'] = df_temp[col] ** 2

    # Log / sqrt transforms, computed on values clipped at 0 so that negative
    # inputs cannot produce NaN.
    for col in ('study_hours', 'class_attendance', 'sleep_hours'):
        df_temp[f'log_{col}'] = np.log1p(df_temp[col].clip(lower=0))
    for col in ('study_hours', 'class_attendance'):
        df_temp[f'sqrt_{col}'] = np.sqrt(df_temp[col].clip(lower=0))

    study = df_temp['study_hours']
    attendance = df_temp['class_attendance']
    sleep = df_temp['sleep_hours']

    # Key interactions
    df_temp['study_hours_times_attendance'] = study * attendance
    df_temp['study_hours_times_sleep'] = study * sleep
    df_temp['attendance_times_sleep'] = attendance * sleep
    df_temp['age_times_study_hours'] = df_temp['age'] * study

    # Important ratios
    df_temp['study_hours_over_sleep'] = study / (sleep + eps)
    df_temp['attendance_over_sleep'] = attendance / (sleep + eps)
    df_temp['attendance_over_study'] = attendance / (study + eps)

    # Ordinal encoding; labels missing from a map fall back to the middle level (1).
    ordinal_maps = {
        'sleep_quality': {'poor': 0, 'average': 1, 'good': 2},
        'facility_rating': {'low': 0, 'medium': 1, 'high': 2},
        'exam_difficulty': {'easy': 0, 'moderate': 1, 'hard': 2},
    }
    for col, level_map in ordinal_maps.items():
        df_temp[f'{col}_numeric'] = df_temp[col].map(level_map).fillna(1).astype(int)

    sleep_q = df_temp['sleep_quality_numeric']
    facility = df_temp['facility_rating_numeric']
    difficulty = df_temp['exam_difficulty_numeric']

    # Ordinal x numeric interactions
    df_temp['study_hours_times_sleep_quality'] = study * sleep_q
    df_temp['attendance_times_facility'] = attendance * facility
    df_temp['sleep_hours_times_difficulty'] = sleep * difficulty

    # Ordinal x ordinal interactions
    df_temp['facility_x_sleepq'] = facility * sleep_q
    df_temp['difficulty_x_facility'] = difficulty * facility

    # Rule-based flags
    df_temp["high_att_high_study"] = ((attendance >= 90) & (study >= 6)).astype(int)
    df_temp["ideal_sleep_flag"] = ((sleep >= 7) & (sleep <= 9)).astype(int)
    df_temp["high_study_flag"] = (study >= 7).astype(int)

    # Composite efficiency
    df_temp['efficiency'] = (study * attendance) / (sleep + 1)

    # Gap features
    df_temp['sleep_gap_8'] = (sleep - 8.0).abs()
    df_temp['attendance_gap_100'] = (attendance - 100.0).abs()

    # BINNED FEATURES (KEEP THESE - THEY ARE VALUABLE!)
    bin_sources = {
        'study_bin_num': 'study_hours',
        'attendance_bin_num': 'class_attendance',
        'sleep_bin_num': 'sleep_hours',
        'age_bin_num': 'age',
    }
    shared_edges = bin_edges or {}
    for out_col, src_col in bin_sources.items():
        edges = shared_edges.get(src_col)
        if edges is None:
            # Original behaviour: 5 equal-width bins over this frame's range.
            codes = pd.cut(df_temp[src_col], bins=5, labels=False)
        else:
            # Fixed edges: clip first so out-of-range values land in the
            # first/last bin instead of becoming NaN.
            edges = np.asarray(edges, dtype=float)
            clipped = df_temp[src_col].clip(lower=edges[0], upper=edges[-1])
            codes = pd.cut(clipped, bins=edges, labels=False, include_lowest=True)
        df_temp[out_col] = codes.astype(int)

    # Feature list (34 features total); this order defines the output column order.
    numeric_features = [
        'study_hours_squared', 'class_attendance_squared', 'sleep_hours_squared', 'age_squared',
        'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
        'sqrt_study_hours', 'sqrt_class_attendance',
        'study_hours_times_attendance', 'study_hours_times_sleep', 'attendance_times_sleep',
        'age_times_study_hours',
        'study_hours_over_sleep', 'attendance_over_sleep', 'attendance_over_study',
        'sleep_quality_numeric', 'facility_rating_numeric', 'exam_difficulty_numeric',
        'study_hours_times_sleep_quality', 'attendance_times_facility', 'sleep_hours_times_difficulty',
        'facility_x_sleepq', 'difficulty_x_facility',
        'high_att_high_study', 'ideal_sleep_flag', 'high_study_flag',
        'efficiency',
        'sleep_gap_8', 'attendance_gap_100',
        'study_bin_num', 'attendance_bin_num', 'sleep_bin_num', 'age_bin_num'
    ]

    return df_temp[base_features + numeric_features], numeric_features

In [9]:
# Engineer features for train, test and the original dataset in ONE call.
# preprocess_optimized derives its equal-width bin edges from the frame it is
# given, so processing the three frames separately would give each its own
# edges; running it on the stacked frame makes the *_bin_num codes comparable.
n_train, n_test = len(train_df), len(test_df)

combined_raw = pd.concat([train_df, test_df, original_df], axis=0, ignore_index=True)
features_all, numeric_cols = preprocess_optimized(combined_raw)

# Engineered columns are cast to float; astype with a mapping returns a new frame.
full_data = features_all.astype({col: float for col in numeric_cols})

y = train_df[TARGET].reset_index(drop=True)
y_orig = original_df[TARGET].reset_index(drop=True)

# Split the stacked frame back apart by position: train rows, then test, then original.
X = full_data.iloc[:n_train].copy()
X_test = full_data.iloc[n_train:n_train + n_test].copy()
X_original = full_data.iloc[n_train + n_test:].copy()

assert len(X) == len(y)
assert len(X_original) == len(y_orig)

print(f"Engineered features: {len(numeric_cols)}")
print(f"Total features: {X.shape[1]} ({len(base_features)} base + {len(numeric_cols)} engineered)")

Engineered features: 34
Total features: 45 (11 base + 34 engineered)


In [10]:
FOLDS = 10

# The folds are stratified on 10 quantile bins of the continuous target.
y_bins = pd.qcut(y, q=10, labels=False)
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1003)

n_train_rows, n_test_rows, n_orig_rows = len(X), len(X_test), len(X_original)

# Out-of-fold predictions on the training rows, one vector per linear model.
oof_pred_lr, oof_pred_lasso, oof_pred_enet = (
    np.zeros(n_train_rows) for _ in range(3)
)

# Test predictions: one column per fold.
test_preds_lr, test_preds_lasso, test_preds_enet = (
    np.zeros((n_test_rows, FOLDS)) for _ in range(3)
)

# Accumulators for predictions on the original-dataset rows.
orig_preds_lr, orig_preds_lasso, orig_preds_enet = (
    np.zeros(n_orig_rows) for _ in range(3)
)

In [None]:


# Cross-validated Ridge / Lasso / ElasticNet. Each fold trains on the fold's
# training rows plus ALL rows of the original dataset, then writes:
#   - out-of-fold predictions for the held-out rows   -> oof_pred_*
#   - test predictions into this fold's column        -> test_preds_*
#   - predictions for the original rows, / FOLDS      -> orig_preds_* (running mean)

# orig_preds_* are accumulated with += below. Zero them here so that re-running
# this cell (for example after an interrupted run) does not double-count folds.
for fold_sum in (orig_preds_lr, orig_preds_lasso, orig_preds_enet):
    fold_sum.fill(0.0)

for fold, (tr, val) in enumerate(kf.split(X, y_bins), 1):

    X_tr, X_val = X.iloc[tr], X.iloc[val]
    y_tr, y_val = y.iloc[tr], y.iloc[val]

    X_tr_full = pd.concat([X_tr, X_original], axis=0, ignore_index=True)
    y_tr_full = pd.concat([y_tr, y_orig], axis=0, ignore_index=True)

    # NOTE(review): two classes named TargetEncoder are imported at the top of
    # this notebook (sklearn.preprocessing, then category_encoders); the name
    # refers to whichever import ran last.
    te = TargetEncoder()

    X_tr_enc = X_tr_full.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()

    # The encoder is fitted on this fold's training rows only.
    X_tr_enc[CATS] = te.fit_transform(X_tr_full[CATS], y_tr_full)
    X_val_enc[CATS] = te.transform(X_val[CATS])
    X_test_enc[CATS] = te.transform(X_test[CATS])

    # The original rows were appended after the fold's training rows, so they
    # are the tail of the encoded training frame. They are part of the fit
    # data, which makes the orig_preds_* values in-sample predictions.
    X_orig_enc = X_tr_enc.iloc[len(X_tr):]

    ridge = RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5)
    lasso = LassoCV(alphas=np.logspace(-4, 1, 30), cv=5, n_jobs=-1)
    enet = ElasticNetCV(
        l1_ratio=[0.2, 0.5, 0.8],
        alphas=np.logspace(-4, 1, 20),
        cv=5,
        n_jobs=-1
    )

    fold_models = (
        (ridge, oof_pred_lr, test_preds_lr, orig_preds_lr),
        (lasso, oof_pred_lasso, test_preds_lasso, orig_preds_lasso),
        (enet, oof_pred_enet, test_preds_enet, orig_preds_enet),
    )
    for estimator, oof_store, test_store, orig_store in fold_models:
        estimator.fit(X_tr_enc, y_tr_full)

        oof_store[val] = estimator.predict(X_val_enc)
        test_store[:, fold - 1] = estimator.predict(X_test_enc)
        # In-place add on the shared ndarray, so the module-level buffer is updated.
        orig_store += estimator.predict(X_orig_enc) / FOLDS

    rmse = np.sqrt(mean_squared_error(y_val, oof_pred_lr[val]))
    print(f"Fold {fold} | Ridge RMSE: {rmse:.5f}")


Fold 1 | Ridge RMSE: 8.84631


In [None]:
# For XGBoost, every base feature (numeric ones included) is converted to its
# string form and then to a pandas categorical. This mutates full_data in place.
for base_col in base_features:
    as_text = full_data[base_col].astype(str)
    full_data[base_col] = as_text.astype("category")

# Positional boundaries of the train / test / original segments in full_data.
train_end = len(train_df)
test_end = train_end + len(test_df)

X_xgb = full_data.iloc[:train_end].copy()
X_test_xgb = full_data.iloc[train_end:test_end].copy()
X_original_xgb = full_data.iloc[test_end:].copy()


In [None]:
# Attach the linear-model predictions as extra input columns for XGBoost:
#   train rows    -> out-of-fold predictions
#   test rows     -> mean over the per-fold test predictions
#   original rows -> the accumulated orig_preds_* values
stack_columns = {
    "ridge_pred": (oof_pred_lr, test_preds_lr, orig_preds_lr),
    "lasso_pred": (oof_pred_lasso, test_preds_lasso, orig_preds_lasso),
    "enet_pred": (oof_pred_enet, test_preds_enet, orig_preds_enet),
}

for feature_name, (oof_vals, test_fold_preds, orig_vals) in stack_columns.items():
    X_xgb[feature_name] = oof_vals
    X_test_xgb[feature_name] = test_fold_preds.mean(axis=1)
    X_original_xgb[feature_name] = orig_vals


In [None]:
# XGBoost settings. early_stopping_rounds is combined with the eval_set passed
# to fit() below, so the held-out fold is the set monitored during training.
xgb_params = {
    "n_estimators": 40000,
    "learning_rate": 0.003,
    "max_depth": 8,
    "subsample": 0.82,
    "colsample_bytree": 0.52,
    "colsample_bynode": 0.68,
    "min_child_weight": 7,
    "reg_lambda": 8.0,
    "reg_alpha": 0.3,
    "tree_method": "hist",
    "enable_categorical": True,
    "eval_metric": "rmse",
    "random_state": 42,
    "early_stopping_rounds": 250,
    "device": "cuda"
}

oof_xgb = np.zeros(len(X_xgb))  # out-of-fold predictions for the training rows
test_preds_xgb = []             # one test-prediction vector per fold

# Same splitter and stratification bins as the linear-model stage.
for fold, (tr, val) in enumerate(kf.split(X_xgb, y_bins), start=1):

    X_val, y_val = X_xgb.iloc[val], y.iloc[val]

    # Training data for the fold: the fold's training rows plus all original rows.
    X_tr_full = pd.concat([X_xgb.iloc[tr], X_original_xgb])
    y_tr_full = pd.concat([y.iloc[tr], y_orig])

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_tr_full, y_tr_full, eval_set=[(X_val, y_val)], verbose=False)

    oof_xgb[val] = model.predict(X_val)
    test_preds_xgb.append(model.predict(X_test_xgb))

    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_xgb[val]))
    print(f"Fold {fold} | XGB RMSE:", fold_rmse)


In [None]:
# Final test prediction: average the per-fold XGBoost test predictions, then
# clip the result to the interval [0, 100].
fold_average = np.mean(test_preds_xgb, axis=0)
final_test_pred = np.clip(fold_average, 0, 100)

# Submission file: test ids alongside the predicted target.
submission = pd.DataFrame({"id": test_df["id"], TARGET: final_test_pred})
submission.to_csv("submission.csv", index=False)

print("submission.csv saved")

In [None]:


# Actual vs. predicted on validation data: the true training targets against
# the out-of-fold XGBoost predictions for the same rows. (Test predictions
# cannot be plotted here: the test set has no targets and a different length
# from any validation fold.)
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y, oof_xgb, alpha=0.4)

# Red diagonal = perfect prediction.
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, color="red")

ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted (Validation)")
plt.show()
