# WITH 174 FEATURES

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load TRAIN features
# The polynomial features come as two CSV parts that are stacked row-wise;
# the remaining features and the target are read from their own files.
poly_train_part1 = pd.read_csv("poly_train_part1.csv")
poly_train_part2 = pd.read_csv("poly_train_part2.csv")
other_train = pd.read_csv("other_train.csv")
y_train = pd.read_csv("y_train.csv")

# Stack the polynomial parts and renumber rows 0..n-1 so the index matches
# the default RangeIndex that read_csv gave other_train.
poly_train_full = pd.concat([poly_train_part1, poly_train_part2], axis=0).reset_index(drop=True)

# pd.concat(axis=1) aligns on the index and fills missing rows with NaN rather
# than failing, so a row-count mismatch would silently corrupt X_train.
# Check explicitly and fail fast with a readable message.
if len(poly_train_full) != len(other_train):
    raise ValueError(
        "Train feature row mismatch: "
        f"poly={len(poly_train_full)}, other={len(other_train)}"
    )

# Combine with other_train (column-wise)
X_train = pd.concat([poly_train_full, other_train], axis=1)

# Clean target
y_train = y_train.reset_index(drop=True)

if len(X_train) != len(y_train):
    raise ValueError(
        f"Train rows do not match targets: X={len(X_train)}, y={len(y_train)}"
    )

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# 2. Load TEST set
poly_test = pd.read_csv("poly_test.csv")
other_test = pd.read_csv("other_test.csv")
y_test = pd.read_csv("y_test.csv").reset_index(drop=True)

# Same silent-NaN hazard as for the training features.
if len(poly_test) != len(other_test):
    raise ValueError(
        f"Test feature row mismatch: poly={len(poly_test)}, other={len(other_test)}"
    )

X_test = pd.concat([poly_test, other_test], axis=1)

if len(X_test) != len(y_test):
    raise ValueError(
        f"Test rows do not match targets: X={len(X_test)}, y={len(y_test)}"
    )

print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# 3. Convert to DMatrix (XGBoost format)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 4. Train XGBoost Regressor
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",   # memory-efficient for large data
    "max_depth": 6,
    "eta": 0.1
}

num_round = 500

# The test set is passed in `evals` purely to log RMSE every 50 rounds.
# No early stopping is configured, so it does not influence the fitted model.
bst = xgb.train(params, dtrain, num_boost_round=num_round, evals=[(dtest, "eval")], verbose_eval=50)

# 5. Make Predictions & Evaluate
y_pred = bst.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

print("\n----- RESULTS -----")
print("MSE  :", mse)
print("RMSE :", rmse)
print("R²   :", r2)


X_train shape: (1885128, 174)
y_train shape: (1885128, 1)
X_test shape: (471282, 174)
y_test shape: (471282, 1)
[0]	eval-rmse:0.90631
[50]	eval-rmse:0.18163
[100]	eval-rmse:0.16842
[150]	eval-rmse:0.16034
[200]	eval-rmse:0.15391
[250]	eval-rmse:0.14942
[300]	eval-rmse:0.14568
[350]	eval-rmse:0.14207
[400]	eval-rmse:0.13901
[450]	eval-rmse:0.13624
[499]	eval-rmse:0.13380

----- RESULTS -----
MSE  : 0.017902156338095665
RMSE : 0.13379893997373696
R²   : 0.9821070432662964


# WITH 20 FEATURES

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

def _score(y_true, y_pred):
    """Return the tuple (MSE, RMSE, R²) for the given targets and predictions."""
    mse_value = mean_squared_error(y_true, y_pred)
    return mse_value, np.sqrt(mse_value), r2_score(y_true, y_pred)


# 1. LOAD DATA
# Targets are flattened to 1-D arrays; the top-feature matrices are parsed as
# float32 instead of pandas' default float64 to reduce memory.
y_train = pd.read_csv("y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("y_test.csv").to_numpy().ravel()

X_train = pd.read_csv("X_train_top_features.csv", dtype=np.float32)
X_test = pd.read_csv("X_test_top_features.csv", dtype=np.float32)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# 2. WRAP FEATURES + LABELS IN XGBoost's DMatrix CONTAINER
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 3. TRAIN THE REGRESSOR
# Histogram tree method with 70% row and 70% column subsampling, fixed seed.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    max_depth=6,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=42,
)

num_rounds = 500

# Both sets are only watched (RMSE logged every 50 rounds); no early stopping.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, "train"), (dtest, "test")],
    verbose_eval=50,
)

# 4. SCORE THE MODEL ON THE DATA IT WAS FITTED ON
y_train_pred = bst.predict(dtrain)
mse_train, rmse_train, r2_train = _score(y_train, y_train_pred)

print("\n----- TRAIN RESULTS (XGBoost) -----")
print("MSE  :", mse_train)
print("RMSE :", rmse_train)
print("R²   :", r2_train)

# 5. SCORE THE MODEL ON THE HELD-OUT TEST DATA
y_test_pred = bst.predict(dtest)
mse_test, rmse_test, r2_test = _score(y_test, y_test_pred)

print("\n----- TEST RESULTS (XGBoost) -----")
print("MSE  :", mse_test)
print("RMSE :", rmse_test)
print("R²   :", r2_test)


X_train shape: (1885128, 20)
y_train shape: (1885128,)
X_test shape: (471282, 20)
y_test shape: (471282,)
[0]	train-rmse:0.90617	test-rmse:0.90628
[50]	train-rmse:0.18378	test-rmse:0.18461
[100]	train-rmse:0.16889	test-rmse:0.17024
[150]	train-rmse:0.15999	test-rmse:0.16176
[200]	train-rmse:0.15454	test-rmse:0.15648
[250]	train-rmse:0.14928	test-rmse:0.15142
[300]	train-rmse:0.14567	test-rmse:0.14785
[350]	train-rmse:0.14221	test-rmse:0.14448
[400]	train-rmse:0.13909	test-rmse:0.14143
[450]	train-rmse:0.13646	test-rmse:0.13892
[499]	train-rmse:0.13374	test-rmse:0.13626

----- TRAIN RESULTS (XGBoost) -----
MSE  : 0.017886225221657375
RMSE : 0.13373939293139242
R²   : 0.9821199711750015

----- TEST RESULTS (XGBoost) -----
MSE  : 0.01856751469783279
RMSE : 0.13626266802698672
R²   : 0.9814420164498081


In [1]:
# STEP 1: Load data (same top-20 feature files as the previous cell)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Targets (flattened to 1-D float32 arrays), then the top-20 feature matrices
y_train = pd.read_csv("y_train.csv", dtype=np.float32).to_numpy().ravel()
y_test = pd.read_csv("y_test.csv", dtype=np.float32).to_numpy().ravel()

X_train = pd.read_csv("X_train_top_features.csv", dtype=np.float32)
X_test = pd.read_csv("X_test_top_features.csv", dtype=np.float32)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape :", X_test.shape)
print("y_test shape :", y_test.shape)

# STEP 2: Hold out 20% of the TRAINING rows for validation.
# The test files are not involved in this split.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# STEP 3: Build one DMatrix per partition
dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# STEP 4: Fit the booster while watching train and validation RMSE
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    max_depth=6,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=42,
)

num_rounds = 500

# The watchlist is for logging only (every 50 rounds); all 500 rounds are run.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, "train"), (dval, "validation")],
    verbose_eval=50,
)

# STEP 5: Metrics on the validation partition
y_val_pred = bst.predict(dval)

mse_val = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse_val = np.sqrt(mse_val)
r2_val = r2_score(y_true=y_val, y_pred=y_val_pred)

print("\n----- VALIDATION RESULTS (XGBoost) -----")
print("MSE  :", mse_val)
print("RMSE :", rmse_val)
print("R²   :", r2_val)

# STEP 6: Metrics on the test set, which is first used at this point in the cell
y_test_pred = bst.predict(dtest)

mse_test = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_true=y_test, y_pred=y_test_pred)

print("\n----- TEST RESULTS (FINAL - XGBoost) -----")
print("MSE  :", mse_test)
print("RMSE :", rmse_test)
print("R²   :", r2_test)

X_train shape: (1885128, 20)
y_train shape: (1885128,)
X_test shape : (471282, 20)
y_test shape : (471282,)
[0]	train-rmse:0.90632	validation-rmse:0.90548
[50]	train-rmse:0.18308	validation-rmse:0.18380
[100]	train-rmse:0.16806	validation-rmse:0.16895
[150]	train-rmse:0.16008	validation-rmse:0.16123
[200]	train-rmse:0.15396	validation-rmse:0.15538
[250]	train-rmse:0.14913	validation-rmse:0.15077
[300]	train-rmse:0.14517	validation-rmse:0.14713
[350]	train-rmse:0.14176	validation-rmse:0.14395
[400]	train-rmse:0.13855	validation-rmse:0.14097
[450]	train-rmse:0.13590	validation-rmse:0.13847
[499]	train-rmse:0.13352	validation-rmse:0.13615

----- VALIDATION RESULTS (XGBoost) -----
MSE  : 0.018536195158958435
RMSE : 0.13614769612064112
R²   : 0.981441855430603

----- TEST RESULTS (FINAL - XGBoost) -----
MSE  : 0.018638448789715767
RMSE : 0.13652270430121052
R²   : 0.9813711047172546


In [2]:
# STEP 1: Imports & Data Loading
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Load training data ONLY (top-20 features); the test files are not read here
X_train = pd.read_csv("X_train_top_features.csv", dtype=np.float32)
y_train = pd.read_csv("y_train.csv", dtype=np.float32).values.ravel()

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

# STEP 2: Define 5-Fold Cross-Validation
# Shuffled with a fixed random_state so the fold assignment is reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

# STEP 3: XGBoost Parameters (identical to the single-split runs on the top-20 features)
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "seed": 42
}

num_rounds = 500

# STEP 4: Run 5-Fold CV and Collect R² Scores
r2_scores = []

# enumerate(start=1) supplies the 1-based fold number, replacing a hand-maintained counter.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):

    print(f"\n--- Fold {fold} ---")

    # Split data: positional indexing for the frame, plain indexing for the 1-D target array
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Convert to DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)

    # Train model.
    # No `evals` watchlist: per-round metrics were previously computed with
    # verbose_eval=False and no early stopping, i.e. calculated and discarded.
    # Dropping them leaves the fitted model unchanged and skips 500 evaluation
    # passes per fold.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_rounds
    )

    # Predict on validation fold
    y_val_pred = model.predict(dval)

    # Compute R² on the rows this fold's model never saw
    r2 = r2_score(y_val, y_val_pred)
    r2_scores.append(r2)

    print(f"Fold {fold} R²: {r2:.6f}")

# STEP 5: Show Stability of R²
r2_scores = np.array(r2_scores)

print("\n===== 5-FOLD CROSS-VALIDATION RESULTS =====")
print("R² scores per fold:", r2_scores)
print("Mean R²          :", r2_scores.mean())
# ndarray.std() defaults to ddof=0, i.e. the population standard deviation over the 5 folds
print("Std Deviation R² :", r2_scores.std())

X_train: (1885128, 20)
y_train: (1885128,)

--- Fold 1 ---
Fold 1 R²: 0.981406

--- Fold 2 ---
Fold 2 R²: 0.981960

--- Fold 3 ---
Fold 3 R²: 0.981950

--- Fold 4 ---
Fold 4 R²: 0.981658

--- Fold 5 ---
Fold 5 R²: 0.981849

===== 5-FOLD CROSS-VALIDATION RESULTS =====
R² scores per fold: [0.98140568 0.98196036 0.9819504  0.98165828 0.9818486 ]
Mean R²          : 0.9817646622657776
Std Deviation R² : 0.00020977577246453737


In [1]:
# STEP 1: Imports & Load Data
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Load top-20 feature training and test data (parsed as float32; targets flattened to 1-D)
X_train = pd.read_csv("X_train_top_features.csv", dtype=np.float32)
y_train = pd.read_csv("y_train.csv", dtype=np.float32).values.ravel()

X_test = pd.read_csv("X_test_top_features.csv", dtype=np.float32)
y_test = pd.read_csv("y_test.csv", dtype=np.float32).values.ravel()

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test :", X_test.shape)
print("y_test :", y_test.shape)

# STEP 2: XGBoost Parameters (same as used in CV)
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "seed": 42
}

num_rounds = 500

# STEP 3: Convert full training data to DMatrix
dtrain_full = xgb.DMatrix(X_train, label=y_train)
# Test labels are deliberately not attached: dtest is only used for prediction,
# and scoring below is done with sklearn against y_test.
dtest = xgb.DMatrix(X_test)

# STEP 4: Train Final Model on Full Training Data
# verbose_eval prints nothing unless `evals` contains at least one dataset, so
# the training set is registered as a watchlist to make the 50-round progress
# log actually appear. The test set is kept out of the watchlist. Without early
# stopping the watchlist does not influence the fitted model.
final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=num_rounds,
    evals=[(dtrain_full, "train")],
    verbose_eval=50  # log train RMSE every 50 rounds
)

# STEP 5: Predict on Test Data
y_test_pred = final_model.predict(dtest)

# STEP 6: Evaluate Test Performance
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

print("\n===== FINAL TEST RESULTS =====")
print("MSE  :", mse_test)
print("RMSE :", rmse_test)
print("R²   :", r2_test)



X_train: (1885128, 20)
y_train: (1885128,)
X_test : (471282, 20)
y_test : (471282,)

===== FINAL TEST RESULTS =====
MSE  : 0.018567511811852455
RMSE : 0.13626265743721738
R²   : 0.9814420342445374


In [2]:
# STEP 7: Save model in XGBoost native JSON format
# `final_model` is not defined in this cell; it must already exist in the
# kernel (the final-training cell assigns it), so that cell has to run first.
# The file is written relative to the current working directory.
final_model.save_model("xgb_final_model.json")

# Confirmation message; reached only if save_model returned without raising.
print("Model saved successfully as 'xgb_final_model.json'")


Model saved successfully as 'xgb_final_model.json'


In [3]:
import joblib

# Save the trained XGBoost booster
# `final_model` is not defined in this cell and must already exist in the kernel.
# joblib.dump serialises the Python object itself (pickle-based).
# NOTE(review): XGBoost's model-IO guide recommends save_model/load_model for
# long-term storage, since a pickled booster is not guaranteed to load under a
# different XGBoost version. Confirm this .pkl is only a short-term snapshot.
joblib.dump(final_model, "xgb_final_model_joblib.pkl")
print("Model saved as 'xgb_final_model_joblib.pkl' using joblib")

# Later, to load it:
# (only joblib.load files you produced yourself, as unpickling can execute
#  arbitrary code; `dtest` must be rebuilt in the new session before predicting)
# loaded_model = joblib.load("xgb_final_model_joblib.pkl")
# y_test_pred = loaded_model.predict(dtest)


Model saved as 'xgb_final_model_joblib.pkl' using joblib
