In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file beneath the Kaggle input directory,
# in the order os.walk visits them, and print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/abes-dataset/sample_submission.csv
/kaggle/input/abes-dataset/train.csv
/kaggle/input/abes-dataset/test.csv


In [18]:
!pip install xgboost




In [19]:
import xgboost as xgb
from xgboost import XGBRegressor


In [20]:
#Prakash Tiwari
#2300320120175(CS-C)
#prakash.23b0121192@abes.ac.in

In [21]:
# Display the installed xgboost version; the bare last expression is rendered as the cell output.
# (xgboost is re-imported here; an earlier cell already imports it under the same alias.)
import xgboost as xgb
xgb.__version__


'2.0.3'

In [22]:
# ===============================
# 1. Imports
# ===============================
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Apply seaborn's "whitegrid" plotting style.
sns.set(style="whitegrid")

# ===============================
# 2. Load Data
# ===============================
# Read the three competition CSVs in one pass; the unpacking order matches the path order.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/abes-dataset/train.csv",
        "/kaggle/input/abes-dataset/test.csv",
        "/kaggle/input/abes-dataset/sample_submission.csv",
    )
)

# Report the dimensions of each frame, then preview the first training rows.
for caption, frame in (
    ("Train shape:", train),
    ("Test shape:", test),
    ("Sample submission shape:", sample_submission),
):
    print(caption, frame.shape)
display(train.head())

# ===============================
# 3. Feature Engineering
# ===============================

def bmi_category(bmi):
    """Bucket a body-mass-index value into a coarse weight class.

    Parameters
    ----------
    bmi : float
        Body-mass index. May be missing (``None``, ``NaN`` or ``pd.NA``).

    Returns
    -------
    str
        ``"underweight"`` (bmi < 18.5), ``"normal"`` (bmi < 25),
        ``"overweight"`` (bmi < 30), ``"obese"`` (bmi >= 30), or
        ``"unknown"`` when ``bmi`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing BMI
    # would fall through all branches and be mislabeled as "obese".
    if pd.isna(bmi):
        return "unknown"
    if bmi < 18.5:
        return "underweight"
    if bmi < 25:
        return "normal"
    if bmi < 30:
        return "overweight"
    return "obese"

# ---- Shared feature-engineering settings ----
# Age buckets passed to pd.cut: right-closed intervals, with the first one also
# including its left edge (include_lowest=True).
bins = [0, 30, 45, 60, 100]
labels = ["young", "adult", "mid_age", "senior"]


def encode_smoker(smoker):
    """Map the smoker column's "no"/"yes" strings to 0/1; any other value becomes NaN."""
    return smoker.map({"no": 0, "yes": 1})


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the engineered columns appended.

    Added columns, in order: ``bmi_cat``, ``age_group``, ``age_smoker``, ``age_bmi``.
    The input frame is not modified (``DataFrame.assign`` returns a new object).
    Running train and test through this single function keeps their feature
    definitions identical instead of relying on two hand-maintained copies.
    """
    return frame.assign(
        # ---- BMI category ----
        bmi_cat=frame["bmi"].apply(bmi_category),
        # ---- Age group ----
        age_group=pd.cut(frame["age"], bins=bins, labels=labels, include_lowest=True),
        # ---- Interaction features ----
        age_smoker=frame["age"] * encode_smoker(frame["smoker"]),
        age_bmi=frame["age"] * frame["bmi"],
    )


# The raw frames stay untouched; all engineered columns live on the *_fe copies.
train_fe = add_engineered_features(train)
test_fe = add_engineered_features(test)

# Still exposed because the original cell left these names in the notebook namespace.
train_smoker_num = encode_smoker(train_fe["smoker"])
test_smoker_num = encode_smoker(test_fe["smoker"])

# Quick check
display(train_fe.head())

# ===============================
# 4. Define Features and Target
# ===============================

# The target is the 'charges' column; every column except it and 'id' is kept as a feature.
y = train_fe["charges"]
X = train_fe.drop(["charges", "id"], axis="columns")

# Set the test ids aside and strip them from the test feature frame.
test_ids = test_fe["id"]
X_test_final = test_fe.drop("id", axis="columns")

print("Train features shape:", X.shape)
print("Test features shape:", X_test_final.shape)

# ===============================
# 5. Train / Validation Split
# ===============================
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)

# ===============================
# 6. Preprocessing: Scaling + OneHot
# ===============================

# Columns routed to each branch of the column transformer.
numeric_features = ["age", "bmi", "children", "age_smoker", "age_bmi"]
categorical_features = ["sex", "smoker", "region", "bmi_cat", "age_group"]

# Numeric columns are standardized; categorical columns are one-hot encoded with the
# first level of each column dropped and unseen levels ignored at transform time.
# NOTE(review): combining drop= with handle_unknown="ignore" is not accepted by older
# scikit-learn releases — confirm against the environment's version.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    categorical_features,
)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ===============================
# 7. Improved Gradient Boosting Model
#    (tuned hyperparameters)
# ===============================

# Hyperparameters of the gradient-boosting regressor used for validation.
gbr_params = dict(
    n_estimators=600,
    learning_rate=0.08,
    max_depth=4,
    subsample=0.8,
    random_state=42,
)

# Pipeline.fit fits its steps in place, so this pipeline gets its own clone of the
# preprocessing template. Embedding the shared `preprocess` object directly would let
# any other pipeline built on it refit this pipeline's scaler/encoder behind its back.
gbr_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", GradientBoostingRegressor(**gbr_params)),
])

# ===============================
# 8. Train on Train Split & Evaluate on Validation
# ===============================
# Fit on the training split and score on the held-out validation rows.
gbr_model.fit(X_train, y_train)
y_pred_valid = gbr_model.predict(X_valid)

# Take the square root explicitly: the `squared=False` shortcut of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works everywhere.
rmse_gbr = np.sqrt(mean_squared_error(y_valid, y_pred_valid))

print(f"Validation RMSE (Improved Gradient Boosting): {rmse_gbr:.4f}")

# ===============================
# 9. Train Final Model on FULL Training Data
# ===============================

# Reuse the validated configuration instead of retyping it: clone() returns an
# unfitted deep copy of `gbr_model` (hyperparameters and preprocessing steps included),
# so the final model cannot drift from what was validated, and fitting it does not
# refit any transformer object that `gbr_model` still holds.
final_model = clone(gbr_model)

# Refit on every labelled row before predicting the test set.
final_model.fit(X, y)

# ===============================
# 10. Predict on Test Set
# ===============================
# Predict charges for the test feature frame with the pipeline that was fit on the full training data.
test_predictions = final_model.predict(X_test_final)
print("Test predictions shape:", test_predictions.shape)

# ===============================
# 11. Create Submission File
# ===============================

# Build the submission from the ids of the rows that were actually scored
# (`test_ids` was set aside for this purpose but never used). Copying
# `sample_submission` and overwriting "charges" positionally is only correct when
# that file lists ids in the same order as test.csv, which nothing here verifies.
submission = pd.DataFrame({
    "id": test_ids.to_numpy(),
    "charges": test_predictions,
})

display(submission.head())

submission.to_csv("submission.csv", index=False)
print("submission.csv has been saved!")


Train shape: (2217, 8)
Test shape: (555, 7)
Sample submission shape: (555, 2)


Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,1865,21,male,36.85,0,no,southeast,1534.3045
1,1998,38,female,34.8,2,no,southwest,6571.544
2,1337,21,female,25.8,0,no,southwest,2007.945
3,656,52,female,25.3,2,yes,southeast,24667.419
4,262,20,female,26.84,1,yes,southeast,17085.2676


Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges,bmi_cat,age_group,age_smoker,age_bmi
0,1865,21,male,36.85,0,no,southeast,1534.3045,obese,young,0,773.85
1,1998,38,female,34.8,2,no,southwest,6571.544,obese,adult,0,1322.4
2,1337,21,female,25.8,0,no,southwest,2007.945,overweight,young,0,541.8
3,656,52,female,25.3,2,yes,southeast,24667.419,overweight,mid_age,52,1315.6
4,262,20,female,26.84,1,yes,southeast,17085.2676,overweight,young,20,536.8


Train features shape: (2217, 10)
Test features shape: (555, 10)
X_train: (1773, 10) X_valid: (444, 10)
Validation RMSE (Improved Gradient Boosting): 2926.4962
Test predictions shape: (555,)


Unnamed: 0,id,charges
0,1107,9021.084679
1,1322,28702.03472
2,2314,12207.114327
3,2275,1436.851418
4,1433,4682.551092


submission.csv has been saved!


In [23]:
# List the entries of the current working directory (e.g. to check that submission.csv is there).
import os
os.listdir()



['.virtual_documents', 'submission.csv']

In [24]:
# Write the current `submission` frame to /kaggle/working using an absolute path.
# NOTE(review): the boilerplate in the first cell states that the current directory is
# /kaggle/working/, so this presumably rewrites the file saved by the earlier to_csv call — confirm.
submission.to_csv("/kaggle/working/submission.csv", index=False)


In [25]:
# List the entries of /kaggle/working, the directory the submission was just written to.
import os
os.listdir("/kaggle/working")


['.virtual_documents', 'submission.csv']

In [26]:
# Transform target
# log1p(y) = log(1 + y); the XGBoost cells below train on this scale and map
# predictions back to charges with np.expm1.
y_log = np.log1p(y)


In [27]:
# Hyperparameters of the XGBoost regressor used for validation.
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',
)

# Pipeline.fit fits its steps in place, so this pipeline gets its own clone of the
# preprocessing template. Embedding the shared `preprocess` object directly would
# refit (and silently change) the preprocessing of every other pipeline built on it.
xgb_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", XGBRegressor(**xgb_params)),
])


In [28]:
# Train on the log1p-transformed target, then map the validation predictions back
# to the original charge scale before scoring.
xgb_model.fit(X_train, np.log1p(y_train))
y_pred_valid_log = xgb_model.predict(X_valid)
y_pred_valid = np.expm1(y_pred_valid_log)

# Take the square root explicitly: the `squared=False` shortcut of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works everywhere.
# float() keeps the displayed value a plain number regardless of the NumPy version.
rmse_xgb = float(np.sqrt(mean_squared_error(y_valid, y_pred_valid)))
rmse_xgb


2999.6775151522206

In [29]:
# Reuse the validated configuration instead of retyping it: clone() returns an
# unfitted deep copy of `xgb_model` (hyperparameters and preprocessing steps included),
# so the final model cannot drift from what was validated, and fitting it does not
# refit any transformer object that `xgb_model` still holds.
final_xgb = clone(xgb_model)

# Refit on every labelled row against the log1p-transformed target.
final_xgb.fit(X, y_log)


In [30]:
# `final_xgb` was fit on log1p(charges), so its output is on the log scale;
# expm1 inverts log1p to bring the predictions back to charges.
test_pred_log = final_xgb.predict(X_test_final)
test_predictions = np.expm1(test_pred_log)


In [31]:
# NOTE(review): this overwrites the submission.csv produced from the gradient-boosting
# model, although the recorded validation RMSE of the XGBoost pipeline (about 2999.68)
# is higher than the gradient-boosting one (about 2926.50) — confirm this is the
# intended final file.
#
# Build the submission from the ids of the rows that were actually scored. Copying
# `sample_submission` and overwriting "charges" positionally is only correct when
# that file lists ids in the same order as test.csv, which nothing here verifies.
submission = pd.DataFrame({
    "id": test_ids.to_numpy(),
    "charges": test_predictions,
})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved!")


submission.csv saved!
