In [1]:
!pip install category_encoders -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from category_encoders import TargetEncoder
from xgboost import XGBRegressor
import warnings

# Silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# === Load Data ===
# Forward slashes resolve on Windows, Linux and macOS; the previous
# 'data\\...' literals only pointed into the data folder on Windows.
TRAIN_CSV = 'data/final_train_v3.csv'
# FIXME(review): the "test" frame is read from the *training* CSV, so every
# Test metric printed below is computed on rows the model was fitted on.
# Point this at the real hold-out file once its name is confirmed.
TEST_CSV = 'data/final_train_v3.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# This model covers rows whose target does not exceed the 95th percentile of
# the training target. `~(x > upper)` (rather than `x <= upper`) keeps rows
# with a missing target, exactly as the original in-place drop did.
upper = train_df['Total_Expense'].quantile(0.95)
train_df = train_df[~(train_df['Total_Expense'] > upper)]
test_df  = test_df[~(test_df['Total_Expense'] > upper)]

# Remove head_gender == 3 so the column is left with the values the binary
# encoding below expects, and drop the household identifier.
train_df = train_df[train_df.head_gender != 3].drop(columns=['HH_ID'])
test_df  = test_df[test_df.head_gender != 3].drop(columns=['HH_ID'])

# Copy the raw household sizes now: the column is standardised further down
# and the weights must not follow that transformation.
train_weights = train_df['household_size'].copy()
test_weights  = test_df['household_size'].copy()

# === Column Lists ===
binary_cols = ['Sector','mobile','head_gender','internet_use']
high_card_cols = ['State','NSS_region','District','household_type','head_religion','head_social','profession','industry','head_education']
int_cols = ['household_size','head_age','head_education_years','Is_couple']
# Everything that is not listed above and is not the target is scaled as a float feature.
float_cols = [c for c in train_df.columns if c not in binary_cols + high_card_cols + int_cols + ['Total_Expense']]

# === Binary Encoding ===
# The 0/1 assignment follows the order in which the two values first appear in
# train followed by test. Anything other than exactly two non-null values
# cannot be encoded, so fail with a clear message instead of an IndexError or
# an opaque NaN-to-int cast error.
for col in binary_cols:
    combined = pd.concat([train_df[col], test_df[col]])
    if combined.isna().any():
        raise ValueError(f"Binary column {col!r} contains missing values")
    uniques = combined.unique()
    if len(uniques) != 2:
        raise ValueError(
            f"Binary column {col!r} must have exactly 2 distinct values, "
            f"found {len(uniques)}: {list(uniques)}"
        )
    mapping = {uniques[0]: 0, uniques[1]: 1}
    train_df[col] = train_df[col].map(mapping).astype(int)
    test_df[col]  = test_df[col].map(mapping).astype(int)

# === Target Encoding ===
# The encoder is fitted on the training rows only and then applied to the test rows.
te = TargetEncoder(cols=high_card_cols)
y_train = train_df.pop('Total_Expense')
train_df = te.fit_transform(train_df, y_train)
y_test = test_df.pop('Total_Expense')
test_df = te.transform(test_df)

# === Scaling ===
# Scaler statistics come from the training rows only.
numeric_cols = int_cols + float_cols
scaler = StandardScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
test_df[numeric_cols]  = scaler.transform(test_df[numeric_cols])

X_train, X_test = train_df, test_df

# === Train XGB with Best Hyperparameters ===
best_params = {
    'n_estimators': 750,
    'max_depth': 10,
    'learning_rate': 0.05,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'objective': 'reg:squarederror'
}

model = XGBRegressor(**best_params)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)
# === Evaluation Function ===
def eval_metrics(y_true, y_pred, weights=None):
    """Compute MAE, MAPE, RMSE, MSE and R2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same row order as ``y_true``.
    weights : array-like, optional
        When given, ``y_true`` and ``y_pred`` are both multiplied element-wise
        by it before any metric is computed. This rescales the values; it is
        not forwarded to the metric functions as ``sample_weight``.

    Returns
    -------
    tuple
        ``(mae, mape, rmse, mse, r2)``. MAPE is expressed in percent and
        ignores rows whose (rescaled) true value is zero.
    """
    # Plain arrays pair the inputs by position. Predictions carry no index, so
    # position is the only pairing that is valid for all three inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if weights is not None:
        weights = np.asarray(weights, dtype=float)
        y_true, y_pred = y_true * weights, y_pred * weights
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    # Zero targets make the percentage error undefined: mask them with NaN and
    # average with nanmean so one zero does not turn the whole MAPE into NaN.
    safe_truth = np.where(y_true == 0, np.nan, y_true)
    mape = np.nanmean(np.abs((y_true - y_pred) / safe_truth)) * 100
    r2 = r2_score(y_true, y_pred)
    return mae, mape, rmse, mse, r2

# Labels for the values returned by eval_metrics, in the order it returns them.
metrics_names = ['MAE','MAPE','RMSE','MSE','R2']

# Errors on the target exactly as stored in y_train / y_test.
print("\n=== Unweighted Metrics ===")
unweighted_cases = {
    'Train': (y_train, y_pred_train),
    'Test': (y_test, y_pred_test),
}
for split_name, (actual, predicted) in unweighted_cases.items():
    scores = eval_metrics(actual, predicted)
    rendered = [f"{name}={score:.4f}" for name, score in zip(metrics_names, scores)]
    print(f"{split_name}: " + ", ".join(rendered))

# Same metrics with truth and prediction multiplied by household size.
print("\n=== Weighted Metrics ===")
weighted_cases = {
    'Train Weighted': (y_train, y_pred_train, train_weights),
    'Test Weighted': (y_test, y_pred_test, test_weights),
}
for split_name, (actual, predicted, household_weights) in weighted_cases.items():
    scores = eval_metrics(actual, predicted, household_weights)
    rendered = [f"{name}={score:.4f}" for name, score in zip(metrics_names, scores)]
    print(f"{split_name}: " + ", ".join(rendered))



=== Unweighted Metrics ===
Train: MAE=629.1631, MAPE=15.0724, RMSE=874.5580, MSE=764851.6489, R2=0.8729
Test: MAE=1026.5303, MAPE=23.3578, RMSE=1413.2334, MSE=1997228.6665, R2=0.6682

=== Weighted Metrics ===
Train Weighted: MAE=2546.3126, MAPE=15.0724, RMSE=3625.8289, MSE=13146635.1171, R2=0.8893
Test Weighted: MAE=4123.7400, MAPE=23.3578, RMSE=5978.6455, MSE=35744202.2905, R2=0.7002


In [None]:
import os

import joblib

# Save the trained model to disk.
# The target folder is created if missing so the dump cannot fail on a fresh
# checkout. A forward-slash path is used because the previous 'model\\...'
# literal only pointed into the folder on Windows.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/xgb_regressor_model_below95.pkl')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from category_encoders import TargetEncoder
from xgboost import XGBRegressor
import warnings

# Silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# === Load Data ===
# Forward slashes resolve on Windows, Linux and macOS; the previous
# 'data\\...' literals only pointed into the data folder on Windows.
TRAIN_CSV = 'data/final_train_v3.csv'
# FIXME(review): the "test" frame is read from the *training* CSV, so every
# Test metric printed below is computed on rows the model was fitted on.
# Point this at the real hold-out file once its name is confirmed.
TEST_CSV = 'data/final_train_v3.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# This model covers rows whose target is at or above the 95th percentile of
# the training target. `~(x < upper)` (rather than `x >= upper`) keeps rows
# with a missing target, exactly as the original in-place drop did.
upper = train_df['Total_Expense'].quantile(0.95)
train_df = train_df[~(train_df['Total_Expense'] < upper)]
test_df  = test_df[~(test_df['Total_Expense'] < upper)]

# Remove head_gender == 3 so the column is left with the values the binary
# encoding below expects, and drop the household identifier.
train_df = train_df[train_df.head_gender != 3].drop(columns=['HH_ID'])
test_df  = test_df[test_df.head_gender != 3].drop(columns=['HH_ID'])

# Copy the raw household sizes now: the column is standardised further down
# and the weights must not follow that transformation.
train_weights = train_df['household_size'].copy()
test_weights  = test_df['household_size'].copy()

# === Column Lists ===
binary_cols = ['Sector','mobile','head_gender','internet_use']
high_card_cols = ['State','NSS_region','District','household_type','head_religion','head_social','profession','industry','head_education']
int_cols = ['household_size','head_age','head_education_years','Is_couple']
# Everything that is not listed above and is not the target is scaled as a float feature.
float_cols = [c for c in train_df.columns if c not in binary_cols + high_card_cols + int_cols + ['Total_Expense']]

# === Binary Encoding ===
# The 0/1 assignment follows the order in which the two values first appear in
# train followed by test. Anything other than exactly two non-null values
# cannot be encoded, so fail with a clear message instead of an IndexError or
# an opaque NaN-to-int cast error.
for col in binary_cols:
    combined = pd.concat([train_df[col], test_df[col]])
    if combined.isna().any():
        raise ValueError(f"Binary column {col!r} contains missing values")
    uniques = combined.unique()
    if len(uniques) != 2:
        raise ValueError(
            f"Binary column {col!r} must have exactly 2 distinct values, "
            f"found {len(uniques)}: {list(uniques)}"
        )
    mapping = {uniques[0]: 0, uniques[1]: 1}
    train_df[col] = train_df[col].map(mapping).astype(int)
    test_df[col]  = test_df[col].map(mapping).astype(int)

# === Target Encoding ===
# The encoder is fitted on the training rows only and then applied to the test rows.
te = TargetEncoder(cols=high_card_cols)
y_train = train_df.pop('Total_Expense')
train_df = te.fit_transform(train_df, y_train)
y_test = test_df.pop('Total_Expense')
test_df = te.transform(test_df)

# === Scaling ===
# Scaler statistics come from the training rows only.
numeric_cols = int_cols + float_cols
scaler = StandardScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
test_df[numeric_cols]  = scaler.transform(test_df[numeric_cols])

X_train, X_test = train_df, test_df

# === Grid Search ===
# NOTE(review): the target encoder and scaler above were fitted on the whole
# training frame before cross-validation, so each validation fold has already
# contributed its targets to the encoded features. The CV scores used to pick
# the parameters are therefore optimistic; wrapping the preprocessing and the
# regressor in a Pipeline would remove that leak.
xgb = XGBRegressor(random_state=42, objective='reg:squarederror')
param_grid = {
    'n_estimators': [50, 100, 250, 500],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.5, 0.7, 0.85]
}

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)
grid.fit(X_train, y_train)

print("✅ Best XGB Params:", grid.best_params_)
# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained on
# (X_train, y_train) and a second fit would only repeat that work.
model = grid.best_estimator_

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

# === Evaluation Function ===
def eval_metrics(y_true, y_pred, weights=None):
    """Compute MAE, MAPE, RMSE, MSE and R2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same row order as ``y_true``.
    weights : array-like, optional
        When given, ``y_true`` and ``y_pred`` are both multiplied element-wise
        by it before any metric is computed. This rescales the values; it is
        not forwarded to the metric functions as ``sample_weight``.

    Returns
    -------
    tuple
        ``(mae, mape, rmse, mse, r2)``. MAPE is expressed in percent and
        ignores rows whose (rescaled) true value is zero.
    """
    # Plain arrays pair the inputs by position. Predictions carry no index, so
    # position is the only pairing that is valid for all three inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if weights is not None:
        weights = np.asarray(weights, dtype=float)
        y_true, y_pred = y_true * weights, y_pred * weights
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    # Zero targets make the percentage error undefined: mask them with NaN and
    # average with nanmean so one zero does not turn the whole MAPE into NaN.
    safe_truth = np.where(y_true == 0, np.nan, y_true)
    mape = np.nanmean(np.abs((y_true - y_pred) / safe_truth)) * 100
    r2 = r2_score(y_true, y_pred)
    return mae, mape, rmse, mse, r2

# Labels for the values returned by eval_metrics, in the order it returns them.
metrics_names = ['MAE','MAPE','RMSE','MSE','R2']

# Errors on the target exactly as stored in y_train / y_test.
print("\n=== Unweighted Metrics ===")
unweighted_cases = {
    'Train': (y_train, y_pred_train),
    'Test': (y_test, y_pred_test),
}
for split_name, (actual, predicted) in unweighted_cases.items():
    scores = eval_metrics(actual, predicted)
    rendered = [f"{name}={score:.4f}" for name, score in zip(metrics_names, scores)]
    print(f"{split_name}: " + ", ".join(rendered))

# Same metrics with truth and prediction multiplied by household size.
print("\n=== Weighted Metrics ===")
weighted_cases = {
    'Train Weighted': (y_train, y_pred_train, train_weights),
    'Test Weighted': (y_test, y_pred_test, test_weights),
}
for split_name, (actual, predicted, household_weights) in weighted_cases.items():
    scores = eval_metrics(actual, predicted, household_weights)
    rendered = [f"{name}={score:.4f}" for name, score in zip(metrics_names, scores)]
    print(f"{split_name}: " + ", ".join(rendered))


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
✅ Best XGB Params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1.0}

=== Unweighted Metrics ===
Train: MAE=4577.4831, MAPE=21.7988, RMSE=7772.6244, MSE=60413689.7613, R2=0.3683
Test: MAE=4779.1203, MAPE=22.4898, RMSE=8180.0686, MSE=66913521.7936, R2=0.1488

=== Weighted Metrics ===
Train Weighted: MAE=9799.6516, MAPE=21.7988, RMSE=18926.8572, MSE=358225922.0112, R2=0.6617
Test Weighted: MAE=10403.6411, MAPE=22.4898, RMSE=20659.9468, MSE=426833401.0541, R2=0.5385


In [None]:
import os

import joblib

# Save the trained model to disk.
# The target folder is created if missing so the dump cannot fail on a fresh
# checkout. A forward-slash path is used because the previous 'model\\...'
# literal only pointed into the folder on Windows.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/xgb_regressor_model_above95.pkl')