In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
import scipy.sparse as sp

# =============================================================================
# Custom transformers
# =============================================================================
class ToDataFrame(BaseEstimator, TransformerMixin):
    """Stateless transformer that yields a DataFrame with the configured columns.

    Parameters
    ----------
    columns : list of column labels
        Labels selected from DataFrame input, or assigned to array-like input.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame restricted to / labelled with ``self.columns``."""
        already_frame = isinstance(X, pd.DataFrame)
        # DataFrame input is sub-selected; anything else is wrapped and labelled.
        return X[self.columns] if already_frame else pd.DataFrame(X, columns=self.columns)

class DenseTransformer(BaseEstimator, TransformerMixin):
    """Densify scipy sparse input; pass any other input through unchanged."""

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    def transform(self, X):
        """Return ``X.toarray()`` for sparse ``X``, otherwise ``X`` itself."""
        if not sp.issparse(X):
            return X
        return X.toarray()

# =============================================================================
# Load Data
# =============================================================================
# Read the training CSV and the test CSV (paths are relative to the working
# directory) into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('kaggle_train.csv', 'kaggle_test.csv')
)

# =============================================================================
# Define Feature Lists
# =============================================================================
# Column names routed to the categorical (impute + one-hot) branch of the
# preprocessor. Order matters: ToDataFrame relabels the imputed array with
# this list, so it must match the order used for column selection.
categorical_features = [
    'Hospital Service Area',
    'Hospital County',
    'Operating Certificate Number',
    'Permanent Facility Id',
    'Facility Name',
    'Age Group',
    'Zip Code - 3 digits',
    'Race',
    'Ethnicity',
    'Type of Admission',
    'Patient Disposition',
    'CCSR Diagnosis Code',
    'CCSR Procedure Code',
    'APR DRG Code',
    'APR MDC Code',
    'APR Severity of Illness Description',
    'APR Risk of Mortality',
    'APR Medical Surgical Description',
    'Payment Typology 1',
    'Payment Typology 2',
    'Payment Typology 3',
    'Emergency Department Indicator',
]

# Column names routed to the numeric (impute + scale) branch.
numerical_features = [
    'Length of Stay',
    'Birth Weight',
]

# =============================================================================
# Build Preprocessing Pipelines
# =============================================================================
# Numeric branch: median-impute (add_indicator=True appends missing-value
# indicator columns), then run RobustScaler over the imputer's output.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(add_indicator=True, strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical branch: fill gaps with the sentinel -999, re-wrap the imputed
# array as a DataFrame labelled with the categorical column names, then
# one-hot encode into a sparse matrix. handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value=-999, strategy='constant')),
    ('to_df', ToDataFrame(columns=categorical_features)),
    ('onehot', OneHotEncoder(sparse_output=True, handle_unknown='ignore')),
])

# Route each feature list to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# =============================================================================
# Split Data
# =============================================================================
# Hold out 10% of the training rows for validation; the fixed random_state
# makes the split repeatable.
train_data, val_data = train_test_split(train_df, random_state=42, test_size=0.1)

# Separate the 'Total Costs' target from the remaining columns in each part.
X_train, y_train = train_data.drop(columns='Total Costs'), train_data['Total Costs']
X_val, y_val = val_data.drop(columns='Total Costs'), val_data['Total Costs']

# Test set (without target) is used as loaded.
X_test = test_df

# =============================================================================
# Preprocess Data
# =============================================================================
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer for the validation and test sets.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

# When the output is sparse, normalise all three matrices to CSR format.
if sp.issparse(X_train_preprocessed):
    X_train_preprocessed, X_val_preprocessed, X_test_preprocessed = (
        matrix.tocsr()
        for matrix in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed)
    )

# =============================================================================
# Train XGBoost Model
# =============================================================================


# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 1000


# Booster parameters handed to xgb.train.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 45,
}

# Wrap the preprocessed feature matrices together with their targets.
dtrain = xgb.DMatrix(X_train_preprocessed, label=y_train)
dval = xgb.DMatrix(X_val_preprocessed, label=y_val)

# Train the booster. Early stopping monitors the LAST entry of `evals`, so the
# held-out 'eval' set must stay last in the list.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,  # previously a hard-coded 1000 that ignored num_round
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=10,  # passed as a function argument, not in params
    verbose_eval=100,
)

# xgb.train returns the booster from the final round, not the best one, and
# Booster.predict uses every tree unless iteration_range says otherwise.
# Restrict prediction to the rounds up to and including the best iteration so
# the reported score matches the model selected by early stopping.
preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
mse = mean_squared_error(y_val, preds)
print(f"Validation MSE: {mse:.4f}")

# Persist the trained booster (all rounds) as JSON.
model.save_model('xgboost_model.json')

[0]	train-rmse:52712.26430	eval-rmse:60991.19501
[100]	train-rmse:14191.68110	eval-rmse:28013.17287
[200]	train-rmse:12010.14304	eval-rmse:27202.56032
[300]	train-rmse:10951.45154	eval-rmse:26872.48656
[369]	train-rmse:10408.31722	eval-rmse:26799.44309
Validation MSE: 718109966.8685


In [None]:
dtest = xgb.DMatrix(data=X_test_preprocessed)

# Generate predictions on the test set. The booster was trained with early
# stopping, and Booster.predict uses every tree by default (including the
# rounds after the best one), so limit prediction to the best iteration.
test_predictions = model.predict(
    dtest, iteration_range=(0, model.best_iteration + 1)
)

print("Test Predictions from XGBoost (trained model):")
print(test_predictions)

# Pair each test row's ID (taken from the original test file) with its
# predicted 'Total Costs'.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Total Costs': test_predictions,
})

# Display the first few rows of the submission DataFrame.
print(submission.head())

# Write the submission to CSV without the DataFrame index.
submission.to_csv('submission.csv', index=False)


Test Predictions from XGBoost (trained model):
[26442.68   17785.828  17512.256  ... 20956.748  70944.266   2601.7563]
   ID   Total Costs
0   1  26442.679688
1   2  17785.828125
2   3  17512.255859
3   4  50592.515625
4   5  42943.617188
