In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import joblib
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the dataset
logger.info("Loading dataset...")
try:
    data = pd.read_csv("/content/final_merged_synthea_cleaned98.csv")
    if data.empty:
        raise ValueError("Dataset is empty.")
    logger.info(f"Dataset columns: {data.columns.tolist()}")
except Exception as e:
    logger.error(f"Error loading dataset: {e}")
    raise

# Rename columns for consistency
data = data.rename(columns={"TOTAL_CLAIM_COST": "TOTALCOST"})

# Calculate ENCOUNTER_DURATION from START and STOP
if "START" in data.columns and "STOP" in data.columns:
    try:
        logger.info("Calculating ENCOUNTER_DURATION from START and STOP...")
        data["ENCOUNTER_DURATION"] = (pd.to_datetime(data["STOP"], errors="coerce") -
                                      pd.to_datetime(data["START"], errors="coerce")).dt.days.fillna(0)
        logger.info(f"ENCOUNTER_DURATION calculated. First 5 values: {data['ENCOUNTER_DURATION'].head().tolist()}")
    except Exception as e:
        logger.error(f"Failed to calculate ENCOUNTER_DURATION: {e}. Filling with 0.")
        data["ENCOUNTER_DURATION"] = 0
else:
    logger.warning("START or STOP columns not found. Filling ENCOUNTER_DURATION with 0.")
    data["ENCOUNTER_DURATION"] = 0

# Add missing columns with default values if needed (based on actual dataset)
model_expected_columns = ["PAYER_COVERAGE", "BASE_ENCOUNTER_COST", "STATE"]
for col in model_expected_columns:
    if col not in data.columns:
        logger.warning(f"{col} not found in dataset. Filling with default value.")
        data[col] = 0 if col != "STATE" else "MA"

# Fill NaN values
data = data.fillna(0)

# Derive CHRONIC_CONDITION
data["CHRONIC_CONDITION"] = 0
if "DIAGNOSIS1" in data.columns:
    data.loc[data["DIAGNOSIS1"].str.lower().str.contains("diabetes|hypertension|asthma|heart disease", na=False), "CHRONIC_CONDITION"] = 1
if "DIAGNOSIS2" in data.columns:
    data.loc[data["DIAGNOSIS2"].str.lower().str.contains("diabetes|hypertension|asthma|heart disease", na=False), "CHRONIC_CONDITION"] = 1
logger.info(f"CHRONIC_CONDITION values (first 5): {data['CHRONIC_CONDITION'].head().tolist()}")

# Add default columns if missing
for col, default in [("REASONCODE", "General"), ("CODE_1", "None"), ("DESCRIPTION", "General Checkup"),
                     ("PROVIDERID", "General Provider"), ("DIAGNOSIS1", "General Diagnosis"), ("DIAGNOSIS2", "None")]:
    if col not in data.columns:
        data[col] = default
        logger.warning(f"{col} not found. Filled with default: {default}")

# Select the features (updated to match dataset)
features = ["AGE", "GENDER", "RACE", "ETHNICITY", "INCOME", "ENCOUNTERCLASS", "CODE", "ENCOUNTER_DURATION",
            "PAYER_COVERAGE", "BASE_ENCOUNTER_COST", "STATE", "HEALTHCARE_EXPENSES",
            "REASONCODE", "CODE_1", "DESCRIPTION", "PROVIDERID", "DIAGNOSIS1", "DIAGNOSIS2", "CHRONIC_CONDITION"]

# Prepare the feature matrix (X) and target (y)
try:
    X = data[features]
    y = data["TOTALCOST"]
    logger.info(f"Features selected: {features}")
except KeyError as e:
    logger.error(f"Missing feature(s) in dataset: {e}. Available columns: {data.columns.tolist()}")
    raise

# One-hot encode categorical variables
categorical_cols = ["GENDER", "RACE", "ETHNICITY", "ENCOUNTERCLASS", "CODE", "STATE",
                   "REASONCODE", "CODE_1", "DESCRIPTION", "PROVIDERID", "DIAGNOSIS1", "DIAGNOSIS2"]
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype(str)
X_encoded = pd.get_dummies(X, columns=categorical_cols)
logger.info(f"Encoded columns: {X_encoded.columns.tolist()}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train the XGBoost model
logger.info("Training XGBoost model...")
try:
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    model.fit(X_train, y_train)
    logger.info("Model training completed.")
except Exception as e:
    logger.error(f"Error training model: {e}")
    raise

# Save the model
try:
    joblib.dump(model, "xgb_model_updated.pkl")
    logger.info("Model saved as 'xgb_model_updated.pkl'")
except Exception as e:
    logger.error(f"Error saving model: {e}")
    raise

# Evaluate the model
try:
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    logger.info(f"Mean Squared Error: {mse:.2f}")
    logger.info(f"R-squared Score: {r2:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared Score: {r2:.2f}")
except Exception as e:
    logger.error(f"Error evaluating model: {e}")
    raise

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Mean Squared Error: 3321376.83
R-squared Score: 0.94


In [11]:
from google.colab import files
files.download('xgb_model_updated.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>