<a href="https://colab.research.google.com/github/SeminiNethra/Healthcare-Cost-Management/blob/main/Model_Expalinability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# model_training.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt

# Load and preprocess data
df = pd.read_csv('/content/Final Year Project Dataset - Sheet1 (1).csv')

# Clean and convert the target column to numeric
target_col = 'What is the average halthcare bill amount?'

def bill_to_num(x):
    if pd.isna(x) or x == 'Nothing':
        return np.nan
    x = str(x).replace(',', '').replace(' ', '').lower()
    if 'under10000' in x:
        return 5000
    elif '10000-50000' in x:
        return 30000
    elif '50000-100000' in x:
        return 75000
    elif 'morethan100000' in x:
        return 150000
    try:
        return float(x)
    except:
        return np.nan

print("Unique values in target before mapping:", df[target_col].unique())
df[target_col] = df[target_col].apply(bill_to_num)
print("Unique values in target after mapping:", df[target_col].unique())

# Drop rows where target is missing
df = df.dropna(subset=[target_col])
print("Shape after dropping missing target:", df.shape)

# Define categorical columns (use only those that exist)
cat_cols = [
    'Sex',
    'City You Live In',
    'Monthly Income Level',
    'Do You Have Any Chronic Disease',
    'Do you have any allergies?',
    'Do you consume alcoholic beverages?',
    'Do you smoke or use tobacco products?',
    'what type of hospital do you typically spend on medication per month?'
]
cat_cols = [col for col in cat_cols if col in df.columns]
print("Categorical columns used:", cat_cols)

# Replace 'Nothing' with np.nan throughout the entire DataFrame
df = df.replace('Nothing', np.nan)

# Fill missing values in categorical columns
for col in cat_cols:
    df[col] = df[col].fillna('Missing')

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
print("Shape after encoding:", df.shape)

# Drop unnecessary columns if present
drop_cols = [
    'Name',
    'If Yes, please specify',
    'Have you undergone any prior surgeries or procedures?',
    'If yes, please specify',
    'Do you have any other medical history that we should be aware of?'
]
for col in drop_cols:
    if col in df.columns:
        df = df.drop(col, axis=1)

# Convert all remaining string columns to numeric
for col in df.columns:
    if df[col].dtype == object:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill any remaining missing values with 0
df = df.fillna(0)

# Check if df is empty before proceeding
print("Final DataFrame shape before split:", df.shape)
if df.shape[0] == 0:
    raise ValueError("No data left after cleaning! Check the cleaning steps and your raw data.")

# Prepare features and target
x = df.drop([target_col], axis=1)
y = df[target_col]

print("Shape of x before split:", x.shape)
print("Shape of y before split:", y.shape)

# Now split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
print("Train/test split shapes:", x_train.shape, x_test.shape, y_train.shape, y_test.shape)


# Save feature names after preprocessing
joblib.dump(list(x.columns), 'train_features.pkl')

# Model comparison with hyperparameter tuning
models = {
    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'max_features': ['sqrt', 'log2']
        }
    },
    'XGBoost': {
        'model': xgb.XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 6],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    },
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    }
}

results = []

for name, config in models.items():
    print(f"\n{'='*40}\nTraining {name}\n{'='*40}")

    # Hyperparameter tuning
    if config['params']:
        gs = GridSearchCV(config['model'], config['params'],
                         cv=3, scoring='r2', n_jobs=-1, verbose=1)
        gs.fit(x_train, y_train)
        best_model = gs.best_estimator_
        print(f"Best parameters: {gs.best_params_}")
    else:
        best_model = config['model'].fit(x_train, y_train)

    # Evaluation
    y_pred = best_model.predict(x_test)
    metrics = {
        'Model': name,
        'R2': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'Best Params': str(gs.best_params_) if config['params'] else 'N/A'
    }
    results.append(metrics)

    # Save model and tuning results
    joblib.dump(best_model, f'model_{name.lower().replace(" ", "_")}.pkl')
    joblib.dump(gs.cv_results_, f'{name.lower()}_cv_results.pkl')

# Create comparison report
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R2', ascending=False)
print("\nModel Comparison Report:")
print(results_df.to_string(index=False))

# Save comparison plot
plt.figure(figsize=(10, 6))
results_df.set_index('Model')['R2'].plot(kind='barh')
plt.title('Model Comparison (R² Score)')
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()


Unique values in target before mapping: ['Under 10, 000 ' '50, 000 - 100, 000' '10, 000 - 50,000' 'Under 10, 019'
 'Under 10, 020']
Unique values in target after mapping: [ 5000. 75000. 30000.    nan]
Shape after dropping missing target: (96, 17)
Categorical columns used: ['Sex', 'City You Live In', 'Monthly Income Level', 'Do You Have Any Chronic Disease', 'Do you have any allergies?', 'Do you consume alcoholic beverages?', 'Do you smoke or use tobacco products?', 'what type of hospital do you typically spend on medication per month?']
Shape after encoding: (96, 42)
Final DataFrame shape before split: (96, 37)
Shape of x before split: (96, 36)
Shape of y before split: (96,)
Train/test split shapes: (72, 36) (24, 36) (72,) (24,)

Training Random Forest
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 100}

Training XGBoost
Fitting 3 folds for each of 32 candidates, totalling 9

In [2]:
#app.py
%%writefile app.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import shap

st.title('Healthcare Cost Prediction')
st.write("Enter patient details to predict estimated healthcare cost.")

# Load the trained model and feature names
try:
    train_features = joblib.load('train_features.pkl')

    # Try to load multiple models with error handling
    models = {}
    available_models = ['Random Forest', 'XGBoost', 'Gradient Boosting', 'Linear Regression']
    for model_name in available_models:
        try:
            model_filename = f"model_{model_name.lower().replace(' ', '_')}.pkl"
            models[model_name] = joblib.load(model_filename)
        except FileNotFoundError:
            pass

    if not models:
        st.error("No model files found. Please ensure models are trained and available.")
        st.stop()

    # Set default selected model
    selected_model = list(models.keys())[0]

    # Try to load model comparison results
    try:
        results_df = pd.read_csv('model_comparison.csv')
        st.subheader("Performance Comparison")
        col1, col2 = st.columns(2)
        with col2:
            st.dataframe(results_df.style.format({'R2': '{:.3f}', 'MAE': '{:.1f}', 'RMSE': '{:.1f}'}))
        with col1:
            try:
                st.image('model_comparison.png')
            except:
                plt.figure(figsize=(10, 6))
                plt.bar(results_df['Model'], results_df['R2'])
                plt.title('Model Comparison (R² Score)')
                plt.ylim(0, 1)
                st.pyplot(plt)
    except FileNotFoundError:
        st.warning("Model comparison data not available")

    # Hyperparameter details
    if len(models) > 1:
        st.subheader("Best Hyperparameters")
        selected_model = st.selectbox("Select model to view parameters", list(models.keys()))
    model = models[selected_model]
    try:
        params = model.get_params()
        st.json(params)
    except AttributeError:
        st.warning("Hyperparameters not available for this model")

    # Feature importance visualization
    st.subheader("Feature Importance")
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        fi_df = pd.DataFrame({'Feature': train_features, 'Importance': importances})
        fi_df = fi_df.nlargest(10, 'Importance').sort_values('Importance', ascending=True)
        plt.figure(figsize=(10, 6))
        plt.barh(fi_df['Feature'], fi_df['Importance'])
        plt.title('Top 10 Important Features')
        st.pyplot(plt)
    else:
        st.warning("Feature importance not available for this model type")

    # Prediction interface
    st.header("Cost Prediction Interface")
    # User input
    age = st.slider("Age", 0, 100, 30)
    gender = st.selectbox("Gender", ["Male", "Female"])
    bmi = st.slider("BMI", 10.0, 50.0, 22.0)
    city = st.selectbox("City You Live In", [
        "Biyagama", "Dehiwala", "Kalmunai", "Vavuniya", "Galle", "Trincomalee",
        "Batticalo", "Jaffna", "Matale", "Katunayaka", "Dambulla", "Kolonnawa",
        "Anuradhapura", "Rathnapura", "Moratuwa", "Sri Jayawardanapura Kotte",
        "Chilaw", "Colombo", "Homagama", "Kandy", "Negombo", "Other"
    ])
    income = st.selectbox("Monthly Income Level", [
        "Less than 25,000", "25,000-50,000", "50,000-100,000",
        "More than 100,000"])
    chronic_disease = st.selectbox("Do You Have Any Chronic Disease", ["Yes", "No"])
    allergies = st.selectbox("Do you have any allergies?", ["Yes", "No"])
    alcohol = st.selectbox("Do you consume alcoholic beverages?", ["Yes", "No"])
    tobacco = st.selectbox("Do you smoke or use tobacco products?", ["Yes", "No"])
    hospital_type = st.selectbox("What type of hospital do you typically spend on medication per month?",
                              ["Government", "Private", "Both"])

    # Initialize a dictionary with zeros for all expected columns
    input_dict = {col: 0 for col in train_features}
    # Set numerical features
    if "Age" in train_features: input_dict["Age"] = age
    if "BMI" in train_features: input_dict["BMI"] = bmi
    # Set one-hot encoded categorical variables
    if (col := f"Sex_{gender}") in train_features: input_dict[col] = 1
    if (col := f"City You Live In_{city}") in train_features: input_dict[col] = 1
    if (col := f"Monthly Income Level_{income}") in train_features: input_dict[col] = 1
    if (col := f"Do You Have Any Chronic Disease_{chronic_disease}") in train_features: input_dict[col] = 1
    if (col := f"Do you have any allergies?_{allergies}") in train_features: input_dict[col] = 1
    if (col := f"Do you consume alcoholic beverages?_{alcohol}") in train_features: input_dict[col] = 1
    if (col := f"Do you smoke or use tobacco products?_{tobacco}") in train_features: input_dict[col] = 1
    if (col := f"what type of hospital do you typically spend on medication per month?_{hospital_type}") in train_features: input_dict[col] = 1

    input_df = pd.DataFrame([input_dict])[train_features]

    if st.button("Predict"):
        try:
            prediction = model.predict(input_df)[0]
            st.success(f"💰 Estimated Healthcare Cost: LKR {int(prediction):,}")
            st.subheader("Model Explainability (SHAP)")
            try:
                # Use TreeExplainer for tree-based models
                explainer = shap.TreeExplainer(model)
                shap_values = explainer.shap_values(input_df)
                # Global explanation
                st.write("**Global Feature Impact:**")
                shap.summary_plot(shap_values, input_df, plot_type="bar", show=False)
                st.pyplot(plt.gcf())
                plt.clf()
                # Local explanation (for this prediction)
                st.write("**This Prediction's Explanation:**")
                shap.initjs()
                force_plot = shap.force_plot(explainer.expected_value, shap_values[0], input_df.iloc[0], matplotlib=True, show=False)
                st.pyplot(force_plot.figure)
            except Exception as e:
                st.warning(f"SHAP explanation not available: {e}")
        except Exception as e:
            st.error(f"Prediction error: {str(e)}")
            st.info("Please check that all feature names match exactly with training data.")

except FileNotFoundError as e:
    st.error(f"Required file not found: {str(e)}")
    st.info("Please make sure you've run the model training code that creates train_features.pkl and model files.")


Writing app.py


In [3]:
!pip install -q streamlit
!npm install -g localtunnel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K
added 22 packages in 2s
[1G[0K⠋[1G[0K
[1G[0K⠋[1G[0K3 packages are looking for funding
[1G[0K⠋[1G[0K  run `npm fund` for details
[1G[0K⠋[1G[0K

In [4]:
!wget -q -o - ipv4.icanhazip.com

In [None]:
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠏[1G[0Kyour url is: https://shiny-showers-fly.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.161.213:8501[0m
[0m
