<a href="https://colab.research.google.com/github/SeminiNethra/Healthcare-Cost-Management/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# Load the datasetcsvt
df = pd.read_csv('/content/Final Year Project Dataset - Sheet1 (1).csv')
print("Shape after loading:", df.shape)

# Clean and convert the target column to numeric
target_col = 'What is the average halthcare bill amount?'

def bill_to_num(x):
    if pd.isna(x) or x == 'Nothing':
        return np.nan
    x = str(x).replace(',', '').replace(' ', '').lower()
    if 'under10000' in x:
        return 5000
    elif '10000-50000' in x:
        return 30000
    elif '50000-100000' in x:
        return 75000
    elif 'morethan100000' in x:
        return 150000
    try:
        return float(x)
    except:
        return np.nan

print("Unique values in target before mapping:", df[target_col].unique())
df[target_col] = df[target_col].apply(bill_to_num)
print("Unique values in target after mapping:", df[target_col].unique())

# Drop rows where target is missing
df = df.dropna(subset=[target_col])
print("Shape after dropping missing target:", df.shape)

# Define categorical columns (use only those that exist)
cat_cols = [
    'Sex',
    'City You Live In',
    'Monthly Income Level',
    'Do You Have Any Chronic Disease',
    'Do you have any allergies?',
    'Do you consume alcoholic beverages?',
    'Do you smoke or use tobacco products?',
    'what type of hospital do you typically spend on medication per month?'
]
cat_cols = [col for col in cat_cols if col in df.columns]
print("Categorical columns used:", cat_cols)

# Replace 'Nothing' with np.nan throughout the entire DataFrame
df = df.replace('Nothing', np.nan)

# Fill missing values in categorical columns
for col in cat_cols:
    df[col] = df[col].fillna('Missing')

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
print("Shape after encoding:", df.shape)

# Drop unnecessary columns if present
drop_cols = [
    'Name',
    'If Yes, please specify',
    'Have you undergone any prior surgeries or procedures?',
    'If yes, please specify',
    'Do you have any other medical history that we should be aware of?'
]
for col in drop_cols:
    if col in df.columns:
        df = df.drop(col, axis=1)

# Convert all remaining string columns to numeric
for col in df.columns:
    if df[col].dtype == object:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill any remaining missing values with 0
df = df.fillna(0)

# Check if df is empty before proceeding
print("Final DataFrame shape before split:", df.shape)
if df.shape[0] == 0:
    raise ValueError("No data left after cleaning! Check the cleaning steps and your raw data.")

# Prepare features and target
x = df.drop([target_col], axis=1)
y = df[target_col]

print("Shape of x before split:", x.shape)
print("Shape of y before split:", y.shape)

# Now split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
print("Train/test split shapes:", x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Model comparison
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    # Fixed: Changed verbosity from 10 to 0 (within valid range of 0-3)
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

results = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    results.append({
        'Model': name,
        'R2 Score': r2,
        'Mean Absolute Error': mae,
        'Mean Squared Error': mse,
        'Root Mean Squared Error': rmse
    })

results_df = pd.DataFrame(results).sort_values(by='R2 Score', ascending=False)
print("\nModel Performance:")
print(results_df)

# Save the best model
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]
joblib.dump(best_model, f'model_{best_model_name.replace(" ", "_").lower()}.pkl')
joblib.dump(x_train.columns, 'train_features.pkl')
print(f"\nBest model ({best_model_name}) saved as model_{best_model_name.replace(' ', '_').lower()}.pkl")



Shape after loading: (98, 17)
Unique values in target before mapping: ['Under 10, 000 ' '50, 000 - 100, 000' '10, 000 - 50,000' 'Under 10, 019'
 'Under 10, 020']
Unique values in target after mapping: [ 5000. 75000. 30000.    nan]
Shape after dropping missing target: (96, 17)
Categorical columns used: ['Sex', 'City You Live In', 'Monthly Income Level', 'Do You Have Any Chronic Disease', 'Do you have any allergies?', 'Do you consume alcoholic beverages?', 'Do you smoke or use tobacco products?', 'what type of hospital do you typically spend on medication per month?']
Shape after encoding: (96, 42)
Final DataFrame shape before split: (96, 37)
Shape of x before split: (96, 36)
Shape of y before split: (96,)
Train/test split shapes: (72, 36) (24, 36) (72,) (24,)
Training Linear Regression...
Training Random Forest...
Training Gradient Boosting...
Training XGBoost...

Model Performance:
               Model  R2 Score  Mean Absolute Error  Mean Squared Error  \
1      Random Forest  0.457770

**Dashboard**

In [45]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

# Load the trained model and feature names used during training
model = joblib.load('model_random_forest.pkl')
train_features = joblib.load('train_features.pkl')

st.title('Healthcare Cost Prediction')
st.write("Enter patient details to predict estimated healthcare cost.")

# User input
age = st.slider("Age", 0, 100, 30)
gender = st.selectbox("Gender", ["Male", "Female"])
bmi = st.slider("BMI", 10.0, 50.0, 22.0)
city = st.selectbox("City You Live In", [
    "Biyagama", "Dehiwala", "Kalmunai", "Vavuniya", "Galle", "Trincomalee",
    "Batticalo", "Jaffna", "Matale", "Katunayaka", "Dambulla", "Kolonnawa",
    "Anuradhapura", "Rathnapura", "Moratuwa", "Sri Jayawardanapura Kotte",
    "Chilaw", "Colombo", "Homagama", "Kandy", "Negombo", "Other"
])
income = st.selectbox("Monthly Income Level", [
    "Less than 25,000", "25,000-50,000", "50,000-100,000",
    "More than 100,000"])
chronic_disease = st.selectbox("Do You Have Any Chronic Disease", ["Yes", "No"])
allergies = st.selectbox("Do you have any allergies?", ["Yes", "No"])
alcohol = st.selectbox("Do you consume alcoholic beverages?", ["Yes", "No"])
tobacco = st.selectbox("Do you smoke or use tobacco products?", ["Yes", "No"])
hospital_type = st.selectbox("What type of hospital do you typically spend on medication per month?",
                           ["Government", "Private", "Both"])

# Initialize a dictionary with zeros for all expected columns
input_dict = {col: 0 for col in train_features}

# Set numerical features (if they exist in train_features)
if "Age" in train_features:
    input_dict["Age"] = age
if "BMI" in train_features:
    input_dict["BMI"] = bmi

# Set one-hot encoded categorical variables by matching exact column names
# Sex/Gender
sex_col = f"Sex_{gender}"
if sex_col in train_features:
    input_dict[sex_col] = 1

# City
city_col = f"City You Live In_{city}"
if city_col in train_features:
    input_dict[city_col] = 1

# Income
income_col = f"Monthly Income Level_{income}"
if income_col in train_features:
    input_dict[income_col] = 1

# Chronic Disease
disease_col = f"Do You Have Any Chronic Disease_{chronic_disease}"
if disease_col in train_features:
    input_dict[disease_col] = 1

# Allergies
allergies_col = f"Do you have any allergies?_{allergies}"
if allergies_col in train_features:
    input_dict[allergies_col] = 1

# Alcohol
alcohol_col = f"Do you consume alcoholic beverages?_{alcohol}"
if alcohol_col in train_features:
    input_dict[alcohol_col] = 1

# Tobacco
tobacco_col = f"Do you smoke or use tobacco products?_{tobacco}"
if tobacco_col in train_features:
    input_dict[tobacco_col] = 1

# Hospital Type
hospital_col = f"what type of hospital do you typically spend on medication per month?_{hospital_type}"
if hospital_col in train_features:
    input_dict[hospital_col] = 1

# Create DataFrame with exact training features and order
input_df = pd.DataFrame([input_dict])[train_features]

if st.button("Predict"):
    try:
        prediction = model.predict(input_df)[0]
        st.success(f"💰 Estimated Healthcare Cost: LKR {int(prediction):,}")
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        st.info("Please check that all feature names match exactly with training data.")


Overwriting app.py


Display the dashboard

In [46]:
!pip install -q streamlit
!npm install -g localtunnel





[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K
changed 22 packages in 2s
[1G[0K⠙[1G[0K
[1G[0K⠙[1G[0K3 packages are looking for funding
[1G[0K⠙[1G[0K  run `npm fund` for details
[1G[0K⠙[1G[0K

In [47]:
!wget -q -o - ipv4.icanhazip.com

In [None]:
!streamlit run app.py & npx localtunnel --port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.90.224:8501[0m
[0m
[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0Kyour url is: https://hot-candles-raise.loca.lt
