<a href="https://colab.research.google.com/github/UsmanShafeeq/Advanced-Machine-Learning-Project/blob/main/Medical_Cost_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Cost Analysis

# 1. Introduction
The Medical Cost Analysis project aims to predict healthcare costs using machine learning techniques. It identifies key cost-driving factors and provides insights to optimize medical expenditures. The project leverages real-world datasets for analysis and prediction.

# 2. Important Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# 3- Data Collection and Description

In [None]:
pip install kaggle

In [None]:
import os
import pandas as pd

# Download the Kaggle dataset
def download_dataset():
    """Fetch the Kaggle insurance dataset and return the path to its CSV file.

    The archive is downloaded with the Kaggle CLI into ``datasets/insurance``
    and unzipped there. If ``insurance.csv`` is already present, the download
    is skipped so that re-running the notebook does not hit the network again.

    Returns:
        str: Relative path to ``insurance.csv``.

    Raises:
        RuntimeError: If the Kaggle CLI cannot be started, exits with a
            non-zero status, or finishes without producing ``insurance.csv``.
    """
    import subprocess  # local import: only this function shells out

    dataset = "mirichoi0218/insurance"
    download_path = "datasets/insurance"
    csv_path = f"{download_path}/insurance.csv"
    os.makedirs(download_path, exist_ok=True)

    # Reuse a previous download instead of calling the Kaggle API again
    if os.path.isfile(csv_path):
        print(f"Dataset already available in {download_path}")
        return csv_path

    # Argument list (no shell) so the values are never interpreted by a shell
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset,
        "-p", download_path,
        "--unzip",
    ]
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except FileNotFoundError as exc:
        raise RuntimeError(
            "The 'kaggle' command was not found; install the Kaggle CLI first."
        ) from exc
    except subprocess.CalledProcessError as exc:
        details = (exc.stderr or exc.stdout or "").strip()
        raise RuntimeError(
            f"Kaggle download failed with exit code {exc.returncode}: {details}"
        ) from exc

    # A zero exit status alone does not prove the expected file was extracted
    if not os.path.isfile(csv_path):
        raise RuntimeError(f"Download finished but {csv_path} was not created")

    print(f"Dataset downloaded to {download_path}")
    return csv_path

# Load the dataset
def load_dataset():
    """Read the insurance CSV into a DataFrame.

    The file location is whatever path ``download_dataset()`` returns.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(download_dataset())

# Read the dataset once; later cells work on `df` or on copies of it
df = load_dataset()

# Preview the first rows. A bare last expression gives the notebook's rich
# table rendering, which print() would flatten to plain text.
df.head()


# 3.1- Display Basic Information

In [None]:
# Inspect an independent (deep) copy so the loaded frame `df` is left untouched
df1 = df.copy(deep=True)

In [None]:
# Show the last five rows of the data
df1.tail(n=5)

In [None]:
# Show five randomly chosen rows; the fixed seed keeps the sample identical across re-runs
df1.sample(5, random_state=42)

In [None]:
# Print a structural summary of the dataset (columns, non-null counts, dtypes)
df1.info()

In [None]:
# Count missing entries per column
df1.isna().sum()

In [None]:
# List the column names as a plain Python list
list(df1.columns)

In [None]:
# Report the dataset dimensions (rows x columns)
n_rows, n_cols = df1.shape
print(f"Rows: {n_rows}, Columns: {n_cols}")

In [None]:
# Summary statistics for every non-object column, shaded so larger values stand out
(
    df1.describe(exclude='object')
       .style
       .background_gradient(cmap='BuPu')
)

In [None]:
# Display the number of distinct values in each column
print("\nUnique Values in Each Column:")
for column, n_unique in df1.nunique().items():
    # The original format string ended in a stray ")", printing e.g. "age:47)"
    print(f"{column}:{n_unique}")

# 4- Exploratory Data Analysis (EDA)

In [None]:
# Charges plotted against BMI, one point per row of `df`
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='bmi', y='charges', color='blue', alpha=0.6, ax=ax)
ax.set_title('Medical Charges vs. BMI')
ax.set_xlabel('BMI')
ax.set_ylabel('Medical Charges')
ax.grid(True)
plt.show()

In [None]:
# Charges plotted against age, one point per row of `df`
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='age', y='charges', color='green', alpha=0.6, ax=ax)
ax.set_title('Medical Charges vs. Age')
ax.set_xlabel('Age')
ax.set_ylabel('Medical Charges')
ax.grid(True)
plt.show()

In [None]:
# Charges plotted against the number of children, one point per row of `df`
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='children', y='charges', color='orange', alpha=0.6, ax=ax)
ax.set_title('Medical Charges vs. Number of Children')
ax.set_xlabel('Number of Children')
ax.set_ylabel('Medical Charges')
ax.grid(True)
plt.show()

In [None]:
# Pairwise relationships among the four numeric columns
numeric_columns = ['age', 'bmi', 'children', 'charges']
sns.pairplot(df[numeric_columns])
# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns
corr_matrix = df[['age', 'bmi', 'children', 'charges']].corr()

# Note: this changes the seaborn theme for every plot drawn after this cell as well
sns.set(style='white', palette='muted')

fig, ax = plt.subplots(figsize=(10, 8))
heatmap = sns.heatmap(
    corr_matrix,
    annot=True,                 # write the coefficient inside each cell
    fmt='.2f',
    annot_kws={'size': 12, 'weight': 'bold', 'color': 'black'},
    cmap='YlGnBu',
    cbar=True,
    vmin=-1, vmax=1,            # pin the colour scale to the full correlation range
    linewidths=0.5,
    linecolor='white',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16, weight='bold', color='darkblue')

fig.tight_layout()
plt.show()

In [None]:
# Horizontal boxplot of BMI to spot extreme values
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='bmi', color='skyblue', linewidth=1.5, ax=ax)
ax.set_title('Boxplot of BMI')
ax.set_xlabel('BMI')
ax.grid(True)
plt.show()

In [None]:
# Horizontal boxplot of charges to spot extreme values
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='charges', color='lightgreen', linewidth=1.5, ax=ax)
ax.set_title('Boxplot of Medical Charges')
ax.set_xlabel('Medical Charges')
ax.grid(True)
plt.show()

In [None]:
# BMI: quartiles, interquartile range, and fences 1.5 * IQR beyond the quartiles
Q1_bmi, Q3_bmi = (df['bmi'].quantile(q) for q in (0.25, 0.75))
IQR_bmi = Q3_bmi - Q1_bmi
lower_bound_bmi = Q1_bmi - 1.5 * IQR_bmi
upper_bound_bmi = Q3_bmi + 1.5 * IQR_bmi

# Rows whose BMI lies outside the fences
is_bmi_outlier = (df['bmi'] < lower_bound_bmi) | (df['bmi'] > upper_bound_bmi)
bmi_outliers = df[is_bmi_outlier]

# Charges: same procedure
Q1_charges, Q3_charges = (df['charges'].quantile(q) for q in (0.25, 0.75))
IQR_charges = Q3_charges - Q1_charges
lower_bound_charges = Q1_charges - 1.5 * IQR_charges
upper_bound_charges = Q3_charges + 1.5 * IQR_charges

# Rows whose charges lie outside the fences
is_charges_outlier = (df['charges'] < lower_bound_charges) | (df['charges'] > upper_bound_charges)
charges_outliers = df[is_charges_outlier]

# Report how many rows were flagged for each column
print(f"Number of BMI outliers: {len(bmi_outliers)}")
print(f"Number of Charges outliers: {len(charges_outliers)}")


In [None]:
# Fences for 'bmi' (1.5 * IQR beyond the first and third quartiles)
Q1_bmi, Q3_bmi = df['bmi'].quantile([0.25, 0.75])
IQR_bmi = Q3_bmi - Q1_bmi
lower_bound_bmi, upper_bound_bmi = Q1_bmi - 1.5 * IQR_bmi, Q3_bmi + 1.5 * IQR_bmi

# Fences for 'charges'
Q1_charges, Q3_charges = df['charges'].quantile([0.25, 0.75])
IQR_charges = Q3_charges - Q1_charges
lower_bound_charges, upper_bound_charges = Q1_charges - 1.5 * IQR_charges, Q3_charges + 1.5 * IQR_charges

# Keep only rows that sit inside both pairs of fences (bounds inclusive)
within_bmi = df['bmi'].between(lower_bound_bmi, upper_bound_bmi)
within_charges = df['charges'].between(lower_bound_charges, upper_bound_charges)
df_cleaned = df[within_bmi & within_charges]

# Compare the dataset size before and after the filter
print(f"Original dataset shape: {df.shape}")
print(f"Cleaned dataset shape: {df_cleaned.shape}")

# Preview the filtered data
df_cleaned.head()


# 5. Model Development

**Train-Test Split Code:**

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: the numeric columns plus the categorical ones (sex, smoker, region).
# One-hot encoding the categorical columns produces the feature names, in the same
# order, that the /predict endpoint at the end of this notebook reads from its JSON
# payload (sex_female, sex_male, smoker_no, smoker_yes, region_*). Training on only
# age/bmi/children left the saved model incompatible with that endpoint.
categorical_columns = ['sex', 'smoker', 'region']
X = pd.get_dummies(
    df_cleaned[['age', 'bmi', 'children'] + categorical_columns],
    columns=categorical_columns,
    dtype=int,  # 0/1 integers instead of booleans
)  # Independent variables
y = df_cleaned['charges']  # Dependent variable (target)

# Split the dataset into training (80%) and testing (20%) sets;
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")


# 5.3 Baseline Model Selection

**Baseline Model: Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the training split (fit() returns the estimator itself)
baseline_model = LinearRegression().fit(X_train, y_train)

# Predict charges for the held-out rows
y_pred = baseline_model.predict(X_test)

# Score the predictions against the true test targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE) for Baseline Model: {mse}")
print(f"R-squared (R2) for Baseline Model: {r2}")


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with library-default hyperparameters; the seed fixes the result
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict charges for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the true test targets
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)

print(f"Random Forest - Mean Squared Error: {mse_rf}")
print(f"Random Forest - R-squared: {r2_rf}")


# 5.4 Model Evaluation Metrics


1. Mean Squared Error (MSE):

In [None]:
# MSE of the baseline (linear regression) predictions held in `y_pred`
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


2. Root Mean Squared Error (RMSE):

In [None]:
# RMSE is the square root of `mse`, which expresses the error in the same
# units as the target column (`charges`)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")


3. R-squared (R²):

In [None]:
# R-squared of the baseline (linear regression) predictions held in `y_pred`
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared: {r2}")


# 5.5 Hyperparameter Tuning


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 3 * 4 * 3 * 3 = 108 combinations,
# each evaluated with 3-fold cross-validation
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose hyperparameters are searched
rf = RandomForestRegressor(random_state=42)

# Exhaustive search scored by negative MSE, using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination
print(f"Best parameters: {grid_search.best_params_}")

# Estimator that GridSearchCV exposes for the winning combination
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split
y_pred_best_rf = best_rf_model.predict(X_test)
mse_best_rf, r2_best_rf = (
    mean_squared_error(y_test, y_pred_best_rf),
    r2_score(y_test, y_pred_best_rf),
)

print(f"Best Random Forest - Mean Squared Error: {mse_best_rf}")
print(f"Best Random Forest - R-squared: {r2_best_rf}")


# 5.6 Model Comparison

In [None]:
# Collect the test-set metrics of the three models.
# `mse` and `r2` hold the linear-regression (baseline) scores at this point.
models = {
    label: {"mse": model_mse, "r2": model_r2}
    for label, model_mse, model_r2 in [
        ("Linear Regression", mse, r2),
        ("Random Forest", mse_rf, r2_rf),
        ("Best Random Forest", mse_best_rf, r2_best_rf),
    ]
}

# One row per model, one column per metric
model_comparison = pd.DataFrame.from_dict(models, orient="index")
print(model_comparison)


# 5.7 Feature Importance Analysis

In [None]:
# Importance scores reported by the tuned random forest, one per column of X
feature_importances = best_rf_model.feature_importances_

# Pair each score with its column name and list the most important features first
feature_importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": feature_importances})
    .sort_values(by="Importance", ascending=False)
)

print(feature_importance_df)


# 5.8 Model Deployment (Optional)

In [None]:
import joblib

# File the tuned model is written to and read back from
model_path = 'medical_cost_prediction_model.pkl'

# Persist the tuned random forest to disk
joblib.dump(best_rf_model, model_path)

# Read it back, as a later session would
model = joblib.load(model_path)

# Confirm the reloaded model can predict on the test split
predictions = model.predict(X_test)


# 5.9 Cross-Validation (Optional)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned random forest on the full X / y.
# The scorer returns negative MSE, so the sign is flipped to get plain MSE.
cv_scores = -cross_val_score(
    best_rf_model,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Per-fold errors and their average
print(f"Cross-validation MSE scores: {cv_scores}")
print(f"Mean MSE across all folds: {cv_scores.mean()}")


# 5.10 Learning Curves (Optional)

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Scores of the tuned random forest at increasing training-set sizes (5-fold CV).
# NOTE(review): no `scoring` is passed, so the estimator's default score is plotted —
# presumably R-squared for a regressor; confirm before interpreting the y-axis.
train_sizes, train_scores, test_scores = learning_curve(best_rf_model, X, y, cv=5)

# Average across the folds (axis 1) to get one value per training size
mean_train_score = train_scores.mean(axis=1)
mean_test_score = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_sizes, mean_train_score, label='Train Score', color='blue')
ax.plot(train_sizes, mean_test_score, label='Test Score', color='red')
ax.set_xlabel('Training Size')
ax.set_ylabel('Score')
ax.set_title('Learning Curves for Random Forest Model')
ax.legend(loc='best')
ax.grid(True)
plt.show()


# 5.11 Feature Engineering (Optional)

In [None]:
# Both columns are added to `df` in place; the models fitted earlier in this
# notebook do not use them.

# Interaction term: age multiplied by BMI
df['age_bmi_interaction'] = df['age'].mul(df['bmi'])

# Natural log of the charges (charges might have a skewed distribution)
df['log_charges'] = np.log(df['charges'])


In [None]:
# Bucket age into three labelled groups; pd.cut's default right-closed bins give
# (0, 30] -> 'Young', (30, 50] -> 'Middle-Aged', (50, inf) -> 'Old'
age_bin_edges = [0, 30, 50, np.inf]
age_bin_labels = ['Young', 'Middle-Aged', 'Old']
df['age_category'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)


In [None]:
from flask import Flask, request, jsonify
import joblib
import numpy as np
import pandas as pd

# Load the pre-trained model
model = joblib.load('medical_cost_prediction_model.pkl')

# Feature names the endpoint reads from the JSON payload, in model input order.
# Prefer the names the estimator recorded at fit time (`feature_names_in_`, set by
# scikit-learn when a model is fitted on a DataFrame) so the request schema always
# matches the saved model. Without them, fall back to the one-hot encoded layout.
_fitted_feature_names = getattr(model, 'feature_names_in_', None)
if _fitted_feature_names is not None:
    FEATURE_NAMES = [str(name) for name in _fitted_feature_names]
else:
    FEATURE_NAMES = [
        'age', 'bmi', 'children',
        'sex_female', 'sex_male',
        'smoker_no', 'smoker_yes',
        'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
    ]

app = Flask(__name__)

# Define an API endpoint to make predictions
@app.route('/predict', methods=['POST'])
def predict():
    """Predict medical charges for one record sent as a JSON object.

    The request body must be a JSON object holding a numeric value for every
    name in ``FEATURE_NAMES``.

    Returns:
        A JSON response ``{"predicted_charges": <float>}`` on success, or
        ``{"error": <message>}`` with HTTP status 400 when the payload is not a
        JSON object, lacks a required feature, or holds a non-numeric value.
    """
    # silent=True yields None for a malformed body instead of raising
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    missing = [name for name in FEATURE_NAMES if name not in data]
    if missing:
        return jsonify({"error": f"Missing features: {missing}"}), 400

    try:
        values = [float(data[name]) for name in FEATURE_NAMES]
    except (TypeError, ValueError):
        return jsonify({"error": "All feature values must be numeric."}), 400

    # Build a single-row input; keep the column names when the model was fitted with them
    if _fitted_feature_names is not None:
        input_features = pd.DataFrame([values], columns=FEATURE_NAMES)
    else:
        input_features = np.array(values).reshape(1, -1)

    # Make a prediction
    prediction = model.predict(input_features)

    # Return the prediction as JSON (plain float, not a NumPy scalar)
    return jsonify({"predicted_charges": float(prediction[0])})

if __name__ == '__main__':
    # debug=True would start the auto-reloader (which restarts the process and does
    # not work inside a notebook kernel) and expose the interactive debugger.
    # Note: this call blocks the cell until the server is stopped.
    app.run(debug=False)




 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
