In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Load the medical cost dataset; the path is relative to this notebook's location.
df = pd.read_csv(filepath_or_buffer="../../Data/Medical-Cost-Data/medical_cost.csv")

In [3]:
# Binary classification task: predict from the remaining columns whether a
# record's charges are above the dataset median (1) or not (0).
median_charge = df['charges'].median()
print(f"The median charge is: ${median_charge}")

# Features: every column except 'charges', plus an engineered indicator that
# is 1 when BMI is strictly greater than 30 and 0 otherwise.
X = df.drop(columns=['charges']).assign(is_obese=(df['bmi'] > 30).astype(int))

# Target: 1 when charges are strictly above the median, else 0.
y = df['charges'].gt(median_charge).astype(int)

The median charge is: $9382.033


In [4]:
# Hold-out split (not cross-validation): 80% of the rows for training and 20%
# for testing, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused on the test split, so nothing about the
# test data leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Fit a logistic-regression classifier on the standardized training features.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)

# Predicted class labels (0/1) for the held-out test split.
y_pred = model.predict(X_test_scaled)

In [6]:
# Fraction of test-set predictions that match the true labels.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# scikit-learn lays the matrix out with true classes as rows and predicted
# classes as columns; here class 0 is "charges not above the median" and
# class 1 is "charges above the median".
# Plain string literal: the original f-string had no placeholders.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 93.28%
Confusion Matrix:
[[130   8]
 [ 10 120]]
