In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import joblib

In [120]:
df = pd.read_csv("../../Data/Medical-Cost-Data/medical_cost.csv")

In [121]:
# Perform classification where charges higher than the median are classified as yes/no based on other features
median_charge = df['charges'].median()
print(f"The median charge is: ${median_charge}")

X = df.drop(columns=['charges'])
y = (df['charges'] > median_charge).astype(int)

The median charge is: $9382.033


In [122]:
# Perform cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale data
scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [123]:
# Train Logistic Regression model for classification
# model = LogisticRegression(random_state=0)
model = LogisticRegression(class_weight='balanced', random_state=0)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

In [None]:
model = LogisticRegression(class_weight='balanced', random_state=0)

param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear", "saga"],
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

grid.fit(X_train_scaled, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", round(grid.best_score_, 4))

best_model = grid.best_estimator_

y_pred = best_model.predict(X_test_scaled)

scores = cross_val_score(best_model, X_train_scaled, y_train, cv=10, scoring="accuracy")

print(f"Overall accuracy:")
print(scores.mean())

print(f"Accuracy standard deviation:")
print(scores.std())

print(f"Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"Saving model")
joblib.dump(best_model, "./Saved-Models/medical_cost_logistic_model.pkl")

Best params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV accuracy: 0.9252
Score mean:
0.9242990654205607
Score standard deviation:
0.026251536271379702
Confusion Matrix:
[[134   4]
 [ 12 118]]
Saving model


['./Saved-Models/medical_cost_logistic_model.pkl']