In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor  # More powerful model
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load the raw insurance records (path is relative to the working directory)
insurance = pd.read_csv('insurance.csv')

# %% [code]
# Separate the predictors from the regression target
y = insurance['charges']
X = insurance.drop(columns='charges')

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones
numeric_columns = ['age', 'bmi', 'children']
categorical_columns = ['sex', 'smoker', 'region']
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_columns),
    # drop='first' omits one indicator column per categorical feature
    ('cat', OneHotEncoder(drop='first'), categorical_columns),
])


# Modeling pipeline: preprocessing followed by a random forest, so the
# transformers are fitted together with the model on whatever data is passed
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Hyperparameter candidates; the 'regressor__' prefix routes each key to the
# pipeline step of that name
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5],
}

# Exhaustive 5-fold search over the grid, scored by R², using all CPU cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# %% [code]
# Evaluation
# Take the best pipeline found by the search and score it on the held-out
# test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test R²: {test_r2:.4f}")
print(f"MAE: ${test_mae:.2f}")

# %% [code]
# Feature importance analysis
# Rebuild the column names in the order the forest saw them: the
# ColumnTransformer emits its transformers' outputs in declaration order,
# i.e. the scaled numeric columns first, then the one-hot encoded columns.
fitted_preprocessor = best_model.named_steps['preprocessor']
cat_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(
    ['sex', 'smoker', 'region']
)

feature_names = np.concatenate([
    ['age', 'bmi', 'children'],
    cat_features
])

importances = best_model.named_steps['regressor'].feature_importances_

# Keep the ranked importances in a variable and print them explicitly.
# A bare expression is only rendered when it is the last statement of a cell;
# here more statements follow, so the ranking would otherwise be computed and
# silently discarded.
feature_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print("Feature importances:")
print(feature_importance.to_string())

# Persist the tuned pipeline (preprocessing + regressor) so predictions can be
# made later without re-running the grid search. The path is defined once so
# the dump and the load cannot drift apart.
MODEL_PATH = 'optimized_insurance_model.pkl'
joblib.dump(best_model, MODEL_PATH)

# Load for predictions
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
loaded_model = joblib.load(MODEL_PATH)

Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Test R²: 0.8696
MAE: $2533.65


In [4]:
# One hypothetical policyholder, using the same raw column names the
# pipeline was trained on (the pipeline applies its own preprocessing)
sample_record = {
    'age': 35,
    'sex': 'male',
    'bmi': 26.5,
    'children': 2,
    'smoker': 'no',
    'region': 'northwest',
}
sample_data = pd.DataFrame([sample_record])

# predict returns one value per input row; report the single estimate
prediction = loaded_model.predict(sample_data)
print(f"Predicted Cost: ${prediction[0]:.2f}")

Predicted Cost: $6832.18
