# Insurance charges — preprocessing, training, and evaluation

This notebook loads the provided CSV, performs preprocessing (encoding, dummies, BMI categories),
splits into train/test, runs model selection with cross-validation, trains the best model **in memory**
(no joblib/pickle files), evaluates it, and writes test predictions merged with X_test to a CSV.
You can run this notebook end-to-end. The assistant created this file automatically.

In [None]:
# Imports and load data
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Raw insurance data: absolute path of the uploaded CSV read by this notebook.
csv_path = r"/mnt/data/a282e47b-e0fa-4d76-a0c4-1ae9713b6192.csv"

# Parse the file into the raw frame; later cells derive their own copies from `df`.
df = pd.read_csv(filepath_or_buffer=csv_path)
print('Loaded CSV shape:', df.shape)

# Preview of the first rows (rendered as the cell output).
df.head(5)

In [None]:
# Preprocessing: basic cleaning, encoding, dummies, BMI category
# `df_clean` is rebuilt from the raw `df` without any in-place mutation, so re-running
# this cell always starts from the same input and yields the same result.
df_clean = df.drop_duplicates()

# Map sex and smoker to 0/1 indicator columns (male=0/female=1, no=0/yes=1).
# Values that are not keys of the mapping become NaN.
if 'sex' in df_clean.columns:
    df_clean = (df_clean
                .assign(sex=df_clean['sex'].map({'male': 0, 'female': 1}))
                .rename(columns={'sex': 'is_female'}))
if 'smoker' in df_clean.columns:
    df_clean = (df_clean
                .assign(smoker=df_clean['smoker'].map({'no': 0, 'yes': 1}))
                .rename(columns={'smoker': 'is_smoker'}))

# Region dummies (drop_first to avoid dummy trap)
if 'region' in df_clean.columns:
    df_clean = pd.get_dummies(df_clean, columns=['region'], drop_first=True)

# BMI category + dummies
# Half-open bins [low, high): <18.5 Underweight, 18.5-<25 Normal, 25-<30 Overweight, >=30 Obese.
# right=False puts a boundary value such as 18.5 into the upper class, and the 25/30 edges
# keep values like 24.95 or 29.95 in the lower class (edges of 24.9/29.9 with right-closed
# bins would push them up one category).
if 'bmi' in df_clean.columns:
    bmi_category = pd.cut(df_clean['bmi'],
                          bins=[0, 18.5, 25, 30, float('inf')],
                          labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
                          right=False)
    # drop_first removes the first category ('Underweight'), leaving it as the baseline.
    df_clean = pd.get_dummies(df_clean.assign(bmi_category=bmi_category),
                              columns=['bmi_category'], drop_first=True)

# Show resulting columns and first rows
print('After preprocessing columns:', df_clean.columns.tolist())
df_clean.head()

In [None]:
# Feature selection and train/test split
# Full list of model inputs the preprocessing cell can produce.
candidate_features = [
    'age', 'bmi', 'children', 'is_female', 'is_smoker',
    'region_northwest', 'region_southeast', 'region_southwest',
    'bmi_category_Normal', 'bmi_category_Overweight', 'bmi_category_Obese',
]

# Keep only the candidates that actually exist in df_clean, in the order listed above.
present_columns = set(df_clean.columns)
features = [name for name in candidate_features if name in present_columns]
print('Using features:', features)

# Design matrix and target ('charges'), copied so later cells never touch df_clean.
X = df_clean.loc[:, features].copy()
y = df_clean['charges'].copy()

# Quick check
print('X shape, y shape:', X.shape, y.shape)

# Split: 80% train / 20% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train rows:', len(X_train), 'Test rows:', len(X_test))

In [None]:
# Scale numeric columns (age, bmi, children) using StandardScaler
# Only the numeric columns that are present in X are standardised.
numeric_cols = [col for col in ('age', 'bmi', 'children') if col in X.columns]
print('Numeric columns to scale:', numeric_cols)

scaler = StandardScaler()

# Work on copies so the unscaled X_train / X_test stay available to later cells.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
if numeric_cols:
    # Fit on the training rows only, then apply the same transform to the test rows.
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Show sample
X_train_scaled.head()

In [None]:
# Model selection (GridSearchCV) — a compact grid to keep runtime reasonable
# Each entry maps a display name to (unfitted estimator, hyper-parameter grid).
# An empty grid means the estimator is cross-validated as-is, without tuning.
models = {
    'LinearRegression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'alpha':[0.1,1.0,10.0]}),
    'Lasso': (Lasso(max_iter=5000), {'alpha':[0.001,0.01,0.1]}),
    'RandomForest': (RandomForestRegressor(random_state=42), {'n_estimators':[100,200], 'max_depth':[5,10]}),
    'GradientBoosting': (GradientBoostingRegressor(random_state=42), {'n_estimators':[100], 'learning_rate':[0.05,0.1], 'max_depth':[3,5]})
}

results = {}       # name -> dict of CV / test metrics and the chosen hyper-parameters
best_models = {}   # name -> estimator fitted on the training set

for name, (model, grid) in models.items():
    print('\nRunning:', name)
    if grid:
        # 3-fold grid search; the scorer is negated RMSE, so flip the sign back.
        gs = GridSearchCV(model, grid, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1)
        gs.fit(X_train_scaled, y_train)
        best = gs.best_estimator_
        cv_rmse = -gs.best_score_
        best_params = gs.best_params_
    else:
        # The estimator is fitted explicitly because it is used for prediction below;
        # the cross-validation call is only used for its scores.
        model.fit(X_train_scaled, y_train)
        scores = cross_val_score(model, X_train_scaled, y_train, scoring='neg_root_mean_squared_error', cv=3)
        cv_rmse = -scores.mean()
        best = model
        best_params = {}
    preds = best.predict(X_test_scaled)
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results[name] = {'cv_rmse': round(float(cv_rmse),4), 'test_rmse': round(float(rmse),4),
                     'test_mae': round(float(mae),4), 'test_r2': round(float(r2),4), 'best_params': best_params}
    best_models[name] = best

# One row per model, ordered by test RMSE (lowest first).
results_df = pd.DataFrame(results).T.sort_values(by='test_rmse')
results_df

In [None]:
# Fit the best model (selected by cross-validated RMSE) and show detailed metrics + feature importances (if available)
# The winner is picked on the CV score, which is computed from the training data only.
# Picking it on test RMSE would reuse the test set for model selection and make the
# test metrics reported below optimistically biased.
cv_rmse_by_model = results_df['cv_rmse'].astype(float)
best_name = cv_rmse_by_model.idxmin()
best_model = best_models[best_name]
print('Best model selected:', best_name)
print('Selection criterion: lowest cross-validated RMSE =', cv_rmse_by_model[best_name])
# Ensure fitted on training set
best_model.fit(X_train_scaled, y_train)

# Predictions
train_preds = best_model.predict(X_train_scaled)
test_preds = best_model.predict(X_test_scaled)

def metrics(y_true, y_pred):
    """Return regression quality measures for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    return {'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
            'mae': mean_absolute_error(y_true, y_pred),
            'r2': r2_score(y_true, y_pred)}

print('\nTrain metrics:', metrics(y_train, train_preds))
print('Test metrics:', metrics(y_test, test_preds))

# If model has feature_importances_ attribute, show it
if hasattr(best_model, 'feature_importances_'):
    fi = pd.Series(best_model.feature_importances_, index=X_train_scaled.columns).sort_values(ascending=False)
    print('\nFeature importances:\n', fi)
else:
    print('\nModel does not expose feature_importances_.')

In [None]:
# Save X_test with true and predicted charges to CSV
# Destination of the exported predictions file.
predictions_csv = r"/mnt/data/test_predictions_final.csv"

# Unscaled test features with a fresh 0..n-1 index, plus the observed and predicted
# charges aligned row by row.
X_test_out = X_test.reset_index(drop=True).assign(
    true_charges=y_test.reset_index(drop=True),
    pred_charges=test_preds,
)
X_test_out.to_csv(predictions_csv, index=False)
print('Saved test predictions to:', predictions_csv)
X_test_out.head()

## Short conclusions

- This notebook compared five candidate regressors, selected the best-ranked one (the model-selection cells state the ranking criterion), and reported train/test metrics.
- No joblib/pickle files were created; model is trained in memory.
- Test predictions CSV was saved as `/mnt/data/test_predictions_final.csv`.

Run the notebook cells in order to reproduce results.