In [1]:

import pandas as pd

# Load dataset
DATA_PATH = "../data/insurance.csv"
df = pd.read_csv(DATA_PATH)

# Separate features and target
X = df.drop(columns=["charges"])
y = df["charges"]

print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (1338, 6)
Target shape: (1338,)


In [2]:

# Identify feature types
numerical_features = ["age", "bmi", "children"]
categorical_features = ["sex", "smoker", "region"]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)


Numerical features: ['age', 'bmi', 'children']
Categorical features: ['sex', 'smoker', 'region']


In [3]:

# Missing value check
missing = df.isnull().sum()
print("Missing values per column:")
print(missing)


Missing values per column:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [4]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical pipeline (scaled)
num_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Categorical pipeline (NOT scaled)
cat_pipeline = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_features),
        ("cat", cat_pipeline, categorical_features)
    ]
)

print("Preprocessing pipeline created successfully")


Preprocessing pipeline created successfully


In [5]:

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

print("Processed feature matrix shape:", X_processed.shape)


Processed feature matrix shape: (1338, 11)


In [7]:

import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(preprocessor, "../models/preprocessor.pkl")
joblib.dump(X_processed, "../models/X_processed.pkl")
joblib.dump(y, "../models/y.pkl")

print("Artifacts saved:")
print("- preprocessor.pkl")
print("- X_processed.pkl")
print("- y.pkl")


Artifacts saved:
- preprocessor.pkl
- X_processed.pkl
- y.pkl


In [8]:
import pandas as pd

# Load cleaned dataset from Phase M4
df = pd.read_csv("data/insurance_cleaned.csv")

df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/insurance_cleaned.csv'