In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

file_path="./insurance.csv"
data=pd.read_csv(file_path)

print("Missing values:\n", data.isnull().sum())
#No missing values found.

X = data.drop("charges", axis=1)
y = data["charges"]

X_train, X_valid, y_train, y_valid=train_test_split(X,y,test_size=0.2,random_state=0)

categorical_features = [col for col in X_train.columns if X_train[col].dtype == "object"]
numerical_features = [col for col in X_train.columns if col not in categorical_features]

from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) #sparse has been changed to sparse_output
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[categorical_features]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[categorical_features]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(categorical_features, axis=1)
num_X_valid = X_valid.drop(categorical_features, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    print(mean_absolute_error(y_valid, preds))
    print(r2_score(y_valid,preds))
    return

score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)

Missing values:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
2630.4977237984763
0.8775876145381236


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load dataset
file_path="./insurance.csv"
data = pd.read_csv(file_path)

print("Missing values:\n", data.isnull().sum())

# Split into features and target
X = data.drop("charges", axis=1)
y = data["charges"]

# Train/test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# Separate categorical and numerical
categorical_features = [col for col in X_train.columns if X_train[col].dtype == "object"]
numerical_features = [col for col in X_train.columns if col not in categorical_features]

# One-hot encode categorical
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[categorical_features]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[categorical_features]))

# One-hot encoding removed index; We put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Drop original categorical
num_X_train = X_train.drop(categorical_features, axis=1)
num_X_valid = X_valid.drop(categorical_features, axis=1)

# Combine numeric + one-hot
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Fix column name types
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

# Standardize the features cuz models like SVR are sensitive to the size of the features.
#Helps in maintaining the contibution by each feature.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(OH_X_train)
X_valid_scaled = scaler.transform(OH_X_valid)

# SVR Model
model = SVR(kernel='linear')
model.fit(X_train_scaled, y_train)

# Predict and evaluate
preds = model.predict(X_valid_scaled)
print("MAE:", mean_absolute_error(y_valid, preds))
print("R² Score:", r2_score(y_valid, preds))


Missing values:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
MAE: 7763.940217836734
R² Score: 0.06737368052555748
