In [22]:
""" Cited source:
    - https://www.kaggle.com/code/sadafpj/insurance-prediction-using-regression-regulation
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  
from sklearn.linear_model import LinearRegression

In [23]:
# Dataset contains age, sex, BMI, how many children, if person smokes, region, and medical costs.
# We're attempting to predict the insurance costs.

# Read dataset using pandas.
data = pd.read_csv("C:\\Users\\gaibo\\OneDrive\\Escritorio\\Python Projects\\CS_Alberto\\sklearn-umb-cs-workshop\\datasets\\insurance.csv")
print(data.head)


data_frame_insurance = pd.DataFrame(data)

<bound method NDFrame.head of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]>


In [24]:

cols = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest', 'region_southeast', 'region_southwest']

# Convert categorical features into numerical features, as ML models can't work with categorical figs
# (e.g., red, big, True, etc). We want to encode the columns with the categorical values using one-hot encoding.
data_frame_insurance_converted = pd.get_dummies(data_frame_insurance, 
                                                columns = ['sex', 'smoker', 'region'], 
                                                drop_first = True)


data_frame_insurance_converted = data_frame_insurance_converted.astype(int)

In [25]:
# Select X and Y.
X = pd.DataFrame(data_frame_insurance_converted, columns = cols)
Y = data_frame_insurance_converted['charges']

# Split data into train and test set.
x_train, x_test, y_train, y_test = train_test_split(X, Y, 
                                                    train_size = 0.7, 
                                                    test_size = 0.3)

In [26]:
# Build model.
model = LinearRegression()

In [34]:
# Scale the train and test values for X.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train linear regression model.
model.fit(x_train_scaled, y_train)

# Predict against x_test_scaled.
y_pred = model.predict(x_test_scaled)

In [35]:
# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2) Score: {r2:.2f}")


Mean Absolute Error (MAE): 4028.97
Mean Squared Error (MSE): 33169223.16
Root Mean Squared Error (RMSE): 5759.27
R-squared (R2) Score: 0.73
