In [None]:
import pandas as pd
import numpy as np

# Load the healthcare insurance dataset into a DataFrame and preview its first five rows.
# Data source: https://www.kaggle.com/datasets/willianoliveiragibin/healthcare-insurance
# The relative path means 'insurance.csv' must be in the notebook's working directory.
insurance_df = pd.read_csv(filepath_or_buffer='insurance.csv')
insurance_df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
# Report the smallest and largest value of every continuous column, so that
# implausible values (possible outliers) are easy to spot by eye.
for column in ('age', 'bmi', 'children', 'charges'):
    values = insurance_df[column]
    print(f"Minimum value for '{column}' column: {values.min()!s}")
    print(f"Maximum value for '{column}' column: {values.max()!s}")

print("-----------------")
# For every categorical column, list its distinct values and how many there are.
for column in ('sex', 'smoker', 'region'):
    categories = insurance_df[column].unique()
    print(f"Unique values ({len(categories)!s}) for '{column}' column: \n{categories!s}")

Minimum value for 'age' column: 18
Maximum value for 'age' column: 64
Minimum value for 'bmi' column: 15.96
Maximum value for 'bmi' column: 53.13
Minimum value for 'children' column: 0
Maximum value for 'children' column: 5
Minimum value for 'charges' column: 1121.8739
Maximum value for 'charges' column: 63770.42801
-----------------
Unique values (2) for 'sex' column: 
['female' 'male']
Unique values (2) for 'smoker' column: 
['yes' 'no']
Unique values (4) for 'region' column: 
['southwest' 'southeast' 'northwest' 'northeast']


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous feature columns ('age', 'bmi', 'children') to have
# a mean of 0 and a standard deviation of 1. The result goes into scaled_df;
# insurance_df itself is left untouched.
# NOTE(review): the scaler is fit on the full dataset, before any train/test split,
# so rows that later land in a test set influence the scaling statistics.
# Consider fitting on the training rows only.
standard_scaler = StandardScaler()

scaled_df = insurance_df.copy()
print(scaled_df['age'].head())

# Saving all means and standard deviations for later.
# standard_scaler is re-fit for every column, so once the loop finishes it only
# remembers the statistics of the last column. scaler_stats keeps each column's
# mean and standard deviation (population std, as used by StandardScaler) so a
# scaled value can be mapped back with: original = scaled * std + mean.
scaler_stats = {}
for column in ['age', 'bmi', 'children']:
    scaled_df[column] = standard_scaler.fit_transform(scaled_df[[column]])
    scaler_stats[column] = {
        'mean': float(standard_scaler.mean_[0]),
        'std': float(standard_scaler.scale_[0]),
    }

print(scaled_df['age'].head())

0    19
1    18
2    28
3    33
4    32
Name: age, dtype: int64
0   -1.438764
1   -1.509965
2   -0.797954
3   -0.441948
4   -0.513149
Name: age, dtype: float64


In [46]:
# One-hot encode the categorical columns ('region', 'sex', 'smoker'): each category
# becomes its own indicator column holding 1 when the row belongs to that category
# and 0 otherwise.
print("First row: \n", str(scaled_df.head(1)))

# drop_first=True omits the indicator for the first category of every column.
# That category is still recoverable (all remaining indicators are 0), and leaving
# it out avoids feeding perfectly redundant columns to the models.
# dtype=int yields 0/1 integers directly instead of True/False.
encoded_df = pd.get_dummies(
    scaled_df,
    columns=['region', 'sex', 'smoker'],
    drop_first=True,
    dtype=int,
)

print("\n\n\nNew first row: \n", str(encoded_df.head(1)))

First row: 
         age     sex      bmi  children smoker     region    charges
0 -1.438764  female -0.45332 -0.908614    yes  southwest  16884.924



New first row: 
         age      bmi  children    charges  region_northwest  region_southeast   
0 -1.438764 -0.45332 -0.908614  16884.924                 0                 0  \

   region_southwest  sex_male  smoker_yes  
0                 1         0           1  


In [None]:
from sklearn.model_selection import train_test_split

# 'charges' is the prediction target (y); every other column is a feature (X).
y = encoded_df['charges']
X = encoded_df.drop('charges', axis='columns')

# Hold out 20% of the rows (test_size=0.2) as the testing set and train on the
# remaining 80%. random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [62]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


# Compare 3 different models on the same train/test split: LinearRegression,
# RandomForestRegressor, and GradientBoostingRegressor.
# Every model is scored with mean absolute error (MAE) on the test set, i.e. the
# average absolute difference between the predicted and the actual charges.

# The two tree ensembles are stochastic; random_state=42 makes the reported
# errors reproducible from one run of the notebook to the next.
lr_model = LinearRegression()
rfr_model = RandomForestRegressor(max_depth=5, random_state=42)
gbr_model = GradientBoostingRegressor(random_state=42)

# models, y_preds and errors are parallel lists: index i of each refers to the same model.
models = [lr_model, rfr_model, gbr_model]
model_names = ["Linear Regression", "Random Forest Regression", "Gradient Boosting Regression"]
y_preds = []
errors = []
for model in models:
    # Fit the model to the training features and their labels (charges).
    model.fit(X_train, y_train)
    # Predict the charges for the held-out features in X_test.
    y_pred = model.predict(X_test)
    # Mean absolute error between the actual charges (y_test) and the predictions (y_pred).
    error = mean_absolute_error(y_test, y_pred)
    errors.append(error)
    y_preds.append(y_pred)

# Report each model's error next to its name (pairs are matched by position).
for model_name, model_error in zip(model_names, errors):
    print("Error of " + model_name + ": " + str(model_error))

Error of Linear Regression: 4181.194473753652
Error of Random Forest Regression: 2538.100750875908
Error of Gradient Boosting Regression: 2444.001275820506
