In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from utils.encoding import encode
from utils.preprocessing import preprocess
from utils.scaling import scale


preprocess, encode, scale

In [5]:
csv_path = Path('data/raw/insurance.csv')
labels_to_drop = ['region']
save_path = Path('data/processed/insurance_processed.csv')

processed_csv_path = Path('data/processed/insurance_processed.csv')
encodings_save_path = Path('data/encoded/encodings.json')
encoded_save_path = Path('data/encoded/insurance_encoded.csv')

encoded_csv_path = Path('data/encoded/insurance_encoded.csv')
stats_save_path = Path('data/scaled/stats.json')
scaled_save_path = Path('data/scaled/insurance_scaled.csv')

In [6]:
preprocess(
    csv_path=csv_path,
    labels_to_drop=labels_to_drop,
    save_path=save_path
)


In [7]:
encode(
    csv_path=processed_csv_path,
    csv_save_path=encoded_save_path,
    json_save_path=encodings_save_path
)

In [8]:
scale(
    csv_path=encoded_csv_path,
    csv_save_path=scaled_save_path,
    json_save_path=stats_save_path
)

train

In [9]:
with open('data/scaled/stats.json', 'r') as file:
    stats = json.load(file)

In [10]:
df = pd.read_csv(scaled_save_path)

In [11]:
X = df.drop(columns=["charges"])
y = df["charges"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# Instantiation
xgb_r = xg.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 10, seed = 42)
 
# Fitting the model
xgb_r.fit(X_train.values, y_train.values)
 
# Predict the model
pred = xgb_r.predict(X_test)
 
# RMSE Computation
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))
abs_error = rmse*stats['charges']['std'] + stats['charges']['std']
print(f"abs_error : {abs_error} $")


RMSE :  0.371959
abs_error : 16614.922977070735 $


inference

In [None]:
with open('data/scaled/stats.json', 'r') as file:
    stats = json.load(file)

age = 35 # years old
height = 1.78 # m
weight = 75 # kg
bmi = weight / (height**2)
children = 2
smoker = 1
gender = 0

age = (age - stats["age"]["mean"]) / stats["age"]["std"]
bmi = (bmi - stats["bmi"]["mean"]) / stats["bmi"]["std"]
children = (children - stats["children"]["mean"]) / stats["children"]["std"]

inputs = [age, bmi, children, smoker, gender]

prediction = xgb_r.predict(pd.DataFrame(inputs).to_numpy().T)
prediction = prediction * stats["charges"]["std"] + stats["charges"]["mean"]
prediction