In [1]:
# Import files
file_url = "https://raw.githubusercontent.com/aso-uts/labs_datasets/main/36120-adv_mla/lab01/insurance.csv"


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Import Packages
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv(file_url)

In [5]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,female,33.820,0,no,southeast,1630.66170
1,19,female,23.480,1,no,southeast,1836.80430
2,46,male,30.570,2,no,southeast,6632.35130
3,54,male,32.050,1,yes,southeast,31922.42950
4,21,male,21.345,4,no,northeast,1638.37255
...,...,...,...,...,...,...,...
49995,50,female,28.880,2,no,southeast,10306.54830
49996,58,male,36.110,2,no,northwest,4716.96000
49997,28,male,24.195,1,yes,northeast,29703.17305
49998,57,male,28.600,2,no,southwest,10035.39600


In [6]:
df.shape

(50000, 7)

In [7]:
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,50000.0,50000,50000.0,50000.0,50000,50000,50000.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,25176,,,38976,14197,
mean,39.46312,,30.713734,1.11376,,,13343.216363
std,14.117142,,6.092727,1.212835,,,12131.222744
min,18.0,,17.291,0.0,,,1137.5359
25%,27.0,,26.6,0.0,,,4694.4318
50%,40.0,,30.3,1.0,,,9399.232775
75%,51.0,,34.57,2.0,,,17340.746925


In [8]:
## Prepare Data
df_cleaned = df.copy()

In [9]:
target = df_cleaned.pop('charges')


In [10]:
# Create a list of numerical columns and categorical columns
num_cols = list(df_cleaned.select_dtypes('number').columns)
cat_cols = list(set(df_cleaned.columns) - set(num_cols))

In [11]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [12]:
ohe = OneHotEncoder(sparse_output=False, drop='first')

In [13]:
features = ohe.fit_transform(df_cleaned[cat_cols])

In [14]:
# Convert features into DataFrame
features = pd.DataFrame(features, columns=ohe.get_feature_names_out())

In [15]:
features

Unnamed: 0,sex_male,region_northwest,region_southeast,region_southwest,smoker_yes
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
49995,0.0,0.0,1.0,0.0,0.0
49996,1.0,1.0,0.0,0.0,0.0
49997,1.0,0.0,0.0,0.0,1.0
49998,1.0,0.0,0.0,1.0,0.0


In [16]:
scaler = StandardScaler()

In [17]:
# Scale numerical columns
features[num_cols] = scaler.fit_transform(df_cleaned[num_cols])

In [18]:
# Import dump
from joblib import dump

In [20]:
# Save the one-hot encoder and scaler into the folder models and call the files respectively ohe.joblib and scaler.joblib
dump(ohe, 'adv_mla_lab_1/models/ohe.joblib')
dump(scaler, 'adv_mla_lab_1/models/scaler.joblib')

['adv_mla_lab_1/models/scaler.joblib']

In [21]:
# Split Dataset
from sklearn.model_selection import train_test_split

In [22]:
X_data, X_test, y_data, y_test = train_test_split(features, target, test_size=0.2, random_state=8)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=8)

In [23]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(32000, 8)
(8000, 8)
(10000, 8)


In [24]:
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(32000,)
(8000,)
(10000,)


In [25]:
# Save the sets into the folder data/processed
X_train.to_csv('adv_mla_lab_1/data/processed/X_train.csv', index=False)
X_val.to_csv('adv_mla_lab_1/data/processed/X_val.csv', index=False)
X_test.to_csv('adv_mla_lab_1/data/processed/X_test.csv', index=False)
y_train.to_csv('adv_mla_lab_1/data/processed/y_train.csv', index=False)
y_val.to_csv('adv_mla_lab_1/data/processed/y_val.csv', index=False)
y_test.to_csv('adv_mla_lab_1/data/processed/y_test.csv', index=False)

In [26]:
# Get a baseline model
pred_value = y_train.mean()

In [27]:
y_base = np.full((len(y_train), 1), pred_value)

In [28]:
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae

In [29]:
print(rmse(y_base, y_train))
print(mae(y_base, y_train))

12116.584822448176
9118.852804794265
