In [114]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

In [15]:
df = pd.read_csv("medical_insurance.csv")

# Preprocessing the data and splitting


In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
df.duplicated().sum()

np.int64(1435)

In [18]:
df.shape

(2772, 7)

In [19]:
df.sort_values(['charges'])

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
2759,18,male,23.210,0,no,southeast,1121.87390
2326,18,male,23.210,0,no,southeast,1121.87390
940,18,male,23.210,0,no,southeast,1121.87390
1373,18,male,23.210,0,no,southeast,1121.87390
2194,18,male,30.140,0,no,southeast,1131.50660
...,...,...,...,...,...,...,...
2616,52,male,34.485,3,yes,northwest,60021.39897
2686,45,male,30.360,0,yes,southeast,62592.87309
1300,45,male,30.360,0,yes,southeast,62592.87309
543,54,female,47.410,0,yes,southeast,63770.42801


In [20]:
df= df.drop_duplicates()

In [21]:
df.shape

(1337, 7)

In [23]:
df.sample(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1078,28,male,31.68,0,yes,southeast,34672.1472
191,36,female,26.2,0,no,southwest,4883.866
1058,24,female,39.49,0,no,southeast,2480.9791
587,34,female,30.21,1,yes,northwest,43943.8761
845,60,female,32.45,0,yes,southeast,45008.9555
1156,19,male,44.88,0,yes,southeast,39722.7462
1250,24,male,29.83,0,yes,northeast,18648.4217
978,45,female,39.995,3,no,northeast,9704.66805
1123,27,female,32.395,1,no,northeast,18903.49141
1292,21,male,23.21,0,no,southeast,1515.3449


In [24]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [25]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [26]:
df["sex"] = np.where(df["sex"] == "female", 0, 1)

In [27]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [28]:
df["smoker"] = np.where(df["smoker"] == "yes", 1, 0)

In [29]:
df_encoded = pd.get_dummies(df,columns=['region'])

In [30]:
for col in ['region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']:
    df_encoded[col] = df_encoded[col].astype(int)

display(df_encoded.head())

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [31]:
df_encoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [32]:
df_copy = df_encoded.copy()
df = df_encoded

In [33]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [34]:
X = df.drop(columns=['charges'])

In [35]:
y = df['charges']

In [36]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


In [37]:
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [38]:
df[df['charges'] < 0] # just to confirm that the target does not have any negative values

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest


In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [40]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2, random_state=42)

In [41]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
1114,23,1,24.510,0,0,1,0,0,0
968,21,1,25.745,2,0,1,0,0,0
599,52,0,37.525,2,0,0,1,0,0
170,63,1,41.470,0,0,0,0,1,0
275,47,0,26.600,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
1096,51,0,34.960,2,1,1,0,0,0
1131,27,1,45.900,2,0,0,0,0,1
1295,20,1,22.000,1,0,0,0,0,1
861,38,0,28.000,3,0,0,0,0,1


In [42]:
y_train

Unnamed: 0,charges
1114,2396.09590
968,3279.86855
599,33471.97189
170,13405.39030
275,9715.84100
...,...
1096,44641.19740
1131,3693.42800
1295,1964.78000
861,7151.09200


In [43]:
scaler = StandardScaler()
# scailing X using standard scaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
# applying log transformation on target columns since it's highly right skewed and has no negative values
y_train_transformed = np.log1p(y_train)
y_test_transformed = np.log1p(y_test)

In [45]:
X_train_scaled

array([[-1.1576804 ,  0.97140947, -0.99692768, ..., -0.57266946,
        -0.60581158, -0.57410974],
       [-1.30061876,  0.97140947, -0.79276204, ..., -0.57266946,
        -0.60581158, -0.57410974],
       [ 0.91492586, -1.029432  ,  1.15466402, ...,  1.74620801,
        -0.60581158, -0.57410974],
       ...,
       [-1.37208794,  0.97140947, -1.4118716 , ..., -0.57266946,
        -0.60581158,  1.74182728],
       [-0.08564268, -1.029432  , -0.41997378, ..., -0.57266946,
        -0.60581158,  1.74182728],
       [-0.30005022, -1.029432  ,  0.87941237, ..., -0.57266946,
         1.65067825, -0.57410974]])

In [48]:
y_train_transformed

Unnamed: 0,charges
1114,7.782013
968,8.095863
599,10.418494
170,9.503487
275,9.181616
...,...
1096,10.706435
1131,8.214581
1295,7.583644
861,8.875160


In [47]:
X_train_scaled.shape

(1069, 9)

# Training the model

In [49]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [56]:
model = Sequential()

In [57]:
model.add(Dense(5, activation='relu', input_dim=9))
model.add(Dense(4, activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(1,activation='linear'))

In [58]:
model.summary()

In [59]:
model.compile(optimizer='adam',loss='mse',metrics=['mse'])

In [62]:
model.fit(X_train_scaled,y_train_transformed,epochs=200,validation_data=(X_test_scaled,y_test_transformed))

Epoch 1/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1409 - mse: 0.1409 - val_loss: 0.1229 - val_mse: 0.1229
Epoch 2/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1476 - mse: 0.1476 - val_loss: 0.1227 - val_mse: 0.1227
Epoch 3/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1634 - mse: 0.1634 - val_loss: 0.1243 - val_mse: 0.1243
Epoch 4/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1481 - mse: 0.1481 - val_loss: 0.1255 - val_mse: 0.1255
Epoch 5/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1494 - mse: 0.1494 - val_loss: 0.1253 - val_mse: 0.1253
Epoch 6/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1440 - mse: 0.1440 - val_loss: 0.1216 - val_mse: 0.1216
Epoch 7/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0

<keras.src.callbacks.history.History at 0x78ec39d86720>

In [65]:
y_pred_transformed = model.predict(X_test_scaled)
y_pred = np.expm1(y_pred_transformed)  # exp(y) - 1, reverses log1p

# Compute metrics on original scale
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_pred, y_test)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [66]:
mae

2665.6805229771167

In [67]:
row = X_train_scaled[0]
pred = model.predict(row.reshape(1,-1))
ypred = np.expm1(pred)
ypred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


array([[2498.7856]], dtype=float32)

In [68]:
original_values = scaler.inverse_transform(row.reshape(1, -1))

In [69]:
original_values

array([[ 2.30000000e+01,  1.00000000e+00,  2.45100000e+01,
         0.00000000e+00,  2.77555756e-17,  1.00000000e+00,
        -2.77555756e-17,  0.00000000e+00,  0.00000000e+00]])

In [70]:
df_copy.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [71]:
df[df['bmi'] == 36.85]

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
161,18,0,36.85,0,1,36149.4835,0,0,1,0
478,21,1,36.85,0,0,1534.3045,0,0,1,0
903,49,1,36.85,0,0,8125.7845,0,0,1,0
997,63,0,36.85,0,0,13887.9685,0,0,1,0
1335,18,0,36.85,0,0,1629.8335,0,0,1,0


My actual output and the model's output is very very close. let's try that with more data.

In [72]:
row = X_train_scaled[752]
pred = model.predict(row.reshape(1,-1))
ypred = np.expm1(pred)
ypred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


array([[2330.5247]], dtype=float32)

In [73]:
original_values = scaler.inverse_transform(row.reshape(1, -1))
original_values

array([[ 2.30000000e+01,  0.00000000e+00,  2.80000000e+01,
         0.00000000e+00,  2.77555756e-17,  0.00000000e+00,
        -2.77555756e-17,  0.00000000e+00,  1.00000000e+00]])

In [74]:
df[df['bmi'] == 32.12]['charges']

Unnamed: 0,charges
564,2801.2588


In [None]:
pip install keras-tuner --upgrade

In [76]:
import keras_tuner as kt
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


def build_model(hp):

  model = Sequential()
  # Input layer
  model.add(Dense(
      units = hp.Int('input_units', min_value = 16, max_value=256, step=16),
      activation='relu',
      input_shape=(X_train_scaled.shape[1],)
  ))

  # Tuning number of hidden layers
  for i in range(hp.Int('num_layers', 1,4)):
    model.add(Dense(
        units = hp.Int(f'units{i}', min_value=16, max_value=256, step = 16),
        activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    ))

  # output layer

  model.add(Dense(1, activation='linear'))

  model.compile(
      optimizer=Adam(hp.Float('learning_rate',0.0001,0.001, sampling='log')),
      loss='mse',
      metrics=['mae']
  )

  return model


tuner = kt.RandomSearch(
    build_model,
    objective='val_mae',
    max_trials=30,
    executions_per_trial=1,
    directory='tuner_results',
    project_name='insurance_cost_results'
)

# early stoping callback
early_stop = EarlyStopping(monitor='val_mae', patience=10, restore_best_weights=True)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [77]:
tuner.search(
    X_train_scaled,y_train_transformed,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stop],
    validation_data=(X_test_scaled,y_test_transformed)
)

Trial 30 Complete [00h 00m 08s]
val_mae: 0.22451399266719818

Best val_mae So Far: 0.1667594164609909
Total elapsed time: 00h 07m 11s


In [78]:
best_hp = tuner.get_best_hyperparameters()[0]
print(best_hp.values)

{'input_units': 160, 'num_layers': 3, 'units0': 256, 'activation': 'tanh', 'learning_rate': 0.0006427827530702848, 'units1': 32, 'units2': 128, 'units3': 64}


In [79]:

best_model = tuner.get_best_models()[0]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [80]:
best_model.fit(X_train_scaled,y_train_transformed,epochs=250, initial_epoch=50,validation_data=(X_test_scaled,y_test_transformed))

Epoch 51/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - loss: 0.1518 - mae: 0.2524 - val_loss: 0.1242 - val_mae: 0.2377
Epoch 52/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1574 - mae: 0.2295 - val_loss: 0.1121 - val_mae: 0.1679
Epoch 53/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1419 - mae: 0.2066 - val_loss: 0.1228 - val_mae: 0.1774
Epoch 54/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1531 - mae: 0.2127 - val_loss: 0.1131 - val_mae: 0.2028
Epoch 55/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1417 - mae: 0.2108 - val_loss: 0.1102 - val_mae: 0.2045
Epoch 56/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1518 - mae: 0.2289 - val_loss: 0.1138 - val_mae: 0.2222
Epoch 57/250
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step -

<keras.src.callbacks.history.History at 0x78ec1554a2d0>

In [102]:
best_model.evaluate(X_test_scaled, y_test_transformed)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1290 - mae: 0.2057 


[0.1630263477563858, 0.2270517200231552]

In [209]:
y_pred_transformed_tuned = best_model.predict(X_test_scaled)
y_pred_tuned = np.expm1(y_pred_transformed_tuned)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [210]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred_tuned)
print(f"R2 Score with tuned modle: {r2}")

R2 Score with tuned model: 0.8783349677311486


array([[23.   ,  1.   , 24.51 , ..., -0.   ,  0.   ,  0.   ],
       [21.   ,  1.   , 25.745, ..., -0.   ,  0.   ,  0.   ],
       [52.   ,  0.   , 37.525, ...,  1.   ,  0.   ,  0.   ],
       ...,
       [20.   ,  1.   , 22.   , ..., -0.   ,  0.   ,  1.   ],
       [38.   ,  0.   , 28.   , ..., -0.   ,  0.   ,  1.   ],
       [35.   ,  0.   , 35.86 , ..., -0.   ,  1.   ,  0.   ]])

In [208]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.843236506532191

# Some manual calculations

In [203]:
X_train_scaled[756].shape

(9,)

In [204]:
scaler.inverse_transform(X_train_scaled[756].reshape(1,-1))

array([[18.   ,  0.   , 35.625,  0.   ,  0.   ,  1.   , -0.   ,  0.   ,
         0.   ]])

In [90]:
X_train_scaled[756].reshape(1,-1).shape

(1, 9)

In [91]:
X_train_scaled[756].reshape(1,-1)

array([[-1.51502631, -1.029432  ,  0.84056304, -0.90790804, -0.50029231,
         1.79591103, -0.57266946, -0.60581158, -0.57410974]])

In [193]:
row = X_train_scaled[433]
pred = best_model.predict(row.reshape(1,-1))
ypred = np.expm1(pred)
ypred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


array([[2050.7017]], dtype=float32)

In [194]:
prediction_value = ypred[0][0]

In [195]:
original_values = scaler.inverse_transform(row.reshape(1, -1))
original_values

array([[28.  ,  1.  , 38.06,  0.  ,  0.  ,  0.  , -0.  ,  1.  ,  0.  ]])

In [196]:
pointer = original_values.reshape(-1,1)

In [197]:
pointer

array([[28.  ],
       [ 1.  ],
       [38.06],
       [ 0.  ],
       [ 0.  ],
       [ 0.  ],
       [-0.  ],
       [ 1.  ],
       [ 0.  ]])

In [198]:
pointer[0]

array([28.])

In [199]:
a = pointer[0]

In [200]:
true_charges = df[(df['bmi'] == pointer[2][0]) & (df['age'] == pointer[0][0])]['charges'].values[0]

In [201]:
print(prediction_value)
print(true_charges)
print(true_charges-prediction_value)
print((prediction_value/true_charges)*100)

2050.7017
2689.4954
638.7937398437498
76.24856544302884
