In [None]:
import pandas as pd 
import numpy as np
import tensorflow as tf

# Data preprocessing

## Data exploring 

In [None]:
df = pd.read_csv("/content/insurance.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [None]:
df.isnull().values.any()

False

In [None]:
duplicate = df[df.duplicated()]
print("Duplicate Rows :")
duplicate

Duplicate Rows :


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [None]:
df = df.drop(labels=581, axis=0)

data insights on [Kaggle Medical Cost Personal Datasets](https://www.kaggle.com/datasets/mirichoi0218/insurance)

## Data transformation 



1.   normalization 
2.   encode categorical data 



In [None]:
df["age"] =(df["age"]-df["age"].min())/df["age"].max()

In [None]:
df["bmi"] =(df["bmi"]-df["bmi"].min())/df["bmi"].max()

In [None]:
df["charges"] =(df["charges"]-df["charges"].min())/df["charges"].max()

In [None]:
# 0 -> male    1 -> female 	 
df['sex'].replace(['male', 'female'],[0, 1], inplace=True)

In [None]:
# 0 -> yes    1 -> no 
df['smoker'].replace(['yes', 'no'],[0, 1], inplace=True)

In [None]:
region=df["region"].unique()
region

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [None]:
# 'southwest'-> 0  'southeast'-> 1 'northwest'-> 2 'northeast' ->3
df['region'].replace(['southwest', 'southeast', 'northwest', 'northeast'],[0,1,2,3], inplace=True)

In [None]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.015625,1,0.224732,0,0,0,0.247184
1,0.000000,0,0.335216,1,1,1,0.009466
2,0.156250,0,0.320723,3,1,1,0.052181
3,0.234375,0,0.126953,0,1,2,0.327152
4,0.218750,0,0.243177,0,1,2,0.043045
...,...,...,...,...,...,...,...
1333,0.500000,0,0.282515,3,1,2,0.148637
1334,0.000000,1,0.300395,0,1,3,0.017000
1335,0.000000,1,0.393187,0,1,1,0.007965
1336,0.046875,1,0.185206,0,1,0,0.013895


## Pipeline

In [None]:
X = df.values[:,:-1]
Y = df.values[:,-1]

In [None]:
Y.shape

(1337,)

In [None]:
class Pipeline(tf.keras.utils.Sequence):
  def __init__(self,input_x,labels,batch_size,shuffle=True):
    self.x = input_x  # pipeline input 
    self.y = labels   # pipeline output
  
    # The pipeline needs to take ''' batch size ( 8 examples , 16 examples, 32 example, 48 example)
    # and shuffle paremeter [ true - false ] to shuffle or not shuffle the data
    self.batch_size = batch_size  
    self.shuffle    = shuffle 
    self.on_epoch_end()
    
  def __len__(self):
    # This function determines the number of batches
    return int(np.floor(len(self.y) / self.batch_size))

  def __getitem__(self, index):
    # Get the current batch 
    indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
    X, y = self.__get_batch(indexes)
    return X,y

  def on_epoch_end(self):
    self.indexes = np.arange(len(self.x))
    if self.shuffle == True:
      np.random.shuffle(self.indexes)
    

  def __get_batch(self, list_IDs_temp):
    '''
    Does the following three main things:
    1- Create two arrays for input and output with correct shapes
    '''
    
    X = np.empty((self.batch_size, self.x.shape[1]),dtype=np.float32) #working with b&w or ir
    
    y = np.empty((self.batch_size,1))

    # Generate data
    for i, ID in enumerate(list_IDs_temp):
      # Get expression
      X[i,:] = self.x[ID]
      # store label
      y[i,]    = self.y[ID]
    
    return X, y

In [None]:
x_train, y_train = X[:int(.8*len(X))],Y[:int(.8*len(Y))]
x_val  , y_val   = X[int(.8*len(X)):int(9*len(X))],Y[int(.8*len(Y)):int(9*len(Y))]
x_test , y_test  = X[int(.9*len(X)):],Y[int(.9*len(Y)):]

train_generator = Pipeline(input_x= x_train,labels=y_train,batch_size=16,shuffle=True)
validation_generator = Pipeline(input_x= x_val,labels=y_val,batch_size=16,shuffle=True)
test_generator = Pipeline(input_x= x_test,labels=y_test,batch_size=1,shuffle=True)

In [None]:
print(len(list(X)),len(list(Y)))
print(len(list(x_train)),len(list(y_train)))
print(len(list(x_val))  ,len(list(y_val))  )
print(len(list(x_test)) ,len(list(y_test)) )

1337 1337
1069 1069
268 268
134 134


# Model

In [None]:
def Create_model(in_shape):

  input = tf.keras.layers.Input(shape=in_shape,name='input',dtype="float32")
  # 1 - Fully connected layers
  fc_layer = tf.keras.layers.Dense(128, activation="relu", name="dense6")(input)

  fc_layer_2 = tf.keras.layers.Dense(64, activation="relu", name="dense7")(fc_layer)
  dropout  = tf.keras.layers.Dropout(0.2)(fc_layer_2)
 
  fc_layer = tf.keras.layers.Dense(1, activation="relu", name="dense8")(dropout)

  model = tf.keras.models.Model(
      inputs= input, outputs= fc_layer, name="regression",
  )

  # Optimizer
  opt = tf.keras.optimizers.Adam()
  # Compile the model and return
  model.compile(optimizer=opt, loss = "MeanSquaredError",metrics=["cosine_similarity"])
  return model

In [None]:
# Train the model
model = Create_model(X.shape[1])
epochs = 50
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=epochs,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
 1/66 [..............................] - ETA: 1s - loss: 0.0098 - cosine_similarity: 0.9375

KeyboardInterrupt: ignored

In [None]:
model.evaluate(test_generator)



[0.007524648681282997, 0.9776119589805603]