# Predicting Medical Insurance using Artificial Neural Network (Regression)
### 18z213 - Bhavadharinie S
### 18z218 - Gayathri B
### 18z227 - Kripaa Harikumar
### 18z237 - Praja S S
### 18z252 - Sneha S

### Importing the libraries

In [1]:
import numpy as np #For working with Arrays
import pandas as pd #For Data Analysis and Manipulation
import tensorflow as tf #For Developing and Training Deep Learning Model

##  Data Preprocessing

### Importing the dataset

In [2]:
dataset = pd.read_csv('insurance.csv') #Imports dataset using read_csv function of pandas library
X = dataset.iloc[:, :-1].values #Extracts all columns excluding last column (Dependent Variable) 
y = dataset.iloc[:, -1].values #Extracts the Dependent Variable

### Encoding Categorical data

In [3]:
from sklearn.compose import ColumnTransformer #Imports ColumnTransformer class from compose module of sklearn library 
from sklearn.preprocessing import OneHotEncoder #Imports OneHotEncoder class from preprocessing module of sklearn library
ct1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough') #Performs One Hot Encoding for the first column alone (sex)
X = np.array(ct1.fit_transform(X)) #Returns a new array with new feature columns
X[0] #Prints the first record in encoded format

array([1.0, 0.0, 19, 27.9, 0, 'yes', 'southwest'], dtype=object)

In [4]:
ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [6])], remainder='passthrough') #Performs One Hot Encoding for the sixth column alone (region)
X = np.array(ct2.fit_transform(X)) #Returns a new array with new feature columns
X[0] #Prints the first record in encoded format

array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 19, 27.9, 0, 'yes'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder #Imports LabelEncoder class from preprocessing module of sklearn library
le = LabelEncoder() #Creates an object for label encoder class
X[:,9] = le.fit_transform(X[:,9]) #Label encodes the ninth column (smoker) by assigning 1 for smoker and 0 for non-smoker
X[0] #Prints the first record in encoded format

array([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 19, 27.9, 0, 1], dtype=object)

In [6]:
X = np.asarray(X).astype(np.float32) #Converts the entire array as floating data
X[0] #Prints the first record in final encoded format

array([ 0. ,  0. ,  0. ,  1. ,  1. ,  0. , 19. , 27.9,  0. ,  1. ],
      dtype=float32)

### Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split #Imports train_test_split class from model_selection module of sklearn library
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) #Splits the dataset into four sets (20% - test set, 80% - train set)

In [8]:
X_train #Prints X_train

array([[ 0.  ,  0.  ,  0.  , ..., 34.1 ,  4.  ,  1.  ],
       [ 0.  ,  0.  ,  1.  , ..., 34.43,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 36.67,  2.  ,  1.  ],
       ...,
       [ 0.  ,  0.  ,  1.  , ..., 25.08,  0.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 35.53,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ..., 18.5 ,  1.  ,  0.  ]], dtype=float32)

In [9]:
y_train #Prints y_train

array([40182.246 ,  1137.4697, 38511.6283, ...,  5415.6612,  1646.4297,
        4766.022 ])

In [10]:
X_test #Prints x_test

array([[ 0.   ,  0.   ,  0.   , ..., 30.2  ,  1.   ,  0.   ],
       [ 0.   ,  0.   ,  1.   , ..., 29.37 ,  1.   ,  0.   ],
       [ 0.   ,  1.   ,  0.   , ..., 40.565,  2.   ,  1.   ],
       ...,
       [ 1.   ,  0.   ,  0.   , ..., 40.28 ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  1.   , ..., 39.05 ,  3.   ,  1.   ],
       [ 1.   ,  0.   ,  0.   , ..., 24.795,  3.   ,  0.   ]],
      dtype=float32)

In [11]:
y_test #Prints y_test

array([ 9724.53    ,  8547.6913  , 45702.02235 , 12950.0712  ,
        9644.2525  ,  4500.33925 ,  2198.18985 , 11436.73815 ,
        7537.1639  ,  5425.02335 ,  6753.038   , 10493.9458  ,
        7337.748   ,  4185.0979  , 18310.742   , 10702.6424  ,
       12523.6048  ,  3490.5491  ,  6457.8434  , 33475.81715 ,
       23967.38305 , 12643.3778  , 23045.56616 , 23065.4207  ,
        1674.6323  ,  4667.60765 ,  3732.6251  ,  7682.67    ,
        3756.6216  ,  8413.46305 ,  8059.6791  , 48970.2476  ,
       12979.358   , 20630.28351 , 14571.8908  ,  4137.5227  ,
        8347.1643  , 51194.55914 , 40003.33225 ,  1880.487   ,
        5458.04645 ,  2867.1196  , 20149.3229  , 47496.49445 ,
       36149.4835  , 26018.95052 , 19749.38338 ,  6940.90985 ,
        4718.20355 , 22192.43711 ,  2899.48935 , 18838.70366 ,
       23568.272   , 46255.1125  , 24227.33724 ,  3268.84665 ,
        2322.6218  ,  8827.2099  , 14478.33015 , 13112.6048  ,
        1253.936   , 46718.16325 , 13919.8229  ,  9630.

## Building the ANN

### Initializing the ANN

In [12]:
ann = tf.keras.models.Sequential() #Arranges keras layers in sequential order

### Adding the input layer 
- The rectified linear activation function or 'relu' is a linear function that will output the input if it is positive, otherwise output will be zero. Φ(x)=max(x,0)

In [13]:
ann.add(tf.keras.layers.Dense(units=120, activation='relu')) #Adds the input layer with 120 hidden neurons and rectified linear activation function

### Adding the first hidden layer 

In [14]:
ann.add(tf.keras.layers.Dense(units=60, activation='relu')) #Adds the first hidden layer with 60 hidden neurons and rectified linear activation function

### Adding the second hidden layer

In [15]:
ann.add(tf.keras.layers.Dense(units=60, activation='relu')) #Adds the second hidden layer with 60 hidden neurons and rectified linear activation function

### Adding the output layer

In [16]:
ann.add(tf.keras.layers.Dense(units=1)) #Adds the output layer with one output (Insurance)

## Training the ANN

### Compiling the ANN
- Error backpropation using the cost function-Mean Squared Error is the average squared error between the estimated value and the actual value.

In [17]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error') #Optimiser 'adam' performs stocastic gradient descent with cost function as mean squared error

### Training the ANN model on the Training set

In [18]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 1000) #Fits 32 records in 1 pass recursively with 1000 epochs.Each epoch is one forward and backward pass for all the training records. 

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x230246a83c8>

### Predicting the results of the Test set

In [19]:
import sklearn.metrics as sm #Imports metrics from sklearn library
y_pred = ann.predict(X_test) #Predicts the output for each record in the test set

In [20]:
np.set_printoptions(precision=2) #Sets precision of print statement to 2
print(round(sm.r2_score(y_test, y_pred), 2)) #Prints the R2 Score i.e.,a Statistical measure of how close the data is to the fitted regression line. 

0.87


In [21]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)) #Concatenates the predicted and actual values of X_test to compare the results

[[11135.5   9724.53]
 [10183.35  8547.69]
 [50612.03 45702.02]
 [14306.98 12950.07]
 [ 9966.88  9644.25]
 [ 6085.1   4500.34]
 [ 4171.09  2198.19]
 [14522.82 11436.74]
 [ 9691.33  7537.16]
 [ 8389.64  5425.02]
 [ 7058.16  6753.04]
 [11160.32 10493.95]
 [ 8092.18  7337.75]
 [ 5860.87  4185.1 ]
 [25602.45 18310.74]
 [14259.54 10702.64]
 [13983.09 12523.6 ]
 [ 5564.84  3490.55]
 [ 8337.87  6457.84]
 [26847.32 33475.82]
 [26400.57 23967.38]
 [15266.75 12643.38]
 [12367.31 23045.57]
 [29815.25 23065.42]
 [ 3929.98  1674.63]
 [ 7051.8   4667.61]
 [ 4118.8   3732.63]
 [ 8459.68  7682.67]
 [ 5346.68  3756.62]
 [10477.89  8413.46]
 [10008.2   8059.68]
 [55858.24 48970.25]
 [13609.58 12979.36]
 [10934.59 20630.28]
 [14807.87 14571.89]
 [ 6384.59  4137.52]
 [ 9036.15  8347.16]
 [38274.59 51194.56]
 [38566.7  40003.33]
 [ 2239.98  1880.49]
 [ 6257.62  5458.05]
 [ 5123.19  2867.12]
 [26810.77 20149.32]
 [50236.34 47496.49]
 [33729.3  36149.48]
 [ 6252.74 26018.95]
 [14166.78 19749.38]
 [ 8184.47  6

### Predicting insurance value for user input

In [24]:
l=[] #Creates an empty input list
dict_sex={'Male':[0.0,1.0],'Female':[1.0,0.0]} #Maps gender to its encoded value using dictionary
dict_region={'Southwest':[1.0,0.0,0.0,0.0],'Southeast':[0.0,1.0,0.0,0.0],'Northwest':[0.0,0.0,1.0,0.0],'Northeast':[0.0,0.0,0.0,1.0]} #Maps region to its encoded value using dictionary
region=input("Region: ") #Takes Region as input (Accepted values - Southwest, Southeast, Northwest, Northeast)
l.extend(dict_region[region]) #Appends the encoded value of region to the input list
sex=input("Sex: ") #Takes Sex as input (Accepted values - Female, Male)
l.extend(dict_sex[sex]) #Appends the encoded value of sex to the input list
age=float(input("Age: ")) #Takes Age as input (Accepted values - Valid age)
l.append(age) #Appends age to the input list
bmi=float(input("BMI: ")) #Takes BMI as input (Accepted values - Valid BMI value)
l.append(bmi) #Appends bmi to the input list
child=float(input("Children: ")) #Takes Children as input (Accepted values - Whole numbers)
l.append(child) #Appends number of children to the input list
smoker=input("Smoker: ") #Takes Smoker as input (Accepted values - Yes, No)
if smoker=="Yes": #Checks if smoker value is Yes
    l.append(1.0) #Appends 1.0 if Yes
else: #If smoker is not Yes i.e. value is No
    l.append(0.0) #Appends 0.0 if No
print(ann.predict([l])) #Predict insurance value for input list

Region: Northwest
Sex: Female
Age: 45
BMI: 22.1
Children: 0
Smoker: No
[[9406.67]]
