In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import data set into a dataframe
insurance_df = pd.read_csv('insurance.csv')
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Explore the data set
# Check for categorical data and Null values
print(insurance_df.info())

# Check for duplicates and remove them
duplicated_rows = insurance_df[insurance_df.duplicated()]
print(duplicated_rows)

display(insurance_df.drop_duplicates())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
     age   sex    bmi  children smoker     region    charges
581   19  male  30.59         0     no  northwest  1639.5631


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# Check the statistics of the data set
insurance_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# Identify categorical variables
print(insurance_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
# Encode categorical data
inputs = insurance_df.drop(['charges'], axis='columns')
inputs['sex_n'] = le.fit_transform(inputs['sex'])
inputs['smoker_n'] = le.fit_transform(inputs['smoker'])
inputs['region_n'] = le.fit_transform(inputs['region'])
inputs.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,sex_n,smoker_n,region_n
0,19,female,27.9,0,yes,southwest,0,1,3
1,18,male,33.77,1,no,southeast,1,0,2
2,28,male,33.0,3,no,southeast,1,0,2
3,33,male,22.705,0,no,northwest,1,0,1
4,32,male,28.88,0,no,northwest,1,0,1


In [8]:
# Define the features
inputs_n = inputs.drop(['sex', 'smoker', 'region'], axis='columns')
display(inputs_n)
# Define the target
target = insurance_df.charges

Unnamed: 0,age,bmi,children,sex_n,smoker_n,region_n
0,19,27.900,0,0,1,3
1,18,33.770,1,1,0,2
2,28,33.000,3,1,0,2
3,33,22.705,0,1,0,1
4,32,28.880,0,1,0,1
...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1
1334,18,31.920,0,0,0,0
1335,18,36.850,0,0,0,2
1336,21,25.800,0,0,0,3


In [9]:
# Train- and Test- Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs_n, target, test_size = 0.3, random_state = 42)
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (936, 6) (936,)
Test set: (402, 6) (402,)


In [10]:
# Train the model using Linear Regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

# Test the model using the test data set
y_pred = model.predict(X_test)

# Score the model
result = model.score(X_test, y_test)
print('The score of the model is', result)

The score of the model is 0.7694415927057692


In [11]:
# Train the model using Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rfmodel = RandomForestRegressor(random_state = 42)
rfmodel.fit(X_train, y_train)

# Test the model using the test data set
y_pred2 = rfmodel.predict(X_test)

# Score the model
result2 = rfmodel.score(X_test, y_test)
print('The score of the model is', result2)

from sklearn.metrics import r2_score
r2result = r2_score(y_test, y_pred2)
print('The r2 Score of the model is', r2result)

The score of the model is 0.8526342040289154
The r2 Score of the model is 0.8526342040289154


In [12]:
# Save the model using pickle
import pickle
filename = 'project_medical_insurance_prediction_model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [13]:
# Load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
new_X_test = np.array([[19., 27.9, 0., 0., 1., 3.]])
result = loaded_model.predict(new_X_test)
print(result)

[24860.7122057]


