In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import pickle
import json

In [14]:
Df = pd.read_csv("medical_insurance.csv")
Df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [15]:
Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [16]:
Df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [17]:
Df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [20]:
Df = pd.get_dummies(Df, columns=['region'])
Df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,female,27.900,0,yes,16884.92400,0,0,0,1
1,18,male,33.770,1,no,1725.55230,0,0,1,0
2,28,male,33.000,3,no,4449.46200,0,0,1,0
3,33,male,22.705,0,no,21984.47061,0,1,0,0
4,32,male,28.880,0,no,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830,0,1,0,0
1334,18,female,31.920,0,no,2205.98080,1,0,0,0
1335,18,female,36.850,0,no,1629.83350,0,0,1,0
1336,21,female,25.800,0,no,2007.94500,0,0,0,1


In [22]:
Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   object 
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   object 
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(2), object(2), uint8(4)
memory usage: 68.1+ KB


In [24]:
Df['sex'].replace({'female':0,'male':1}, inplace=True)
Df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,yes,16884.92400,0,0,0,1
1,18,1,33.770,1,no,1725.55230,0,0,1,0
2,28,1,33.000,3,no,4449.46200,0,0,1,0
3,33,1,22.705,0,no,21984.47061,0,1,0,0
4,32,1,28.880,0,no,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,10600.54830,0,1,0,0
1334,18,0,31.920,0,no,2205.98080,1,0,0,0
1335,18,0,36.850,0,no,1629.83350,0,0,1,0
1336,21,0,25.800,0,no,2007.94500,0,0,0,1


In [25]:
Df['smoker'].replace({'no':0,'yes':1}, inplace=True)
Df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [26]:
Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   uint8  
 7   region_northwest  1338 non-null   uint8  
 8   region_southeast  1338 non-null   uint8  
 9   region_southwest  1338 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 68.1 KB


In [27]:
X = Df.drop('charges', axis=1)
Y = Df['charges']

In [28]:
X_Train, X_test, Y_Train, Y_Test = train_test_split(X,Y,test_size=0.3,random_state=2)

In [29]:
X_Train

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
355,46,1,27.600,0,0,0,0,0,1
477,25,1,35.625,0,0,0,1,0,0
1156,19,1,44.880,0,1,0,0,1,0
663,18,1,33.660,0,0,0,0,1,0
1216,40,1,25.080,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
466,60,0,28.700,1,0,0,0,0,1
299,48,0,28.880,1,0,0,1,0,0
493,61,1,43.400,0,0,0,0,0,1
527,51,0,25.800,1,0,0,0,0,1


In [30]:
Model = LinearRegression()
Model.fit(X_Train, Y_Train)

In [31]:
Y_Pred = Model.predict(X_test)

MSE = mean_squared_error(Y_Test,Y_Pred)
print("THE MEAN SQUARED ERROR IS:",MSE)

RMSE = np.sqrt(MSE)
print("THE ROOT MEAN SQUARED ERROR IS:",RMSE)

MAE = mean_absolute_error(Y_Test,Y_Pred)
print("THE MEAN ABSOLUTE ERROR IS:",MAE)

R2_SCORE = r2_score(Y_Test, Y_Pred)
print("THE R2 SCORE IS:",R2_SCORE)

THE MEAN SQUARED ERROR IS: 38108732.48980024
THE ROOT MEAN SQUARED ERROR IS: 6173.227072593413
THE MEAN ABSOLUTE ERROR IS: 4292.580187720697
THE R2 SCORE IS: 0.7642348545269876


In [32]:
Y_Pred_Train = Model.predict(X_Train)

MSE = mean_squared_error(Y_Train, Y_Pred_Train)
print("THE MEAN SQUARED ERROR IS:",MSE)

RMSE = np.sqrt(MSE)
print("THE ROOT MEAN SQUARED ERROR IS:",RMSE)

MAE = mean_absolute_error(Y_Train, Y_Pred_Train)
print("THE MEAN ABSOLUTE ERROR IS:",MAE)

R2_SCORE = r2_score(Y_Train, Y_Pred_Train)
print("THE R2 SCORE IS:",R2_SCORE)

THE MEAN SQUARED ERROR IS: 36021638.309960455
THE ROOT MEAN SQUARED ERROR IS: 6001.802921619507
THE MEAN ABSOLUTE ERROR IS: 4186.525053236722
THE R2 SCORE IS: 0.7428108911901881


In [33]:
X.head(1).T

Unnamed: 0,0
age,19.0
sex,0.0
bmi,27.9
children,0.0
smoker,1.0
region_northeast,0.0
region_northwest,0.0
region_southeast,0.0
region_southwest,1.0


In [34]:
Age = 19
Sex = 'female'
Bmi = 27.9
Children = 0
Smoker = 'Yes'
Region = 'Southeast'

In [35]:
Test_Array = np.array([Age,0.0,Bmi,Children,0.0,0.0,0.0,1.0,0.0], ndmin=2)
print(Test_Array)

[[19.   0.  27.9  0.   0.   0.   0.   1.   0. ]]


In [36]:
print(Model.predict(Test_Array))

[1852.30317532]




In [37]:
print(X.columns)
print("\nThe Length of the columns list is:",len(X.columns))

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

The Length of the columns list is: 9


In [38]:
DATA = {'sex': {'female':0, 'male':1},
        'Smoker':{'no':0, 'Yes':1},
        'columns': list(X.columns)}

DATA

{'sex': {'female': 0, 'male': 1},
 'Smoker': {'no': 0, 'Yes': 1},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [39]:
Input_Entry = np.zeros(9)
Input_Entry[7] = 1
Input_Entry

array([0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [40]:
Columns_Names = X.columns
Columns_Names

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [41]:
list(Columns_Names).index('region_southeast')

7

In [42]:
New_Region = 'region_' + Region
Region_Index = np.where(Columns_Names == New_Region)
Region_Index

(array([], dtype=int64),)

In [43]:
Test_Array = np.zeros(X.shape[1])
Test_Array[0] = Age
Test_Array[1] = DATA['sex'][Sex]
Test_Array[2] = Bmi
Test_Array[3] = Children
Test_Array[4] = DATA['Smoker'][Smoker]
Test_Array[Region_Index] = 1

Test_Array

array([19. ,  0. , 27.9,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ])

In [44]:
Result = np.around(Model.predict([Test_Array]), 2)
print(f"Medical Insurance Charges are:{Result}")

Medical Insurance Charges are:[25295.89]




In [45]:
with open('The_Linear_Model.pkl', 'wb') as f:
    pickle.dump(Model, f)

In [46]:
with open('Project_Data.json', 'w') as f:
    json.dump(DATA, f)