In [64]:
import pandas as pd
import numpy as np

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings
warnings.filterwarnings("ignore")

# Store
import pickle 
import json

### Problem Statement

### Data Gathering

In [2]:
# CSV Files
# Excel File

df = pd.read_csv("medical_insurance.csv")
df

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### Data Understanding

In [65]:
# No of Columns
df.shape[1] 

# No of Rows
df.shape[0] 

# No. of Missing Values
df.isna().sum().sum() 

# No. of Duplicate Rows
df.duplicated().sum()

# No. of Duplicate Columns
df.T.duplicated().sum()

# How Data Looks
df.sample(5)

# info of Data
df.info()

# Mathematical Info of Data
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1337 non-null   int64  
 1   gender            1337 non-null   int64  
 2   bmi               1337 non-null   float64
 3   children          1337 non-null   int64  
 4   smoker            1337 non-null   int64  
 5   charges           1337 non-null   float64
 6   region_northeast  1337 non-null   uint8  
 7   region_northwest  1337 non-null   uint8  
 8   region_southeast  1337 non-null   uint8  
 9   region_southwest  1337 non-null   uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 110.6 KB


Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
count,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0,1337.0
mean,39.222139,0.504862,30.663452,1.095737,0.204936,13279.121487,0.242334,0.242334,0.272251,0.243082
std,14.044333,0.500163,6.100468,1.205571,0.403806,12110.359656,0.428655,0.428655,0.445285,0.429104
min,18.0,0.0,15.96,0.0,0.0,1121.8739,0.0,0.0,0.0,0.0
25%,27.0,0.0,26.29,0.0,0.0,4746.344,0.0,0.0,0.0,0.0
50%,39.0,1.0,30.4,1.0,0.0,9386.1613,0.0,0.0,0.0,0.0
75%,51.0,1.0,34.7,2.0,0.0,16657.71745,0.0,0.0,1.0,0.0
max,64.0,1.0,53.13,5.0,1.0,63770.42801,1.0,1.0,1.0,1.0


In [66]:
df.sample(2)

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
367,42,0,24.985,2,0,8017.06115,0,1,0,0
109,63,1,35.09,0,1,47055.5321,0,0,1,0


### EDA

#### Categorical Col

In [5]:
# sns.countplot(x=df["smoker"])
# sns.countplot(x=df["gender"])
# sns.countplot(x = df["children"])
df["children"].value_counts()
# df["children"].value_counts().plot(kind = "pie")
# sns.countplot(x=df["region"])
df["region"].value_counts()
df["smoker"].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

#### Numeric Col

In [6]:
# plt.hist(df["age"])
# plt.hist(df["charges"])
# plt.hist(df["bmi"])
# sns.distplot(df["bmi"])
# sns.boxplot(df["bmi"])
# sns.boxplot(df["charges"])

#### Analysis of 2 Col

In [7]:
# sns.scatterplot(x = df["bmi"], y = df["charges"])
# sns.barplot(x = df["gender"],y = df["charges"])
# sns.barplot(x = df["children"],y = df["charges"])

# sns.barplot(x = df["smoker"],y = df["charges"])    # Important Analysis 

### Feature Enggineering

In [8]:
# Drop Duplicates Rows
df.drop_duplicates(inplace=True)

In [9]:
# Remove Misssing VAlue
df.dropna(inplace=True)

In [10]:
# Remove Misssing VAlue
df.dropna(inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   gender    1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


#### Label Encoding

In [12]:
df.head(2)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [13]:
# Gender Column

df["gender"].value_counts()
df["gender"].replace({"male":1,"female":0},inplace = True)

In [14]:
# Smoker Column

df["smoker"].value_counts()
df["smoker"].replace({"yes":1,"no":0},inplace = True)

In [15]:
df.sample(2)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
1240,52,1,41.8,2,1,southeast,47269.854
303,28,0,33.0,2,0,southeast,4349.462


#### One Hot Encoding

In [37]:
# Region Column

df = pd.get_dummies(df,columns = ["region"])

In [38]:
# Final Data Frame
df.sample(5)

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
213,34,0,26.73,1,0,5002.7827,0,0,1,0
246,60,0,38.06,0,0,12648.7034,0,0,1,0
628,58,1,38.0,0,0,11365.952,0,0,0,1
916,43,0,26.885,0,1,21774.32215,0,1,0,0
1025,21,0,34.6,0,0,2020.177,0,0,0,1


### Feature Selection

#### Linearity

#### 1. Coefficient of Correlaion (R)

In [39]:
# Correlation Between Inputs & Output Columns
# R must be (0.7 to 1.0) & (-0.7 to -1.0)

df.corr().iloc[[5],:]
# Here Smoker has strong Correlation
# Means + correlation
# sns.pairplot(df)
# sns.heatmap(df.corr()[["charges"]],annot = True)

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
charges,0.298308,0.058044,0.198401,0.067389,0.787234,1.0,0.005945,-0.038695,0.073578,-0.043637


#### No Multicolinearity

In [40]:
X = df.drop("charges",axis=1)
X.head(2)

Unnamed: 0,age,gender,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.77,1,0,0,0,1,0


In [22]:
# VIF >> Varieance Influation Factors

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [41]:
vif = []

for i in range(X.shape[1]):
    vif.append(variance_inflation_factor(X,i))
# vif

#### Final Data

In [42]:
df.sample(2)

Unnamed: 0,age,gender,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
1252,20,1,27.3,0,1,16232.847,0,0,0,1
490,19,0,32.9,0,0,1748.774,0,0,0,1


### Split Data

In [43]:
# Input Columns
x = df.drop("charges",axis = 1)

# Output Column
y = df["charges"]

In [29]:
from sklearn.model_selection import train_test_split

In [45]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=4)

In [44]:
x_train.shape
y_train.shape

(1069,)

### Model Training

In [46]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#### Train Model on Training Data

In [47]:
lr.fit(x_train,y_train)

#### Check Predictios on hidden / Testing data

In [48]:
y_pred = lr.predict(x_test)

In [None]:
x_test[5:7]

In [None]:
# For this 2 input model pred. 
y_pred[5:7]

In [None]:
# Actual Result is
y_test[5:7]

In [None]:
# For Input 270 >>

# Ya = 1719 Rs
# Yp = 1577 Rs

### Model Evaluation

In [50]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [51]:
# Training dataset

y_pred_train = lr.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :", rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2 = r2_score(y_train, y_pred_train)
print("R Squared :", r2)

MSE : 37130688.144804604
RMSE : 6093.495560415598
MAE : 4250.192416235062
R Squared : 0.754837987179542


In [52]:
# Testing Dataset

y_pred = lr.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :", rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2 = r2_score(y_test, y_pred)
print("R Squared :", r2)

MSE : 34847537.912482776
RMSE : 5903.180321867423
MAE : 4126.465266241459
R Squared : 0.7225097139160521


### Single Row Testing

In [None]:
# No. of Input Columns
x.shape[1]

In [None]:
# Input Columns
x.columns

In [None]:
x.head(2)

In [53]:
#  what user will provide inputs

age = 40
gender = "male"
bmi = 22.5
children = 2
smoker = "no"
region = "southeast"

# charges = ?

In [54]:
gender_data = {"male":1,"female":0}
smoker_data = {'yes': 1, 'no': 0}

gender = gender_data[gender]
smoker = smoker_data[smoker]

print(gender)
print(smoker)

1
0


In [55]:
project_data = {"gender":gender_data , "smoker":smoker_data, "column":list(x.columns)}
project_data

{'gender': {'male': 1, 'female': 0},
 'smoker': {'yes': 1, 'no': 0},
 'column': ['age',
  'gender',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [57]:
region1 = "region_" + region

In [None]:
region1

In [58]:
list(x.columns)
region_index = list(x.columns).index(region1)
region_index

7

In [None]:
x.columns

In [59]:
test_array = np.zeros([1,x.shape[1]])

test_array[0][0] = age
test_array[0][1] = gender
test_array[0][2] = bmi
test_array[0][3] = children
test_array[0][4] = smoker
test_array[0,region_index] = 1

test_array

array([[40. ,  1. , 22.5,  2. ,  0. ,  0. ,  0. ,  1. ,  0. ]])

In [60]:
lr.predict(test_array)[0].round(2)

5537.69

### Load Model 

In [61]:
with open("model.pkl","wb") as A:
    pickle.dump(lr,A)

### Load Data

In [62]:
with open("Project_data.json","w") as B:
    json.dump(project_data,B)

### VS Code 

In [63]:

# import pickle
# import json
# import pandas as pd 
# import numpy as np
# import config 

class MedicalInsurance():
    def __init__(self,age,gender,bmi,children,smoker,region):
        self.age=age
        self.gender=gender
        self.bmi=bmi
        self.children=children
        self.smoker=smoker
        self.region="region_"+ region

    def load_model(self):
        with open("model.pkl","rb") as f:
            self.model=pickle.load(f)

        with open("project_data.json","r") as f:
            self.json_data=json.load(f)

    def get_predicted_price(self):
        self.load_model()

        len(self.json_data["column"])
        region_index=self.json_data["column"].index(self.region)

        array = np.zeros([1,len(self.json_data["column"])])
        array[0][0] = self.age
        array[0][1] = self.json_data['gender'][self.gender]
        array[0][2] = self.bmi
        array[0][3] = self.children
        array[0][4] = self.json_data['smoker'][self.smoker]
        array[0,region_index] = 1
        
        # EX
        # A = {"gender": {"male": 1, "female": 0}}
        # A["gender"]["male"] >> 1

        print("test array - ",array)
        predicted_charges = self.model.predict(array)
        
        return np.around(predicted_charges,3)

if __name__ == "__main__":
    
    age = 29
    gender = "male"
    bmi = 22
    children = 0
    smoker = "no"
    region = "northeast"

    
    med_ins = MedicalInsurance(age,gender,bmi,children,smoker,region)
    charges = med_ins.get_predicted_price()
    print("charges for medicial insurance is --- ",charges)

test array -  [[29.  1. 22.  0.  0.  1.  0.  0.  0.]]
charges for medicial insurance is ---  [2661.095]
