In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings("ignore")

## Problem Statement

## Data Gathering 

In [None]:
df = pd.read_csv("medical_insurance.csv")
df

## Exploratory Data Analysis

In [None]:
df.shape

#### 1. Missing Values 

In [None]:
df.isna().sum()    

In [None]:
df.info()

### 1.Sex

In [None]:
df["sex"].unique()

In [None]:
df["sex"].nunique()

In [None]:
df["sex"].value_counts()

In [None]:
# Encode the binary "sex" column as integers. The mapping is kept in
# `sex_dict` so the same encoding can be reused at prediction time.
sex_dict = {"male":0,"female":1}
# Assign the result back rather than calling replace(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update `df` (it does not under pandas copy-on-write).
# replace() leaves already-encoded values alone, so re-running the cell is safe;
# astype(int) fails loudly if an unmapped label is still present.
df["sex"] = df["sex"].replace(sex_dict).astype(int)
sex_dict

### smoker

In [None]:
df["smoker"].value_counts()

In [None]:
# Encode the binary "smoker" column as integers. The mapping is kept in
# `smoker_dict` so the same encoding can be reused at prediction time.
smoker_dict = {"no":0,"yes":1}
# Assign the result back rather than calling replace(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update `df` (it does not under pandas copy-on-write).
# replace() leaves already-encoded values alone, so re-running the cell is safe;
# astype(int) fails loudly if an unmapped label is still present.
df["smoker"] = df["smoker"].replace(smoker_dict).astype(int)
smoker_dict

### region

In [None]:
# Frequency of each "region" category. This has to be the final expression of
# the cell: a notebook renders only the last expression, so following it with
# a bare `df` would discard the counts.
df["region"].value_counts()

In [None]:
# One-hot encode "region" into region_<name> indicator columns.
# dtype=int keeps the indicators as 0/1 integers regardless of pandas version
# (the default indicator dtype became bool in pandas 2.0).
# NOTE: this cell consumes the "region" column, so it cannot be re-run without
# reloading `df` first.
df = pd.get_dummies(df,columns=["region"],dtype=int)
df

In [None]:
df.isna().mean()*100

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the
# observations themselves. pairplot expects the data; passing df.corr() would
# plot the correlation coefficients against each other instead.
sns.pairplot(df)

In [None]:
df.boxplot()

In [None]:
df

#### Split train and test data 

In [None]:
# Target: the "charges" column. Features: every remaining column of `df`.
y = df["charges"]
x = df.drop(columns=["charges"])

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
model = LinearRegression()
model

In [None]:
model.fit(x_train,y_train)

### Model Evaluation

In [None]:
y_pred = model.predict(x_test)
y_pred[:5]

In [None]:
y_test[:5]

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(x_test)

# Compute every metric first, then report them in a fixed order.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R Squared", r2),
):
    print(label, ":", value)

In [None]:
# Score the fitted model on the training split, to compare against the test
# scores and spot over- or under-fitting.
y_pred_train = model.predict(x_train)

# The *_train names keep these values separate from the test-set metrics
# (mse / rmse / mae / r2); reusing those names would silently overwrite them.
mse_train = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse_train)

rmse_train = np.sqrt(mse_train)
print("RMSE :",rmse_train)

mae_train = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae_train)

r2_train = r2_score(y_train, y_pred_train)
print("R Squared :",r2_train)

#### Predict charges on the basis of a single row

In [None]:
print(sex_dict)
# print(region_dict)
print(smoker_dict)

In [None]:
column_name = x.columns
len(column_name)

In [None]:
column_name

In [None]:
x.head(1).T

In [None]:
# Raw (un-encoded) values for one example person, used to demonstrate a
# single-row prediction.
age, bmi, children = 19, 27.9, 0
sex, smoker = "male", "no"
region = "southwest"
 

In [None]:
# Bundle what is needed to encode a raw input row later on: the two category
# encodings and the feature column order of `x`.
data = {
    "sex_dict": sex_dict,
    "smoker_dict": smoker_dict,
    "column_name": column_name.tolist(),
}
data

In [None]:
# Turn the raw region value into its one-hot column name
# (e.g. "southwest" -> "region_southwest").
# The guard keeps this cell idempotent: re-running it must not stack the prefix
# into "region_region_...", which would match no feature column.
if not region.startswith("region_"):
    region = "region_" + region
region

In [None]:
# Position of the selected region's indicator column within the feature columns.
# Index.get_loc raises a KeyError that names the missing column when the region
# is unknown, instead of the bare IndexError from np.where(...)[0][0].
region_index = column_name.get_loc(region)
region_index

In [None]:
# Empty feature vector with one slot per column of `x`.
# It must be a float array: with an integer dtype, assigning a fractional
# feature such as bmi = 27.9 would silently truncate it to 27.
array = np.zeros(x.shape[1],dtype=float)
array

In [None]:
# Fill the feature vector for the example person.
# Positions 0-3 assume the first four feature columns are age, sex, bmi and
# children -- confirm against the x.head(1).T output.
array[0] = age
array[1] = data["sex_dict"][sex]
array[2] = bmi  # NOTE: loses its fractional part if `array` has an integer dtype
array[3] = children
# The smoker flag has to be written as well; left at zero, every prediction
# would be made for a non-smoker regardless of the `smoker` input. The slot is
# looked up by name so no column position is assumed.
array[column_name.get_loc("smoker")] = data["smoker_dict"][smoker]
array[region_index] = 1


In [None]:
# Wrap the vector in a one-row DataFrame that carries the feature column names,
# so the model receives features labelled the same way as the frame it was
# fitted on (a bare list has no names to check against).
input_row = pd.DataFrame([array], columns=column_name)
medical_charges = model.predict(input_row)[0]
medical_charges

In [None]:
import json

# Write the `data` dictionary to data.json in the working directory.
with open("data.json", "w") as json_file:
    json.dump(data, json_file)

In [None]:
import pickle

# Serialize the fitted `model` object to Medical_model.pkl in the working directory.
with open("Medical_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)