In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Feature Selection
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Model Building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from scipy.stats import zscore

# 1. Data Gathering

In [2]:
# Path is relative to the notebook's working directory; keep it in one named
# constant so it is easy to repoint when the CSV lives elsewhere.
DATA_PATH = "medical_insurance.csv"

df = pd.read_csv(DATA_PATH)
# Preview only the first rows instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'medical_insurance.csv'

# 2. Exploratory Data Analysis and Feature Engineering

## 1. Age

In [None]:
df["age"]

In [None]:
df["age"].info()

In [None]:
df["age"].value_counts()

### Detecting Outliers

In [None]:
# Pass the vector explicitly as `x`: a bare positional argument is interpreted
# as `x` by older seaborn (with a FutureWarning) and as `data` by seaborn >= 0.12,
# so the keyword keeps the plot orientation the same across versions.
sns.boxplot(x=df["age"]);

In [None]:
# No outliers.

In [None]:
df.info()

## 2. sex

In [None]:
df["sex"].value_counts()

In [None]:
# Encode sex as 1 (male) / 0 (female).
# Assign the result back to the column: `df["sex"].replace(..., inplace=True)`
# mutates a temporary Series and does not update `df` under pandas Copy-on-Write.
# `map` turns any value outside the mapping into NaN, which the isna() check in
# the next cell will surface. Re-load the CSV before re-running this cell, since
# mapping already-encoded values would produce NaN.
df["sex"] = df["sex"].map({"male": 1, "female": 0})
df.head()

In [None]:
df["sex"].isna().sum()

In [None]:
df.info()

## 3. bmi

In [None]:
df["bmi"].isna().sum()

In [None]:
# No null values

In [None]:
# Explicit `x=` keeps the orientation identical across seaborn versions
# (a bare positional vector means `x` in old releases and `data` in >= 0.12).
sns.boxplot(x=df["bmi"]);

In [None]:
# Outliers Detected
# Outliers are only in the upper tail
# Getting the outliers

In [None]:
# Quartiles of the raw bmi column and the 1.5 * IQR fences derived from them.
q1, q2, q3 = (df['bmi'].quantile(p) for p in (0.25, 0.50, 0.75))
median = df['bmi'].median()

iqr = q3 - q1

# Values above upper_tail / below lower_tail are treated as outliers.
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

for label, value in (
    ("Q1 :", q1),
    ("Q2 :", q2),
    ("Q3 :", q3),
    ("Median :", median),
    ("upper_tail :", upper_tail),
    ("lower_tail :", lower_tail),
):
    print(label, value)

In [None]:
# bmi values lying above the upper IQR fence (row mask + column in one .loc).
df.loc[df["bmi"] > upper_tail, "bmi"]

In [None]:
# Handling the outliers by transformation

In [None]:
# Log transformation

In [None]:
# Boxplot of log(bmi); `x=` is explicit so the plot is oriented the same way
# regardless of how the installed seaborn treats a positional vector.
sns.boxplot(x=np.log(df["bmi"]));

In [None]:
# Sqrt transformation

In [None]:
# Boxplot of sqrt(bmi); `x=` is explicit so the plot is oriented the same way
# regardless of how the installed seaborn treats a positional vector.
sns.boxplot(x=np.sqrt(df["bmi"]));

In [None]:
# Cube root transformation

In [None]:
# Boxplot of cbrt(bmi); `x=` is explicit so the plot is oriented the same way
# regardless of how the installed seaborn treats a positional vector.
sns.boxplot(x=np.cbrt(df["bmi"]));

In [None]:
# reciprocal

In [None]:
# Boxplot of 1 / bmi; `x=` is explicit so the plot is oriented the same way
# regardless of how the installed seaborn treats a positional vector.
sns.boxplot(x=1/(df["bmi"]));

In [None]:
# standardization

In [None]:
# Standardize bmi with scipy.stats.zscore so its boxplot can be compared with
# the other candidate transforms. The result is kept in a separate variable;
# df itself is not modified here.
z_score_values = zscore(df['bmi'])
z_score_values

In [None]:
# Boxplot of the standardized bmi values; `x=` is explicit so the plot is
# oriented the same way across seaborn versions.
sns.boxplot(x=z_score_values);

In [None]:
# Normalization

In [None]:
min_value = np.min(df['bmi'])
max_value = np.max(df['bmi'])

In [None]:
# Min-max scale bmi into [0, 1] using min_value / max_value from the previous
# cell. Vectorized over the whole column instead of a Python-level loop; the
# result is still a plain list so the boxplot cell below works unchanged.
normalized_list = ((df['bmi'] - min_value) / (max_value - min_value)).tolist()

# Show a short preview rather than the full list.
normalized_list[:10]

In [None]:
# Boxplot of the min-max normalized bmi values; `x=` is explicit so the plot
# is oriented the same way across seaborn versions.
sns.boxplot(x=normalized_list);

In [None]:
# The sqrt and cbrt transformations leave the fewest outliers.
# We will find which of the two leaves fewer.

In [None]:
# 1. Sqrt

In [None]:
# Quartiles and 1.5 * IQR fences on the square-root scale of bmi.
sqrt_bmi = np.sqrt(df['bmi'])

q1, q2, q3 = (sqrt_bmi.quantile(p) for p in (0.25, 0.50, 0.75))
median = sqrt_bmi.median()

iqr = q3 - q1

# Fences are on the sqrt scale; later cells compare sqrt(bmi) against them.
lower_tail = q1 - 1.5 * iqr
upper_tail = q3 + 1.5 * iqr

for label, value in (
    ("Q1 :", q1),
    ("Q2 :", q2),
    ("Q3 :", q3),
    ("Median :", median),
    ("upper_tail :", upper_tail),
    ("lower_tail :", lower_tail),
):
    print(label, value)

In [None]:
# sqrt(bmi) values that still lie above the upper fence.
sqrt_bmi = np.sqrt(df['bmi'])
sqrt_bmi.loc[sqrt_bmi > upper_tail]

In [None]:
# So we will apply the sqrt transformation
# 

In [None]:
# Overwrite bmi with its square root. This cell is not idempotent: running it
# again takes the square root of the already-transformed column.
df["bmi"] = np.sqrt(df['bmi'])
df

In [None]:
# Now the remaining 3 outliers will be replaced by the mean.

In [None]:
# Replace values above the upper fence with the column mean.
# Use a single df.loc[rows, col] assignment: the chained form
# `df['bmi'].loc[mask] = ...` writes to an intermediate Series and may leave
# `df` unchanged (SettingWithCopy / Copy-on-Write).
# `upper_tail` is the fence computed on the sqrt scale, matching the
# sqrt-transformed bmi column. The mean is taken before the replacement,
# i.e. it still includes the outliers.
df.loc[df['bmi'] > upper_tail, 'bmi'] = df['bmi'].mean()

In [None]:
# Now checking for Outliers

In [None]:
# Re-check bmi after the transform and replacement; `x=` is explicit so the
# plot is oriented the same way across seaborn versions.
sns.boxplot(x=df["bmi"]);

In [None]:
# Only one outlier remains, which is not significant compared to the size of the data

In [None]:
df.info()

## 4. children

In [None]:
df["children"].value_counts()

In [None]:
df["children"].isna().sum()

## 5. smoker

In [None]:
df["smoker"].value_counts()

In [None]:
df["smoker"].isna().sum()

In [None]:
# Encode smoker as 1 (yes) / 0 (no).
# Assign the result back to the column: `df["smoker"].replace(..., inplace=True)`
# mutates a temporary Series and does not update `df` under pandas Copy-on-Write.
# `map` turns any value outside the mapping into NaN. Re-load the CSV before
# re-running this cell, since mapping already-encoded values would produce NaN.
df["smoker"] = df["smoker"].map({"yes": 1 , "no" : 0})
df.head()

In [None]:
df.info()

## 6. region

In [None]:
df["region"].value_counts()

In [None]:
# One-hot encode region into region_<name> columns.
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 defaults to bool, which
# makes df.to_numpy() an object array and breaks the numeric VIF computation
# later in the notebook.
df = pd.get_dummies(df, columns=['region'], dtype=int)
df.head()

In [None]:
df.info()

## Checking the Target column

## Charges 

In [None]:
df["charges"].isna().sum()

In [None]:
# Now the whole dataset is numeric and ready for linear algorithms

# 3. Feature Selection

## Checking the assumptions

### 1. Linearity 

In [None]:
corr = df.corr()
corr

In [None]:
sns.heatmap(corr[["charges"]], annot = True)

In [None]:
# Conclusion

### 2. No Multicollinearity

In [None]:
# Variance inflation factor per predictor.
# VIF describes collinearity among the independent variables, so the target
# column `charges` is left out of the design matrix. The cast to float
# guarantees a purely numeric array for statsmodels.
# NOTE(review): no intercept column is added, so these are computed on the raw
# feature matrix; confirm that is the intended variant of the check.
vif_features = df.drop(columns="charges").astype(float)
vif_matrix = vif_features.to_numpy()

vif_list = [
    variance_inflation_factor(vif_matrix, i)
    for i in range(vif_matrix.shape[1])
]

s1 = pd.Series(vif_list, index=vif_features.columns)
s1.sort_values().plot(kind = 'barh')

In [None]:
# VIF values are less than 5 for every feature,
# so none of the features shows problematic multicollinearity

# 4 Model Building

In [None]:
# Splitting the data

In [None]:
x = df.drop("charges", axis = 1) 
x

In [None]:
y = df["charges"]
y

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.2 ,random_state=48)
x_train


In [None]:
x_test

In [None]:
display(y_train,y_test)

In [None]:
Lin_model = LinearRegression()

In [None]:
Lin_model.fit(x_train,y_train)

# Model Evaluation

In [None]:
# Evaluation on the held-out test split

y_pred = Lin_model.predict(x_test)

n_rows, n_features = x_test.shape

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r_squared_value = r2_score(y_test, y_pred)
# Same R^2, obtained straight from the estimator.
r2 = Lin_model.score(x_test, y_test)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
adj_r2 = 1 - (((1 - r_squared_value) * (n_rows - 1)) / (n_rows - n_features - 1))

for label, value in (
    ("Mean Sqaured Error :", mse),
    ("Root Mean Sqaured Error :", rmse),
    ("Mean Absolute Error :", mae),
    ("R Squared Value :", r_squared_value),
    ("Direct R Squared Value :", r2),
    ("Adjusted R Squared Value :", adj_r2),
):
    print(label, value)

In [None]:
# Evaluation on the training split (for comparison with the test metrics)

y_pred_train = Lin_model.predict(x_train)

n_rows, n_features = x_train.shape

mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_pred_train)
r_squared_value = r2_score(y_train, y_pred_train)
# Same R^2, obtained straight from the estimator.
r2 = Lin_model.score(x_train, y_train)
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
adj_r2 = 1 - (((1 - r_squared_value) * (n_rows - 1)) / (n_rows - n_features - 1))

for label, value in (
    ("Mean Sqaured Error :", mse),
    ("Root Mean Sqaured Error :", rmse),
    ("Mean Absolute Error :", mae),
    ("R Squared Value :", r_squared_value),
    ("Direct R Squared Value :", r2),
    ("Adjusted R Squared Value :", adj_r2),
):
    print(label, value)

In [None]:
# R2Test = 0.7464426208244624
# R2Train = 0.7507459874931361

# variance = 0.7507459874931361 - 0.7464426208244624 
# variance = 0.004303366668673703

In [None]:
# Train and test R2 are nearly equal (no overfitting), but both are only ~0.75,
# so the model has underfitted

In [None]:
import pickle

# Serialize the fitted LinearRegression to linear_medical.pkl in the current
# working directory (binary mode, file closed by the context manager).
with open("linear_medical.pkl", "wb") as f:
    pickle.dump(Lin_model, f)


In [None]:
x.columns

In [None]:
json_data = {"sex" : {"male": 1, "female": 0},
             "smoker" : {"yes": 1 , "no" : 0},
             "columns" : list(x.columns)}
json_data

In [None]:
# Raw, human-readable inputs for a single prediction.
age = 45
sex = "male"
bmi = 29
children = 0.0
smoker = "no"
region = "northwest"

# The one-hot columns are named region_<name>; find the position of the
# matching column. Index.get_loc raises a KeyError naming the missing column
# when the region is unknown, instead of an opaque IndexError from np.where.
region = "region_" + region
region_index = x.columns.get_loc(region)
region_index

In [None]:
test_array = np.zeros(len(x.columns))
test_array

In [None]:
# Fill the feature vector by column name rather than by hard-coded position,
# so it stays correct if the column order of `x` ever changes.
col_pos = {name: pos for pos, name in enumerate(x.columns)}

test_array[col_pos["age"]] = age
test_array[col_pos["sex"]] = json_data['sex'][sex]
# The model was trained on sqrt-transformed bmi, so apply the same transform.
test_array[col_pos["bmi"]] = np.sqrt(bmi)
test_array[col_pos["children"]] = children
test_array[col_pos["smoker"]] = json_data['smoker'][smoker]
# One-hot flag for the selected region; the other region columns stay 0.
test_array[region_index] = 1

test_array

In [None]:
# The model was fitted on a DataFrame, so predict on a one-row DataFrame with
# the same column names; passing a bare list drops the feature names and makes
# scikit-learn warn about the mismatch.
input_frame = pd.DataFrame([test_array], columns=x.columns)
charges = round(Lin_model.predict(input_frame)[0],2)
print("Predicted Medical Insurance Charges is :", charges, "/- Rs. Only")

In [None]:
# Reload the model from the file written by the pickle.dump cell above so that
# `model` is defined for the prediction below on a fresh kernel. The file name
# must match the one used when saving ("linear_medical.pkl").
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("linear_medical.pkl", "rb") as f:
    model = pickle.load(f)


In [None]:
model.predict([test_array])

In [None]:
# Leftover bare `pip` call removed: it names no package to install and is not
# part of the analysis.