# Predicting Medical Cost 📜

#### If you like my work, it would be really great if you upvoted this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

## Loading up the data

In [None]:
# Read the insurance CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../input/insurance/insurance.csv")
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics of the columns
df.describe()

In [None]:
# Encode the two binary categorical columns as integers.
# The dtype guard keeps this cell idempotent: calling `.map` again on an
# already-encoded column would match none of the keys and turn every value
# into NaN.

# male : 0, female : 1
if not pd.api.types.is_numeric_dtype(df["sex"]):
    df["sex"] = df["sex"].map({"male":0, "female":1})

# yes : 0, no : 1
if not pd.api.types.is_numeric_dtype(df["smoker"]):
    df["smoker"] = df["smoker"].map({"yes":0, "no":1})

In [None]:
# Preview the first rows again to check the current state of the frame
df.head()

In [None]:
# Having a look at the correlation matrix
# Only numeric columns are correlated: on pandas >= 2.0 `DataFrame.corr()` raises
# when the frame still contains a string column, instead of silently dropping it.

fig, ax = plt.subplots(figsize=(8,6))
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, fmt='.1g', cmap="viridis", cbar=False, ax=ax);

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(7,7))

# Pin every slice to its encoded value (0 = male, 1 = female) instead of relying
# on the frequency ordering of `value_counts()`, so labels and colours can never
# be attached to the wrong group. Assumes `sex` has already been integer-encoded.
sex_counts = df["sex"].value_counts().reindex([0, 1], fill_value=0)

ax.pie(x=sex_counts,
       colors=["skyblue","pink"],
       labels=["Male","Female"],
       shadow = True,
       autopct="%1.2f%%",
       explode = (0, 0.1)
       )
plt.show()

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(8,6))
# Pass the column as `x=`: the first positional parameter of `countplot` is
# `data` in current seaborn releases, so a bare Series is not treated as x.
sns.countplot(x=df["children"], palette="hls", ax=ax);

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(20,6))
# Pass the column as `x=`: the first positional parameter of `countplot` is
# `data` in current seaborn releases, so a bare Series is not treated as x.
sns.countplot(x=df["age"], ax=ax);

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(8,6))
# Distribution of BMI with a kernel-density overlay, drawn on the axes created above
sns.histplot(x = df["bmi"], color="purple", kde=True, ax=ax);

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(7,7))

# Pin every slice to its encoded value (1 = non-smoker, 0 = smoker) instead of
# relying on the frequency ordering of `value_counts()`, so labels and colours
# can never be attached to the wrong group. Assumes `smoker` has already been
# integer-encoded.
smoker_counts = df["smoker"].value_counts().reindex([1, 0], fill_value=0)

ax.pie(x=smoker_counts,
       colors=["royalblue","orangered"],
       labels=["Non-Smoker","Smoker"],
       shadow = True,
       autopct="%1.2f%%",
       explode = (0, 0.1)
       )
plt.show()

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(8,6))
# Pass the column as `x=`: the first positional parameter of `countplot` is
# `data` in current seaborn releases, so a bare Series is not treated as x.
sns.countplot(x=df["region"], ax=ax);

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(figsize=(8,6))
# Distribution of the target (charges) with a kernel-density overlay
sns.histplot(x = df["charges"], color="darkgreen", kde=True, ax=ax);

In [None]:
# Remove the `region` column from the feature table.
# `errors="ignore"` makes the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=["region"], errors="ignore")

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
fig, ax = plt.subplots(1, 2, figsize=(15,5))

# Left panel: charges against age; right panel: charges against BMI
sns.scatterplot(data=df, x="age", y="charges", ax=ax[0]);

sns.scatterplot(data=df, x="bmi", y="charges", ax=ax[1]);

## Splitting the data into training and test datasets
Here, we are trying to predict the insurance charges using the given data. Hence, `charges` will be the y label and the rest of the data will be the X, or input, data.

In [None]:
# X data: every column except the target
X = df.drop(columns="charges")
X.head()

In [None]:
# y data: the target column
y = df.loc[:, "charges"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows in the training and test partitions
(X_train.shape[0], X_test.shape[0])

In [None]:
# Standardise the features

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Build and fit the linear model in one step (`fit` returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
LinearRegressionScore = lr.score(X_test, y_test)
print("R² score (x100) obtained by Linear Regression model:",LinearRegressionScore*100)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A fixed `random_state` makes the bootstrap sampling and feature selection
# repeatable, so the reported score does not change between runs.
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train,y_train)

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
RandomForestRegressorScore = rf.score(X_test, y_test)
print("R² score (x100) obtained by Random Forest Regressor model:",RandomForestRegressorScore*100)

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed `random_state` makes tie-breaking between equally good splits
# repeatable, so the reported score does not change between runs.
tree = DecisionTreeRegressor(random_state = 42)
tree.fit(X_train,y_train)

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
DecisionTreeRegressorScore = tree.score(X_test, y_test)
print("R² score (x100) obtained by Decision Tree Regressor model:",DecisionTreeRegressorScore*100)

## KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Build and fit the k-nearest-neighbours model with its default settings
# (`fit` returns the estimator itself)
knn = KNeighborsRegressor().fit(X_train, y_train)
knn

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
KNeighborsRegressorScore = knn.score(X_test, y_test)
print("R² score (x100) obtained by K Neighbors Regressor model:",KNeighborsRegressorScore*100)

## AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# A fixed `random_state` makes the boosting resampling repeatable, so the
# reported score does not change between runs.
ada = AdaBoostRegressor(random_state = 42)
ada.fit(X_train, y_train)

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
AdaBoostRegressorScore = ada.score(X_test, y_test)
print("R² score (x100) obtained by AdaBoost Regressor model:",AdaBoostRegressorScore*100)

## Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# A fixed `random_state` makes the randomised parts of tree construction
# repeatable, so the reported score does not change between runs.
gbr = GradientBoostingRegressor(random_state = 42)
gbr.fit(X_train, y_train)

In [None]:
# `score` on a scikit-learn regressor returns the coefficient of determination
# (R²) on the given data, not a classification accuracy, so it is reported as R².
GradientBoostingRegressorScore = gbr.score(X_test, y_test)
print("R² score (x100) obtained by Gradient Boosting Regressor model:",GradientBoostingRegressorScore*100)

In [None]:
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8*" in 3.6 and
# later removed the old names; use whichever name this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

# Dedicated names are used instead of `x` / `y` so this cell does not overwrite
# the target Series `y` that the train/test split cell reads; re-running that
# cell after this one would otherwise split a plain list of six scores.
model_names = ["DecisionTreeRegressor",
               "LinearRegression",
               "AdaBoostRegressor",
               "KNeighborsRegressor",
               "RandomForestRegressor",
               "GradientBoostingRegressor"]

# Test-set values returned by each regressor's `.score()` (R²), in the same
# order as `model_names`.
model_scores = [DecisionTreeRegressorScore,
                LinearRegressionScore,
                AdaBoostRegressorScore,
                KNeighborsRegressorScore,
                RandomForestRegressorScore,
                GradientBoostingRegressorScore]

fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=model_names, y=model_scores, palette="crest", ax=ax);
# The plotted metric is R², so label it as such rather than as "accuracy"
plt.ylabel("R² score")
plt.xticks(rotation=40)
plt.title("Model Comparison - R² score", fontsize=14, fontname="Helvetica", y=1.03);