# Health insurance cost prediction

![Insurance cost](https://www.getinsuredonline.com/wp-content/uploads/2020/08/the-importance-of-having-health-insurance.jpg)

# Build 5 different models on the insurance data:


### * 1- Linear regression
### * 2- Linear regression (after removing the outliers)
### * 3- Ensemble (bagging)
### * 4- Logistic regression
### * 5- Logistic regression (after removing the outliers)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw insurance records into the DataFrame used by the whole notebook.
df = pd.read_csv(filepath_or_buffer="../input/insurance/insurance.csv")


In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isnull()

In [None]:
df.isnull().sum()

### No null values found in the dataset!

In [None]:
# Visual null check: draw the boolean "is null" mask of the frame as a heatmap.
fig, ax = plt.subplots(figsize=(9, 5))
null_mask = df.isnull()
sns.heatmap(null_mask, cmap="YlGnBu_r", cbar=False)
plt.show()

### The heatmap confirms that there are no null values!

In [None]:
df.shape

### Extract the unique values of the region attribute so that we can later convert them to numeric codes and use them in the model!

In [None]:
df.region.unique()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
summary = df.describe()
summary.T

In [None]:
# Print the median of age, then of bmi (one value per line).
for column in ('age', 'bmi'):
    print(df[column].median())

In [None]:
df.describe(include=['O'])

In [None]:
# Pairwise correlation of the numeric columns only. At this point sex, smoker
# and region are still strings; without numeric_only=True recent pandas
# versions raise instead of silently skipping them.
c = df.corr(numeric_only=True)
c

### Check for duplicate rows, then drop them!

In [None]:
# Count rows that repeat an earlier row and, if there are any, show every
# member of each duplicate group side by side.
n_duplicates = int(df.duplicated().sum())
if n_duplicates:
    print("No. of duplicated entries: ", n_duplicates)
    duplicate_groups = df[df.duplicated(keep=False)]
    print(duplicate_groups.sort_values(by=list(df.columns)).head())
else:
    print("No duplicated entries found")

In [None]:
# Drop repeated rows by content (keeping the first occurrence) instead of by a
# hard-coded position, so the cell stays correct if the data or row order changes.
df.drop_duplicates(inplace=True)

### Plot to discover outliers!

In [None]:
# Plotting libraries are imported again so the cell can run on its own.
import matplotlib.pyplot as plt
import seaborn as sns

# One box per column of df.
sns.boxplot(data=df)

### From the graph we can see that there are some outliers in the charges attribute. However, we will compare accuracy with and without removing them, to see whether our models are robust to outliers and still give good accuracy.

In [None]:
# Count values outside the 1.5 * IQR fences, column by column.
# Quantiles are only defined for numeric data, so restrict the frame first;
# calling quantile()/comparisons on string columns fails in recent pandas.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).sum()

In [None]:
df.shape[0]

In [None]:
# Encode the categorical columns as numbers.
# Assign through df.loc[row_mask, column] on the frame itself: the chained
# form df.sex.loc[...] = ... may write to a temporary object and triggers
# SettingWithCopyWarning.

# sex: 1 for female, 0 for male
df.loc[df['sex'] == 'female', 'sex'] = 1
df.loc[df['sex'] == 'male', 'sex'] = 0

# smoker: 1 for yes, 0 for no
df.loc[df['smoker'] == 'yes', 'smoker'] = 1
df.loc[df['smoker'] == 'no', 'smoker'] = 0

# region: one integer code per region name
regions={'region':{'southwest':1, 'southeast':2, 'northwest':3, 'northeast':4}}
df.replace(regions,inplace=True)
df.head()
    

In [None]:
df.to_csv('CleanedInsurance.csv', index = False)

In [None]:
# Absolute correlation of every column of the correlation matrix with charges.
cor_target = c["charges"].abs()
# Keep only the entries whose absolute correlation exceeds 0.2.
strong_enough = cor_target > 0.2
relevant_features = cor_target[strong_enough]
relevant_features

#### From the output above and the correlation table we can see that there are no strong correlations in the data. However, there are many cases where variables do not show a strong bivariate correlation but still show a strong association in a regression, so let's build our model and look at the accuracy!

## visualisation

In [None]:
# Annotated heatmap of the correlation matrix computed earlier.
plt.figure(figsize=(10, 5))
sns.heatmap(data=c, annot=True)

In [None]:
# Share of each sex in the data.
# value_counts() orders its result by frequency, so the labels (and colours)
# are looked up from the codes in its index rather than hard-coded in a fixed
# order; codes follow the encoding cell (1 = female, 0 = male).
sex_names = {1: 'Female', 0: 'Male'}
sex_colors = {1: 'red', 0: 'blue'}
size = df['sex'].value_counts()
labels = [sex_names.get(code, str(code)) for code in size.index]
colors = [sex_colors.get(code, 'grey') for code in size.index]
explode = [0, 0.1]

plt.rcParams['figure.figsize'] = (5, 5)
plt.pie(size, colors = colors, explode = explode, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('sex', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# Number of records per age.
plt.rcParams['figure.figsize'] = (15, 8)
# Pass the column as x=: in current seaborn the first positional argument of
# countplot is `data`, not the variable to count.
sns.countplot(x=df['age'], palette = 'hsv')
plt.title('Distribution of Age', fontsize = 20)
plt.show()

In [None]:
# Count of smokers / non-smokers, with one bar per sex.
sns.catplot(data=df, x="smoker", hue='sex', kind="count", palette="cool")

In [None]:
# Count of records per number of children.
# `size` was renamed to `height` in seaborn's figure-level functions.
sns.catplot(x="children", kind="count", palette="pink", data=df, height = 6)

In [None]:
# Pairwise relationships between the columns of df.
sns.pairplot(df)
# pairplot builds a grid of axes; plt.title would only label the last subplot,
# so put the title on the figure as a whole (y > 1 keeps it above the grid).
plt.suptitle('Pairplot for the Data', fontsize = 20, y = 1.02)
plt.show()

In [None]:
# charges against bmi, coloured by sex.
sns.scatterplot(data=df, x="bmi", y="charges", hue='sex', palette='Set2')

In [None]:
# Distribution of charges: density histogram with a KDE curve.
# distplot is deprecated in seaborn; histplot with kde=True is its replacement.
sns.histplot(x=df['charges'], kde=True, stat='density')

In [None]:
# Distribution of age: density histogram with a KDE curve.
# distplot is deprecated in seaborn; histplot with kde=True is its replacement.
sns.histplot(x=df['age'], kde=True, stat='density')

In [None]:
# Distribution of bmi: density histogram with a KDE curve.
# distplot is deprecated in seaborn; histplot with kde=True is its replacement.
sns.histplot(x=df['bmi'], kde=True, stat='density')

In [None]:
# Regression scatter plots of charges against each listed feature.
# 'sex' and 'smoker' are left out because they are categorical.
feature_cols = ['age', 'bmi','children']
sns.pairplot(data=df, kind='reg', x_vars=feature_cols, y_vars='charges')

In [None]:
# Scatter plot of charges against age, coloured by smoker status, to see
# whether smoking relates to charges across different ages.
plt.figure(figsize=(8,6))
# x and y must be passed by keyword in current seaborn.
sns.scatterplot(x=df.age, y=df.charges, hue=df.smoker, palette=['red','green'], alpha=0.6)
plt.show()

In [None]:
# Scatter plot of charges against age, coloured by sex.
# A single plt.figure call: a second one would leave an empty figure behind.
plt.figure(figsize=(8,6))
# x and y must be passed by keyword in current seaborn.
sns.scatterplot(x=df.age, y=df.charges, hue=df.sex, palette=['blue','red'])
plt.show()

In [None]:
# Scatter plot of charges against bmi, coloured by sex.
# A single plt.figure call: a second one would leave an empty figure behind.
plt.figure(figsize=(8,6))
# x and y must be passed by keyword in current seaborn.
sns.scatterplot(x=df.bmi, y=df.charges, hue=df.sex, palette='Set2')
plt.show()

In [None]:
# Bar plot of charges for each number of children.
plt.figure(figsize=(14, 6))
plt.title('Relation between children and Charges')
sns.barplot(data=df, x='children', y='charges')

In [None]:
# Total charges per region, smallest first.
charges = df['charges'].groupby(df.region).sum().sort_values(ascending = True)
f, ax = plt.subplots(1, 1, figsize=(8, 6))
first_regions = charges.head()
# Keyword x/y are required by current seaborn. Both variables are numeric
# (totals and region codes), so the orientation is stated explicitly: one
# horizontal bar per region, bar length = total charges.
ax = sns.barplot(x=first_regions.values, y=list(first_regions.index),
                 orient='h', palette='Purples')
ax.set_xlabel('charges')
ax.set_ylabel('region')

In [None]:
# Bar plot of charges per region, split by sex.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax = sns.barplot(data=df, x='region', y='charges', hue='sex', palette='cool')

In [None]:
# One regression plot of charges per feature, with separate fits for each
# smoker value; each feature gets its own palette.
for column, palette_name in (('age', 'Set1'), ('bmi', 'Set2'), ('children', 'Set3')):
    ax = sns.lmplot(x = column, y = 'charges', data=df, hue='smoker', palette=palette_name)

In [None]:
# Smallest and largest charge, used to pick the bin edges below.
charge_values = df['charges']
print(charge_values.min())
print(charge_values.max())


In [None]:
# Bucket charges into five equal-width bins of 13000 each.
# The first inner edge was written as 1300, which does not fit the 13000-step
# pattern of the remaining edges (26000, 39000, 52000, 65000).
df['charges_bins'] = pd.cut(df['charges'], bins=[0, 13000, 26000, 39000, 52000, 65000])
df.head()


In [None]:
# Count plot of the number of records falling in each charges bin.
plt.figure(figsize=(12,4))
sns.countplot(x='charges_bins', data=df, palette='husl')
# Font size and rotation are numeric quantities; pass numbers, not strings
# (a string rotation other than 'vertical'/'horizontal' is rejected by
# recent matplotlib).
plt.title('Number of people paying x amount\n for each charges category', size=23)
plt.xticks(rotation=25)
plt.ylabel('Count',size=18)
plt.xlabel('Charges',size=18)
plt.show()

### Start Regression

In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
df.head()

### 1- build model without removing outliers!

In [None]:
# Predictors and response for the model trained on the full data
# (outliers included).
feature_cols = ['age','sex','bmi','children','smoker','region']
X1 = df.loc[:, feature_cols]
y1 = df.loc[:, 'charges']
# 75 % of the rows go to the training split, the rest to the test split.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, train_size=0.75)

In [None]:

# Fit ordinary least squares on the training split, then predict the test split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(X_train1, y_train1)
y_pred = linreg.predict(X_test1)

In [None]:
# Show the fitted intercept, then each feature name next to its coefficient.
# The original built zip(feature_cols, linreg.coef_) but discarded it, so the
# pairing was never actually displayed.
print(linreg.intercept_)
for feature_name, coefficient in zip(feature_cols, linreg.coef_):
    print(feature_name, coefficient)

In [None]:
# score it on our test set to get a better sense of out of sample performance
linreg.score(X_test1, y_test1)

In [None]:
linreg.score(X_train1, y_train1)

In [None]:

# Error metrics of the linear model on the test split; the MSE is computed
# once and reused for the RMSE.
test_mae = metrics.mean_absolute_error(y_test1, y_pred)
test_mse = metrics.mean_squared_error(y_test1, y_pred)
print('MAE:', test_mae)
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

In [None]:
# Histogram of the residuals (actual minus predicted) on the test split.
diff = y_test1 - y_pred
diff.hist(bins=40)
plt.xlabel('cost prediction error')
plt.ylabel('Frequency')
plt.title('Histogram of prediction errors')

In [None]:
                fig = px.scatter(x=y_test1, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'})
                fig.add_shape(
                    type="line", line=dict(dash='dash'),
                    x0=y1.min(), y0=y1.min(),
                    x1=y1.max(), y1=y1.max()
                )
                fig.show()

In [None]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import learning_curve

In [None]:
### Bundling our previous work into a function ###
def learning_curves(estimator, data, features, target, train_sizes, cv):
    """Plot training and validation MSE against training-set size.

    Calls sklearn's ``learning_curve`` on ``data[features]`` / ``data[target]``
    with ``neg_mean_squared_error`` scoring and draws both curves on the
    current matplotlib axes. Nothing is returned.

    Args:
        estimator: model passed to ``learning_curve``; its printed name (text
            before the first "(") is used in the plot title.
        data: frame indexed with ``features`` and ``target``.
        features: column selection for the predictors.
        target: column selection for the response.
        train_sizes: training-set sizes forwarded to ``learning_curve``.
        cv: cross-validation setting forwarded to ``learning_curve``.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator,
        data[features],
        data[target],
        train_sizes=train_sizes,
        cv=cv,
        scoring='neg_mean_squared_error',
    )

    # Scores are negated MSE: flip the sign and average over the folds.
    curves = (
        ('Training error', -fit_scores.mean(axis=1)),
        ('Validation error', -cv_scores.mean(axis=1)),
    )
    for curve_label, errors in curves:
        plt.plot(sizes, errors, label=curve_label)

    plt.ylabel('MSE', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    model_name = str(estimator).split('(')[0]
    plt.title('Learning curves for a ' + model_name + ' model', fontsize=18, y=1.03)
    plt.legend()

### Plotting the two learning curves ###

from sklearn.ensemble import RandomForestRegressor
train_sizes = [1, 100, 500, 800, 900, 1000]
features = ['age','sex','bmi','children','smoker','region']
target = ['charges']
plt.figure(figsize=(16, 5))

# Side-by-side learning curves: decision tree on the left, linear regression
# on the right, both with 5-fold cross-validation.
for position, model in enumerate((DecisionTreeRegressor(), LinearRegression()), start=1):
    plt.subplot(1, 2, position)
    learning_curves(model, df, features, target, train_sizes, 5)

### using null model 

In [None]:
# Mean of the charges column: the constant prediction of the null model.
average_charges = df.charges.mean()
average_charges

In [None]:
# Number of rows in df.
num_rows = len(df)
num_rows

In [None]:
# The null model predicts the mean charge for every row.
null_model_predictions = [average_charges for _ in range(num_rows)]
null_model_predictions

In [None]:
# Error metrics of the null model against all of y1; the MSE is computed once
# and reused for the RMSE.
null_mae = metrics.mean_absolute_error(y1, null_model_predictions)
null_mse = metrics.mean_squared_error(y1, null_model_predictions)
print('MAE:', null_mae)
print('MSE:', null_mse)
print('RMSE:', np.sqrt(null_mse))

So, we are beating the null model 

### Train data using ensemble (bagging) and see its performance !


In [None]:
from sklearn.ensemble import BaggingRegressor
#from sklearn import tree
#model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
#model.fit(X_train1, y_train1)
#model.score(X_test1,y_test1)

In [None]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# from sklearn import model_selection
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier

# seed = 7
# kfold = model_selection.KFold(n_splits=10, random_state=seed)
# cart = DecisionTreeClassifier()
# num_trees = 100
# model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
# results = model_selection.cross_val_score(model, X1, y1, cv=kfold)
# print(results.mean())

In [None]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a bagged decision-tree regressor for 1..19 estimators and record the
# score() value (R^2 for regressors) on the test and training splits.
estimators = list(range(1, 20))
accuracyTest = []
aacuracyTrain=[]
lowest, highest = y1.min(), y1.max()
for n_estimators in estimators:
    # The base model is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn, and the positional form
    # works with both spellings.
    clf = BaggingRegressor(DecisionTreeRegressor(),
                            n_estimators=n_estimators,
                            max_samples=0.2).fit(X_train1, y_train1)
    acc = clf.score(X_test1, y_test1)
    accuracyTest.append(acc)
    acc1 = clf.score(X_train1, y_train1)
    aacuracyTrain.append(acc1)
    # Predicted against actual charges with a dashed identity line. Drawn with
    # matplotlib because `px` is never imported in this notebook.
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test1, clf.predict(X_test1), alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], linestyle='--', color='black')
    plt.xlabel('ground truth')
    plt.ylabel('prediction')
    plt.show()

plt.plot(estimators, accuracyTest)
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.show()

In [None]:
accuracyTest

In [None]:
np.mean(accuracyTest)

In [None]:
np.max(accuracyTest)

In [None]:
np.max(aacuracyTrain)

In [None]:
# clf1 = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
#                             n_estimators=10,
#                             bootstrap=False,
#                             bootstrap_features=False,
#                             random_state=5).fit(X_train1, y_train1)

In [None]:
#clf1.estimators_

### Train linear regression again, BUT after removing the outliers, to see their effect on the model!

In [None]:
# Count values outside the 1.5 * IQR fences, column by column.
# Only numeric columns are used: df now also holds the categorical
# charges_bins column, on which quantile() is not defined.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).sum()

In [None]:
# First and third quartile of charges, and the distance between them.
charge_column = df['charges']
q1 = charge_column.quantile(0.25)
q3 = charge_column.quantile(0.75)
iqr = q3 - q1

In [None]:
# Rows whose charges fall outside the 1.5 * IQR fences on either side.
# The original assigned the low-side rows and then immediately overwrote them
# with the high-side rows; both sides are now combined in one selection.
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
charges = df[(df['charges'] < lower_fence) | (df['charges'] > upper_fence)]
charges.describe()

In [None]:
# Keep only the rows with no value outside the 1.5 * IQR fences.
# The comparison is limited to the columns that actually have quantiles
# (IQR.index), so non-numeric columns never enter it.
fenced_columns = df[IQR.index]
is_outlier = ((fenced_columns < (Q1 - 1.5 * IQR)) |
              (fenced_columns > (Q3 + 1.5 * IQR))).any(axis=1)
# .copy() makes the result an independent frame, so later assignments to it
# do not act on a slice of df.
dfWithoutOutlier = df[~is_outlier].copy()
dfWithoutOutlier.shape

In [None]:
dfWithoutOutlier.head()

In [None]:
dfWithoutOutlier.describe()

In [None]:
# Linear regression on the outlier-free data.
feature_cols = ['age','sex','bmi','children','smoker','region'] # a list of the predictors
X2 = dfWithoutOutlier[feature_cols] # subsetting our data to only the predictors
y2 = dfWithoutOutlier['charges'] # our response variable

# Split X2 / y2. The original split X1 / y1 (the data WITH outliers), so the
# "without outliers" model was trained and scored on the full data.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2)

linreg = LinearRegression()
linreg.fit(X_train2, y_train2)
y_pred = linreg.predict(X_test2)
# score it on our test set to get a better sense of out of sample performance
linreg.score(X_test2, y_test2)

In [None]:
linreg.score(X_train2, y_train2)

#### The accuracy is very poor, about as bad as a random model! This indicates that the outliers are significant and should not be removed.

# Logistic Regression:

In [None]:
# Work on an independent copy for the classification task. A plain
# `df2 = df` only creates a second name for the same frame, so turning
# charges into a 0/1 label on df2 would also overwrite the charges in df.
df2 = df.copy()
df2.head()

### I chose the threshold based on some articles and resources stating that the average cost of insurance in the US is 9596

In [None]:
# Turn charges into a binary label around the 9596 threshold.
threshold = 9596
at_or_below_average = df['charges'] <= threshold
above_average = df['charges'] > threshold
df2.loc[at_or_below_average, 'charges'] = 0  # at or under average
df2.loc[above_average, 'charges'] = 1  # above average
df2

In [None]:
# Binary label for the outlier-free data, built on an independent copy:
# `df3 = dfWithoutOutlier` would only alias the filtered frame, so the
# assignments below would overwrite its charges and may raise
# SettingWithCopyWarning.
df3 = dfWithoutOutlier.copy()
df3.loc[df3['charges'] <= 9596, 'charges'] = 0 # at or under average
df3.loc[df3['charges'] > 9596, 'charges'] = 1 # above average
df3

In [None]:
df2.groupby(['charges']).count()

In [None]:
sns.catplot(x="charges", kind="count", palette="cool", data=df3)

In [None]:
sns.catplot(x="charges", kind="count", palette="cool", data=df2)

In [None]:
# Predictors and binary response for the logistic regression on df2.
feature_cols = ['age','sex','bmi','children','smoker','region']
X = df2.loc[:, feature_cols]
y = df2.loc[:, 'charges']

In [None]:
y

In [None]:
#split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)
print (X_train.shape,X_test.shape,y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# score it on our test set to get a better sense of out of sample performance
logreg.score(X_test, y_test)


In [None]:
logreg.score(X_train, y_train)

In [None]:
# Show the fitted intercept, then each feature name next to its coefficient.
# logreg.coef_ is 2-D (one row per decision function), so the single row is
# selected before pairing; the original zip paired the first feature name with
# the whole row and then discarded the result.
print(logreg.intercept_)
for feature_name, coefficient in zip(feature_cols, logreg.coef_[0]):
    print(feature_name, coefficient)

## Without outliers!

In [None]:
# Logistic regression on the outlier-free data.
feature_cols = ['age','sex','bmi','children','smoker','region'] # a list of the predictors
X4 = df3[feature_cols] # subsetting our data to only the predictors
y4 = df3['charges'] # our response variable
# split data to train and test
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4)
# The model gets its own name. Rebinding `logreg` here would make the cells
# below evaluate this model on X_test / y_test, a split drawn independently
# from data that overlaps its training rows.
logreg_no_outliers = LogisticRegression()
# fit our model to our training set
logreg_no_outliers.fit(X_train4, y_train4)
logreg_no_outliers.score(X_test4, y_test4)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the test split and tabulate actual against predicted classes.
LR_prediction = logreg.predict(X_test)
actual_classes = y_test.tolist()
predicted_classes = LR_prediction.tolist()
# NOTE: the name confusion_matrix is rebound from the function to its result;
# the plotting cell below reads the result under this name.
confusion_matrix = confusion_matrix(actual_classes, predicted_classes)
print(confusion_matrix)

In [None]:
# Heatmap of the confusion matrix.
# sklearn orders the classes ascending, so row/column 0 is class 0 and
# row/column 1 is class 1; the labels are attached through the DataFrame so
# they cannot drift from the data.
class_names = [0, 1]
matrix_frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)

fig, ax = plt.subplots()
sns.heatmap(matrix_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
# Title and axis labels are set on the heatmap's own axes; the original opened
# a new figure first, so they ended up on an empty plot.
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.tight_layout()
plt.show()

In [None]:
# Actual and predicted class side by side for the first 20 test rows.
comparison_columns = {'Actual': y_test, 'Predicted': LR_prediction}
error = pd.DataFrame(comparison_columns)
error.head(20)

In [None]:
from sklearn import metrics
# ROC curve and AUC from the predicted probability of the second class
# (column 1 of predict_proba).
class_probabilities = logreg.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
curve_label = "data 1, auc="+str(auc)
plt.plot(fpr, tpr, label=curve_label)
plt.legend(loc=4)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared difference between the predicted and the actual test labels.
actual_labels = y_test.tolist()
mean_squared_error(LR_prediction, actual_labels)

In [None]:

# Per-class probabilities predicted for every row of the test split.
probs = logreg.predict_proba(X=X_test)
print(probs)