# Medical Cost Prediction

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
# Read the insurance data from a CSV in the working directory, then preview
# the first rows as the cell output.
df = pd.read_csv("insurance.csv")
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Visualization + Preprocessing

In [None]:
# Replace the three categorical columns with integer codes, in place.
# One LabelEncoder is refit per column, so once this cell has run `le` only
# holds the label mapping of 'region' (the last column processed).
le = LabelEncoder()
for column in ('sex', 'smoker', 'region'):
    # fit_transform learns the distinct labels and encodes the column in one
    # call; codes are assigned following the sorted order of the labels.
    df[column] = le.fit_transform(df[column])

In [None]:
# Side-by-side distributions of 'charges' for the two encoded smoker groups
# (code 1 on the left, code 0 on the right).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; switch to
# histplot/displot once the target seaborn version is confirmed.
f = plt.figure(figsize=(18, 5))

smoker_charges = df.loc[df['smoker'] == 1, 'charges']
ax = f.add_subplot(121)
sns.distplot(smoker_charges, color='c', ax=ax)
ax.set_title('Distribution of charges for smokers')

non_smoker_charges = df.loc[df['smoker'] == 0, 'charges']
ax = f.add_subplot(122)
sns.distplot(non_smoker_charges, color='b', ax=ax)
ax.set_title('Distribution of charges for non-smokers')

In [None]:
# Count of rows per 'smoker' value, with bars split by 'sex'.
sns.catplot(x='smoker', kind='count', hue='sex', data=df)

In [None]:
# Distribution of the 'age' column on a fresh 12x5 figure.
# The title is set before plotting; distplot then draws into the current axes.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
plt.figure(figsize=(12,5))
plt.title('Distribution of age')
sns.distplot(df['age'], color='g')

In [None]:
# Charges against age, with a separate linear fit for each 'smoker' value.
sns.lmplot(x='age', y='charges', hue='smoker', data=df)

In [None]:
# Distribution of the 'bmi' column on a fresh 12x5 figure.
# The title is set before plotting; distplot then draws into the current axes.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11.
plt.figure(figsize=(12,5))
plt.title('Distribution of bmi')
sns.distplot(df['bmi'], color='g')

In [None]:
# Charges against bmi, with a separate linear fit for each 'smoker' value.
sns.lmplot(x='bmi', y='charges', hue='smoker', data=df)

In [None]:
# Correlation of every column with 'charges', sorted from lowest to highest.
df.corr()['charges'].sort_values()

In [None]:
# Annotated heatmap of the full correlation matrix; values are printed with
# two decimals and coloured with the reversed 'coolwarm' colormap.
plt.figure(figsize=(16,12))
sns.heatmap(df.corr(), annot=True, fmt= '.2f', cmap='coolwarm_r')

In [None]:
# Separate the target ('charges') from the predictors, then hold out 20% of
# the rows for evaluation; random_state pins the shuffle so the split repeats.
y = df['charges']
X = df.drop(columns=['charges'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

### Model Training + Evaluation

In [None]:
# Fit each regressor on the training split, print its R^2 on the held-out
# split, and draw a residual plot (prediction minus truth) for both splits.
models = [
    LinearRegression(),
    # criterion is left at its default (squared error): the 'mse' spelling was
    # removed in scikit-learn 1.2, while the default selects the same
    # criterion on both older and newer releases.
    RandomForestRegressor(n_estimators=100, random_state=1, n_jobs=-1),
]

for m in models:
    m.fit(X_train, y_train)
    name = m.__class__.__name__
    y_train_pred = m.predict(X_train)
    y_test_pred = m.predict(X_test)
    # R^2 is computed on the test split only; the variable keeps the name
    # `acc` in case other cells read it.
    acc = r2_score(y_test, y_test_pred)

    print("R2 of {}: {}".format(name, acc))

    plt.figure(figsize=(10, 6))
    plt.scatter(y_train_pred, y_train_pred - y_train,
                c='black', marker='o', s=35, alpha=0.5, label='Train data')
    plt.scatter(y_test_pred, y_test_pred - y_test,
                c='m', marker='o', s=35, alpha=0.7, label='Test data')
    plt.xlabel('Predicted Values')
    # The y-axis shows prediction minus truth, i.e. the residuals.
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    # Zero-residual reference line; the 0-60000 span is a fixed range and is
    # not derived from the data.
    plt.hlines(y=0, xmin=0, xmax=60000, lw=2, color='red')
    plt.show()

In [None]:
# Attribution: the notebook listed below is credited as inspiration.
'''
Inspiration
1. https://www.kaggle.com/hely333/eda-regression
'''