# Medical Expenses Prediction Analysis using Random Forest Regressor

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Load the medical-insurance dataset and report its (rows, columns) size.
data = pd.read_csv(filepath_or_buffer='med-insurance.csv')
data.shape

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

In [None]:
# Descriptive statistics of the numeric columns, shaded with a green
# gradient so larger values stand out.
summary = data.describe()
summary.style.background_gradient(cmap='Greens')

### Univariate Analysis

In [None]:
import plotly.graph_objects as go

# Bar chart of how many records fall in each `smoker` category.
# The bar height must be the category count; plotting the row index against
# the raw column draws one bar per row and does not show the distribution.
smoker_counts = data['smoker'].value_counts()

fig = go.Figure([go.Bar(x=smoker_counts.index, y=smoker_counts.values,
                        marker_line_color='rgb(8,48,107)')])
fig.show()

In [None]:
# Bar chart of how many records have each number of children.
# Counts are sorted by the number of children so the x axis reads 0, 1, 2, ...
children_counts = data['children'].value_counts().sort_index()

fig = go.Figure([go.Bar(x=children_counts.index, y=children_counts.values,
                        marker_line_color='rgb(100,48,107)')])
fig.show()

In [None]:
# Bar chart of how many records come from each region.
# Bar height is the per-region count rather than the row index.
region_counts = data['region'].value_counts()

fig = go.Figure([go.Bar(x=region_counts.index, y=region_counts.values,
                        marker_line_color='rgb(80,48,7)')])
fig.show()

In [None]:
# Distribution of the `age` column as a histogram.
fig = px.histogram(data_frame=data, x="age")
fig.show()

In [None]:
# Distribution of the `bmi` column as a violin plot.
fig = px.violin(data_frame=data, x="bmi")
fig.show()

### Bivariate Analysis

In [None]:
# Impact of age on medical expenses: one box of `expenses` per `age` value.
px.box(data_frame=data, x='age', y='expenses')

* With increasing age, expenses are expected to increase, but this is not obvious in all scenarios.

In [None]:
# Impact of BMI on medical expenses: scatter of `expenses` against `bmi`.
px.scatter(data_frame=data, x='bmi', y='expenses')

In [None]:
# Expenses grouped by number of children; every individual point is drawn
# next to its box.
px.box(data_frame=data, x='children', y='expenses', points="all")

In [None]:
# Expenses grouped by smoker status; every individual point is drawn next
# to its box.
px.box(data_frame=data, x='smoker', y='expenses', points="all")

In [None]:
# Look at the first five rows again before the multivariate plots.
data.head(n=5)

# Multivariate Analysis

In [None]:
# Age (log-scaled x axis) against BMI, with marker size driven by the number
# of children and marker colour by the expenses.
fig = px.scatter(
    data_frame=data,
    x="age",
    y="bmi",
    color="expenses",
    size="children",
    size_max=60,
    log_x=True,
)
fig.show()

### Data Processing

In [None]:
# Encode the categorical columns as integers so the model can consume them.
#
# Codes used (the group assumed to have higher expenses gets the higher code;
# NOTE(review): that assumption is not verified anywhere in this notebook):
#   sex:    male      -> 2, female            -> 1
#   smoker: yes       -> 2, no                -> 1
#   region: southeast -> 2, all other regions -> 1
#
# An explicit dict with map() produces integer columns directly, instead of
# relying on replace() silently downcasting an object column (deprecated since
# pandas 2.2). map() turns any unlisted label into NaN, so the astype() below
# fails loudly on an unexpected category; run this cell once on freshly
# loaded data.
sex_codes = {'male': 2, 'female': 1}
smoker_codes = {'yes': 2, 'no': 1}
region_codes = {'southeast': 2, 'southwest': 1, 'northeast': 1, 'northwest': 1}

data['sex'] = data['sex'].map(sex_codes).astype('int64')
data['smoker'] = data['smoker'].map(smoker_codes).astype('int64')
data['region'] = data['region'].map(region_codes).astype('int64')

# let's check whether any categorical column is left
data.select_dtypes('object').columns

In [None]:
# Inspect the first five rows after encoding.
data.head(n=5)

In [None]:
# Separate the feature matrix (every column except `expenses`) from the
# target column `expenses`.
X = data.drop(columns=['expenses'])
y = data['expenses']

In [None]:
# First five rows of the feature matrix.
X.head(n=5)

In [None]:
# First five values of the target.
y.head(n=5)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the sizes of the resulting partitions.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training data only, then apply the
# same fitted scaler to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model Building using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees. The seed is fixed (same value as the
# train/test split) so the fitted model, and therefore the scores reported
# below, are reproducible between runs.
model2 = RandomForestRegressor(n_estimators=100, random_state=0)
model2.fit(X_train, y_train)

In [None]:
# Predict on the held-out features and place the predictions next to the
# true values: column 0 = predicted, column 1 = actual.
# Note: this rebinds `data` from the original DataFrame to a NumPy array.
y_pred = model2.predict(X_test)
y_test1 = y_test.values
data = np.column_stack((y_pred, y_test1))
data

# Model Accuracy

In [None]:
# Evaluate the model on the held-out set with RMSE and R2.
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Store the score under its own name so the imported r2_score function is
# not overwritten by a float.
r2 = r2_score(y_test, y_pred)
print("R2 Score :", r2)

In [None]:
# Save actual vs. predicted expenses side by side.
# The frame is built from y_test / y_pred directly so each column carries the
# right label (the stacked array holds predictions in column 0 and actual
# values in column 1, which is easy to mix up).
dataset = pd.DataFrame({'y_test': np.asarray(y_test), 'y_pred': np.asarray(y_pred)})
# The row index is written too; it comes back as the 'Unnamed: 0' column when
# the file is read again.
dataset.to_csv('accuracy.csv')

In [None]:
# Read the saved actual/predicted table back from disk.
dataset = pd.read_csv(filepath_or_buffer='accuracy.csv')

In [None]:
# First five rows of the reloaded table.
dataset.head(n=5)

In [None]:
import plotly.graph_objects as go

# Line chart of the `y_test` and `y_pred` columns against the saved row
# index (the 'Unnamed: 0' column of the reloaded CSV).
row_index = dataset['Unnamed: 0']

actual_trace = go.Scatter(
    x=row_index,
    y=dataset['y_test'],
    name='Actual Value',
    line=dict(color='royalblue', width=3),
)
predicted_trace = go.Scatter(
    x=row_index,
    y=dataset['y_pred'],
    name = 'Predicted Value',
    line=dict(color='firebrick', width=2),
)

fig = go.Figure()
fig.add_trace(actual_trace)
fig.add_trace(predicted_trace)


## This model can predict the medical expenses of a person with an accuracy (R2 score) of about 87%