In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Columns

age: age of primary beneficiary

sex: insurance contractor gender, female, male

bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height,
objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9

children: Number of children covered by health insurance / Number of dependents

smoker: Smoking

region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

charges: Individual medical costs billed by health insurance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the insurance CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/insurance/insurance.csv')
df.head()

# **Bivariate Analysis**

Age vs charges

In [None]:
# Scatter of charges against age. The columns are passed by keyword because
# recent seaborn releases no longer accept x/y as positional arguments.
sns.scatterplot(x='age', y='charges', data=df)
plt.show()

Here we can see that as age increases, insurance charges also increase. Another important insight is that three distinct groups are visible in the plot. This could reflect three different insurance packages with different benefits.

Sex vs charges

In [None]:
# Mean charges per sex, drawn as a bar chart.
mean_charges_by_sex = df.groupby('sex')['charges'].mean()
mean_charges_by_sex.plot(kind='bar')
plt.show()

We can see mean insurance charges for males are higher in comparison to females

BMI vs charges

In [None]:
# Scatter of charges against BMI. The columns are passed by keyword because
# recent seaborn releases no longer accept x/y as positional arguments.
sns.scatterplot(x='bmi', y='charges', data=df)
plt.show()

There does not seem to be any significant correlation between bmi and charges.

Children vs charges

In [None]:
# Distribution of charges for each number of covered children.
sns.boxplot(data=df, x='children', y='charges')
plt.show()

Insurance cost is high if children are 2 or 3 but if children are 3+ then maybe insurance company is providing discounts and thus leading to decrease in insurance charges

Smoker vs Charges

In [None]:
# Mean charges for smokers vs. non-smokers, drawn as a bar chart.
df.groupby('smoker')['charges'].mean().plot.bar()
# plt.show must be called; the bare attribute reference is a no-op that only
# echoes the function object as the cell output.
plt.show()

Insurance charges are high if the customer is a smoker as there is a high chance he/she is at high risk of other diseases  

Region vs charges

In [None]:
# Distribution of charges within each region.
sns.boxplot(data=df, x='region', y='charges')
plt.show()

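Median insurance cost appears highest in the southeast US, whereas it is similar in the other regions. (Note: the median-based encoding applied below ranks northeast above southeast, so confirm this reading against the computed medians.)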

Converting categorical columns into numerical 

Columns- region ,smoker,sex

In [None]:
# Median charges per region, largest first; used to decide the ordinal encoding below.
median_charges_by_region = df.groupby('region')['charges'].median()
median_charges_by_region.sort_values(ascending=False)

In [None]:
# Ordinal-encode `region`: each category is replaced by a rank that is meant to
# follow the per-region median charges (highest median -> highest number).
region_rank = {'northeast': 4, 'southeast': 3, 'northwest': 2, 'southwest': 1}
df['region'] = df['region'].map(region_rank)

In [None]:
# Create dummy indicators for both `sex` and `smoker`; drop_first=True drops the
# first level of each so a single indicator column remains per feature.
# dtype=int keeps the indicators numeric: newer pandas versions return boolean
# dummies by default, which turns the mixed design matrix into object dtype and
# makes the statsmodels OLS fit further down reject it.
df = pd.get_dummies(df, columns=['sex', 'smoker'], drop_first=True, dtype=int)

### **Base Model**

Building linear regression through OLS method

In [None]:
import statsmodels.api as  sm

In [None]:
# Target is `charges`; every remaining column is used as a feature.
Y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.3, random_state=0
)

In [None]:
# statsmodels OLS does not add an intercept on its own, so add the constant column explicitly.
X_train_const = sm.add_constant(data=X_train)

In [None]:
# Fit ordinary least squares on the training split and show the regression summary.
ols_spec = sm.OLS(endog=Y_train, exog=X_train_const)
model = ols_spec.fit()
model.summary()

### Calculating SSE (Sum of Squared Errors) & SSR (Sum of Squares due to Regression)

In [None]:
# (rows, columns) of the test and training feature matrices.
(X_test.shape, X_train.shape)

In [None]:
# The test features need the same constant column as the training design matrix
# before the fitted OLS model can predict on them.
X_test_const = sm.add_constant(data=X_test)
y_pred = model.predict(exog=X_test_const)

In [None]:
# SSE: squared prediction errors on the test set.
SSE = np.sum((Y_test - y_pred) ** 2)
# SSR: squared deviations of the predictions from the mean of the test targets.
SSR = np.sum((y_pred - Y_test.mean()) ** 2)
# SST: total squared deviation of the test targets from their mean, computed
# directly. The identity SST = SSE + SSR is only guaranteed for the data an OLS
# model with intercept was fitted on, not for held-out predictions.
SST = np.sum((Y_test - Y_test.mean()) ** 2)

In [None]:
# Out-of-sample R^2 = 1 - SSE / (total sum of squares of the test targets).
# The total is computed here directly from Y_test so the value matches the usual
# definition (the one sklearn's `score` reports) instead of the SSR / (SSE + SSR)
# ratio, which is only equivalent on the training data.
R2 = 1 - SSE / np.sum((Y_test - Y_test.mean()) ** 2)
R2

In [None]:
# N = number of test rows, p = number of predictors (columns of X_test).
N, p = X_test.shape
# Adjusted R^2 penalises R^2 for the number of predictors used.
Adj_R2 = 1 - (1 - R2) * (N - 1) / (N - p - 1)
Adj_R2

Calculating RMSE(Root Mean Squared Error)

In [None]:
# Root mean squared error of the OLS predictions over the N test rows.
squared_errors = (y_pred - Y_test) ** 2
rmse = np.sqrt(np.sum(squared_errors) / N)
rmse

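Building model through sklearn (sklearn's `LinearRegression` solves the ordinary least squares problem directly with a linear-algebra solver; it does not use gradient descent)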

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit sklearn's linear regression on the same training split (it adds the intercept itself).
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

In [None]:
# Predictions of the sklearn model on the held-out features.
y_pred_lr = lr.predict(X=X_test)

In [None]:
# R^2 of the sklearn model on the test split.
lr.score(X=X_test, y=Y_test)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the sklearn predictions: square root of the mean squared error.
mse_lr = mean_squared_error(y_true=Y_test, y_pred=y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
rmse_lr


RMSE comes out the same from both libraries (statsmodels OLS and sklearn's LinearRegression), as expected, since both solve the same least-squares problem.


# Checking Assumptions of Linear Regression

In [None]:
# Re-display the OLS summary table; the Durbin-Watson statistic discussed below is read from it.
model.summary()

## No Auto Correlation - 
In Context of linear regression, auto correlation refers to correlation between consecutive residuals

Test needed : Durbin- Watson Test.

    Its value ranges from 0 to 4. A Durbin-Watson value between 0 and 2 indicates positive autocorrelation.
    
    If the value ranges from 2-4, it is known as Negative autocorrelation.
    
    If the value is exactly 2, it means No Autocorrelation.
    
For a good linear model, it should have low or no autocorrelation.


We can see from summary table Durbin Watson is almost close to 2 so there is no auto correlation

In [None]:
import statsmodels.tsa.api as smt
# Autocorrelation plot of the training residuals for lags up to 40; alpha=0.05
# is passed to plot_acf to set the confidence level of the band it draws.
acf = smt.graphics.plot_acf(model.resid, lags=40, alpha=0.05)
# Render through pyplot: Figure.show() under the notebook's inline (non-GUI)
# backend only emits a warning instead of displaying the figure.
plt.show()

## Normality of residuals
1. The second assumption is the Normality of Residuals. 
For this we prefer the Jarque Bera test. For a good model, the residuals should be normally distributed.
The higher the value of Jarque Bera test , the lesser the residuals are normally distributed.
We generally prefer a lower value of jarque bera test.

  The Jarque–Bera test is a goodness-of-fit test of whether sample data 
  have the skewness and kurtosis matching a normal distribution.

    
  The jarque bera test tests whether the sample data has the skewness and kurtosis matching a normal distribution.
  Note that this test generally works well only for a large enough number of data samples (>2000), as the test statistic asymptotically has a chi-squared distribution with 2 degrees of freedom.

2. We can go for shapiro test here as our dataset is small.



In [None]:
from scipy.stats import shapiro
# Shapiro-Wilk test on the training residuals (null hypothesis: residuals are normal).
# Interpretation recorded by the author: the p-value is almost 0, so the null
# hypothesis is rejected and the residual distribution is not normal.
shapiro_result = shapiro(model.resid)
shapiro_result

In [None]:
# Visual counterpart of the normality test: the residual distribution is plotted
# to show that it is skewed rather than normal.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot when upgrading.
sns.distplot(a=model.resid)
plt.show()

## Linearity of residuals
Here we have 2 options. Either we can plot the observed values Vs predicted values and plot the Residual Vs predicted values and see the linearity of residuals.

OR

We can go for rainbow test. Let's look both of them one by one.

OR 

We can also check if mean of residual is 0 or not

In [None]:
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import seaborn as sns 
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
# Plot styling for this section: dark grid theme and a wide default figure size.
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = (15.0, 9.0)

def linearity_test(model, y, fitted_vals=None):
    '''
    Function for visually inspecting the assumption of linearity in a linear regression model.
    It plots observed vs. predicted values and residuals vs. predicted values.

    Args:
    * model - fitted OLS model from statsmodels (kept for interface
      compatibility; the plots themselves do not read it)
    * y - observed values
    * fitted_vals - predicted values aligned with y. When omitted, the
      notebook-level `y_pred` is used, which is what the function always
      read before this parameter existed.

    Returns:
    None. Two panels are drawn side by side on a new matplotlib figure.
    '''
    if fitted_vals is None:
        # Backward-compatible fallback to the notebook global.
        fitted_vals = y_pred
    # Residual = observed - predicted, the same sign convention as model.resid,
    # computed from the `y` argument rather than a hard-coded global.
    resids = y - fitted_vals

    fig, ax = plt.subplots(1, 2)

    # Left panel: observed against predicted, with a lowess trend line in red.
    sns.regplot(x=fitted_vals, y=y, lowess=True, ax=ax[0], line_kws={'color': 'red'})
    ax[0].set_title('Observed vs. Predicted Values', fontsize=16)
    ax[0].set(xlabel='Predicted', ylabel='Observed')

    # Right panel: residuals against predicted, with a lowess trend line in red.
    sns.regplot(x=fitted_vals, y=resids, lowess=True, ax=ax[1], line_kws={'color': 'red'})
    ax[1].set_title('Residuals vs. Predicted Values', fontsize=16)
    ax[1].set(xlabel='Predicted', ylabel='Residuals')

# Evaluate on the held-out split, passing the test predictions explicitly.
linearity_test(model, Y_test, fitted_vals=y_pred)

To detect nonlinearity one can inspect plots of observed vs. predicted values or residuals vs. predicted values. 
The desired outcome is that points are symmetrically distributed around a diagonal line in the former plot or 
around horizontal line in the latter one. 
In both cases only a weak linear pattern can be seen. Let's verify with the rainbow test.

In [None]:
import statsmodels.api as sm
# Rainbow test for linearity on the fitted OLS results, using the central 50% of
# the observations (frac=0.5). The statistic and its p-value are displayed.
rainbow_stat, rainbow_pvalue = sm.stats.diagnostic.linear_rainbow(res=model, frac=0.5)
(rainbow_stat, rainbow_pvalue)

P value is  more than .05 so we fail to reject null hypothesis and there is linearity in residuals

In [None]:
np.mean(model.resid)
# NOTE: the model was fitted with a constant term, and the residuals of an OLS
# fit with an intercept average to (numerically) zero by construction. A mean
# close to 0 therefore does not by itself demonstrate linearity; rely on the
# plots and the rainbow test above for that conclusion.

## Homoscedasticity test (using the Goldfeld-Quandt test) OR (Breusch-Pagan test)
Homoscedasticity :: If the residuals are symmetrically distributed across the trend (constant spread), they are called homoscedastic.

Heteroscedasticity :: If the residuals are not symmetric across the trend, they are called heteroscedastic. In this case the residuals can form an arrow (funnel) shape or any other non-symmetrical shape.

This test is based on hypothesis testing, where the null and alternative hypotheses are:

H0 = constant variance among residuals. (Homoscedasticity)

Ha = Heteroscedasticity.

The residuals should be homoscedastic.

In [None]:
from statsmodels.compat import lzip
from statsmodels.compat import lzip
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
# Plot styling for this section: dark grid theme and a wide default figure size.
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = (15.0, 9.0)

# Training-set quantities taken from the fitted OLS results.
resids = model.resid
fitted_vals = model.predict()
resids_standardized = model.get_influence().resid_studentized_internal

fig, ax = plt.subplots(1, 2)

# One (y-values, title, y-label) entry per panel, drawn left to right; each panel
# plots against the fitted values with a lowess trend line in red.
panels = [
    (resids, 'Residuals vs Fitted', 'Residuals'),
    (np.sqrt(np.abs(resids_standardized)), 'Scale-Location', 'sqrt(abs(Residuals))'),
]
for axis, (values, title, ylabel) in zip(ax, panels):
    sns.regplot(x=fitted_vals, y=values, lowess=True, ax=axis, line_kws={'color': 'red'})
    axis.set_title(title, fontsize=16)
    axis.set(xlabel='Fitted Values', ylabel=ylabel)

# Goldfeld-Quandt test on the residuals; lzip pairs each label with the
# corresponding leading element of the test result for display.
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(model.resid, model.model.exog)
lzip(name, test)

 We can also use two statistical tests: Breusch-Pagan and Goldfeld-Quandt. In both of them the null hypothesis assumes 
homoscedasticity and a p-value below a certain level (like 0.05)
indicates we should reject the null in favor of heteroscedasticity.

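Here, the conclusion of homoscedasticity holds only if the p-value is greater than 0.05 (we fail to reject H0); a p-value below 0.05 would instead mean rejecting H0 in favor of heteroscedasticity.

H0 = constant variance (Homoscedasticity)

Ha = Heteroscedasticity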

## MultiCollinearity

We use VIF method to check for multicollinearity.
In VIF Method we only deal with independent variables or features. VIF value for each feature is calculated by making that feature as target and rest of the features as independent variables and regression is done and VIF is calculated as 1/(1-R^2).

So for a feature exhibiting multicollinearity, the R² score would be high, so the denominator (1 - R²) will be low and the VIF value will be high. We treat a feature with a VIF > 5 as having multicollinearity.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the training design matrix
# (constant included), shown as a one-column table indexed by column name.
design_matrix = X_train_const.values
n_columns = X_train_const.shape[1]
vif = [variance_inflation_factor(design_matrix, col_idx) for col_idx in range(n_columns)]
pd.DataFrame({'vif': vif}, index=X_train_const.columns)

Since VIF factor for all features is less than 5 so no multicollinearity exists