In [None]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
import sklearn
import statsmodels.api as sm
import statsmodels.stats.diagnostic as sm_diagnostic
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson

%matplotlib inline

**Description:**

**Model Relating Viscosity to moisture, protein, and ash contents in flour used in baking ice cream cones.**

**Model: V = b0 + b1M + b2P + b3A**

In [None]:
# Read the ice-cream-cone flour data set from the relative ../input directory.
df = pd.read_csv(filepath_or_buffer="../input/baking-ice-cream-cones/icecreamcone.csv")
# Preview the first five rows (five is the default for head()).
df.head(n=5)

# Set The `codeNum` Column As The Index

In [None]:
# Use the `codeNum` column as the row index.
# The membership guard keeps this cell re-runnable: once `codeNum` has become
# the index it is no longer a column, so an unconditional set_index would
# raise KeyError on a second execution. Rebinding `df` (instead of
# inplace=True) leaves the frame returned by read_csv untouched.
if 'codeNum' in df.columns:
    df = df.set_index('codeNum')
df

**Model Assumptions**

1. Regression residuals must be normally distributed.

2. A linear relationship is assumed between dependent and independent variables.

3. Residuals are homoskedastic (the variance of the error terms is constant)

4. Absence of multicollinearity

5. No autocorrelation of the residuals

In [None]:
# Inspect the dtype pandas inferred for each column when the CSV was read.
df.dtypes

# Setting Datatype Of Dataframe To Float

In [None]:
# Cast every column to 64-bit float so all variables share one numeric dtype.
df = df.astype("float64")
# Confirm the conversion.
df.dtypes

# Checking Any Null Value

In [None]:
# Per column: True if at least one value is missing.
df.isnull().any()

# Checking Multicollinearity In The Data

Multicollinearity means that variables behave so similarly that it is not possible to identify which variable has what impact on the model.

## Print Correlation Matrix Of Our Dataframe

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the pandas default).
corr = df.corr(method='pearson')
display(corr)

## Plotting heat map: Heat map gives the idea about correlation of variables in the dataset.

Diagonally we should have a dark colour which shows perfect correlation.

In [None]:
# Draw the correlation matrix as a heat map, labelling both axes with the column names.
sns.heatmap(
    data=corr,
    cmap='RdBu',
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

Looking at the heat map along with the correlation matrix, we identify that ash and protein are very highly correlated (corr = 1).

**To be more systematic, we will use variance_inflation_factor method defined by statsmodel API**

*If VIF>=5 for variables, then they should be removed from the model.*

In [None]:
# Variance inflation factors for every predictor (all columns except the response).
# NOTE(review): no constant column is added to X1 before computing the VIFs —
# confirm that this is intended.
df_before = df
X1 = df_before.drop(columns='viscosity')

# One VIF per predictor column, stored next to the column name.
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X1.values, col_idx)
        for col_idx in range(X1.shape[1])
    ],
    "features": X1.columns,
})

display(vif)

From output, we observe that moisture and protein have VIF infinity

So, we drop the protein column

In [None]:
# Recompute the variance inflation factors after removing `protein`.
df_after = df.drop(columns='protein')
X2 = df_after.drop(columns='viscosity')

# One VIF per remaining predictor column, stored next to the column name.
vif1 = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X2.values, col_idx)
        for col_idx in range(X2.shape[1])
    ],
    "features": X2.columns,
})

display(vif1)

In [None]:
# Pairwise scatter plots of the columns kept after dropping `protein`.
pd.plotting.scatter_matrix(frame=df_after, alpha=0.3)
plt.show()

# Describe The Original Dataset

In [None]:
# Summary statistics for every column of df (which still includes `protein`).
desc_df=df.describe()

**Add The Standard Deviation Metric**

In [None]:
# Append two rows marking three standard deviations above and below the mean.
desc_df.loc['+3std'] = desc_df.loc['mean'] + 3 * desc_df.loc['std']
desc_df.loc['-3std'] = desc_df.loc['mean'] - 3 * desc_df.loc['std']
desc_df

In [None]:
# Box plot of the `ash` column.
d1 = df.loc[:, 'ash']
plt.boxplot(x=d1)

In [None]:
# Box plot of the `moisture` column.
d2 = df.loc[:, 'moisture']
plt.boxplot(x=d2)

# Build The Model

In [None]:
# Predictors: every column except the response and the dropped `protein` column.
X = df.drop(columns=['viscosity', 'protein'])
# Response, kept as a one-column DataFrame.
Y = df.loc[:, ['viscosity']]
X

In [None]:
# Add a constant column to X so that the OLS fit below includes an intercept term.
lm = sm.add_constant(X)

In [None]:
# Fit ordinary least squares: viscosity regressed on the constant-augmented predictors.
result = sm.OLS(endog=Y, exog=lm).fit()
# Show the full regression report.
result.summary()

# Evaluating the model

**Checking for Heteroskedasticity**

statsmodels.stats.diagnostic.het_white(resid, exog, retres=False)





**White’s Lagrange Multiplier Test for Heteroscedasticity**

The null hypothesis for White's test is that the variances for the errors are equal.

resid: array_like

residuals, square of it is used as endogenous variable

exog: array_like

possible explanatory variables for variance, squares and
 interaction terms are included in the auxiliary regression.

Returns:

lm: float
lagrange multiplier statistic

lm_pvalue :float
p-value of lagrange multiplier test

fvalue: float
f-statistic of the hypothesis that the error variance does not depend on x. This is an alternative test variant not the original LM test.

f_pvalue:float
p-value for the f-statistic


In [None]:
# Run White's test for heteroskedasticity on the OLS residuals.
# het_white returns (lm, lm_pvalue, fvalue, f_pvalue); only the two p-values
# are reported. The statistics are bound to descriptive names rather than `_`,
# because assigning to `_` in IPython/Jupyter stops it from tracking the last
# cell output. `diag` is the statsmodels.stats.diagnostic alias imported in
# the first cell, so no second import is needed here.
lm_stat, pval, f_stat, f_pval = diag.het_white(result.resid, result.model.exog)
print(pval, f_pval)

**The output for p value shows that the null hypothesis is retained.**

**No heteroskedasticity.**

# Checking for autocorrelation

To test autocorrelation , we use statsmodels.stats.diagnostic module and use Ljung - Box test for no autocorrelation of residuals.

H0: The data are random.

H1: The data are not random.

We want the p value that will retain the null hypothesis

To use Ljung - Box test, we call acorr_ljungbox function, pass through the result.resid and define the lags.

A rule of thumb for calculating lags for non-seasonal time series is min(10, num_obs // 5).

We can also visually check autocorrelation by using the statsmodels.graphics module to plot the autocorrelation function (ACF).

https://www.statsmodels.org/stable/generated/statsmodels.stats.diagnostic.acorr_ljungbox.html

In [None]:
# Test the OLS residuals for autocorrelation with the Ljung-Box test.
# H0: the residuals are independently distributed (no autocorrelation).

alpha = 0.05  # significance level used for the decision below

# Rule-of-thumb lag count for a non-seasonal series: min(10, n_obs // 5).
# Clamp to at least 1 so that a very small sample (fewer than 5 rows) still
# tests one lag instead of producing an empty result.
lag = max(1, min(10, len(X) // 5))
print("The number of lags will be {}".format(lag))
print()

# run the Ljung-Box test for no autocorrelation of the residuals
test_results = diag.acorr_ljungbox(result.resid, lags=lag)
print(test_results)

# Grab the test statistics and the p-values.
# Recent statsmodels releases return a DataFrame with the columns `lb_stat`
# and `lb_pvalue`; tuple-unpacking a DataFrame yields its column *labels*,
# not the values, so the columns are read by name. Older releases return a
# (statistics, p-values) tuple, which is handled by the fallback branch.
if isinstance(test_results, pd.DataFrame):
    ibvalue = test_results["lb_stat"].to_numpy()
    p_val = test_results["lb_pvalue"].to_numpy()
else:
    ibvalue, p_val = test_results[0], test_results[1]

# print the results of the test
lowest_p = min(p_val)
print("The lowest p_value found was {:.4}".format(lowest_p))
if lowest_p > alpha:
    print("We fail to reject null hypothesis, there is no autocorrelation")
else:
    # At least one lag is significant, so H0 (no autocorrelation) is rejected.
    print("We reject null hypothesis, there is autocorrelation")
print()

# plot the autocorrelation function of the residuals
sm.graphics.tsa.plot_acf(result.resid)
plt.show()


### Checking for normally distributed residuals

This is done by plotting QQ plot.

We require that the data points lie very close to the line for the normality assumption to be satisfied.

### Checking the mean of residuals equal to zero

In [None]:
# Q-Q plot of the residuals against the normal distribution;
# line='s' adds the standardized reference line.
sm.qqplot(data=result.resid, line='s')
plt.show()

# The mean of the residuals should be approximately zero.
mean_residuals = sum(result.resid) / len(result.resid)
mean_residuals