# Influential Observation Analysis

In [1]:
#=============================================================================================
# CODE NAME     : Multiple Linear Regression Model.py
# PURPOSE       : Demonstrate application of influential analysis using Pandas and statsmodels 
# APPLICATION   : Analyzing Fitness dataset
#==============================================================================================

In [1]:
# Load Input data
import numpy as np
import pandas as pd

fitness = pd.read_excel("C:\\Users\\Training\\Data Science using SAS and Python\\Data\\fitness.xlsx")
fitness.head()

Unnamed: 0,Name,Gender,RunTime,Age,Weight,Oxygen_Consumption,Run_Pulse,Rest_Pulse,Maximum_Pulse,Performance
0,Donna,F,8.17,42,68.15,59.57,166,40,172,90
1,Gracie,F,8.63,38,81.87,60.06,170,48,186,94
2,Luanne,F,8.65,43,85.84,54.3,156,45,168,83
3,Mimi,F,8.92,50,70.87,54.63,146,48,155,67
4,Chris,M,8.95,49,81.42,49.16,180,44,185,72


In [3]:
# Fit a model selected based on Cp statistics for prediction
from statsmodels.formula.api import ols
model = ols('Oxygen_Consumption ~ RunTime + Age + Run_Pulse + Maximum_Pulse', data = fitness).fit()
model.summary() 

0,1,2,3
Dep. Variable:,Oxygen_Consumption,R-squared:,0.835
Model:,OLS,Adj. R-squared:,0.81
Method:,Least Squares,F-statistic:,33.01
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,7.68e-10
Time:,09:02:33,Log-Likelihood:,-67.367
No. Observations:,31,AIC:,144.7
Df Residuals:,26,BIC:,151.9
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,97.1695,11.657,8.336,0.000,73.208,121.131
RunTime,-2.7758,0.342,-8.126,0.000,-3.478,-2.074
Age,-0.1890,0.094,-2.003,0.056,-0.383,0.005
Run_Pulse,-0.3457,0.118,-2.924,0.007,-0.589,-0.103
Maximum_Pulse,0.2719,0.134,2.023,0.053,-0.004,0.548

0,1,2,3
Omnibus:,0.144,Durbin-Watson:,1.862
Prob(Omnibus):,0.931,Jarque-Bera (JB):,0.036
Skew:,-0.061,Prob(JB):,0.982
Kurtosis:,2.885,Cond. No.,6940.0


In [4]:
# Get the Influential Summary 
infl = model.get_influence()

# Get the Name from original Dataframe
name = pd.DataFrame(fitness['Name'])
name['Observation'] = fitness.index.values

# Prepare final datafrae containing all influential data points
influential_summary = infl.summary_frame()
influential_summary['Observation'] = fitness.index.values
influential_summary = name.merge(influential_summary)
influential_summary

Unnamed: 0,Name,Observation,dfb_Intercept,dfb_RunTime,dfb_Age,dfb_Run_Pulse,dfb_Maximum_Pulse,cooks_d,standard_resid,hat_diag,dffits_internal,student_resid,dffits
0,Donna,0,0.322413,-0.489736,-0.265813,0.04287,-0.064523,0.1054573,1.703101,0.153825,0.726145,1.771779,0.755427
1,Gracie,1,-0.250104,-0.227766,-0.181414,-0.961656,1.026926,0.3305135,1.33157,0.48241,1.285522,1.352648,1.305872
2,Luanne,2,-0.212727,0.128018,0.171111,0.408361,-0.301691,0.07998722,-1.206741,0.215464,-0.632405,-1.217904,-0.638256
3,Mimi,3,-0.000125,3.5e-05,8e-06,7.5e-05,-1.1e-05,9.767868e-09,-0.000417,0.219284,-0.000221,-0.000409,-0.000217
4,Chris,4,0.317015,0.358642,-0.279825,0.018451,-0.179223,0.05347098,-0.967175,0.22228,-0.517064,-0.965928,-0.516397
5,Allen,5,-0.171269,0.165799,0.288337,-0.1994,0.172178,0.04213675,-0.928329,0.196446,-0.459003,-0.925774,-0.45774
6,Nancy,6,-0.006636,-0.006751,0.005314,0.002407,0.001335,2.306513e-05,0.018675,0.248493,0.010739,0.018313,0.010531
7,Patty,7,-0.122229,0.286101,-0.203561,-0.243365,0.259185,0.06029092,-1.640167,0.100767,-0.549049,-1.698589,-0.568606
8,Suzanne,8,0.041004,-0.013062,0.085479,-0.053469,0.012294,0.007725017,0.361487,0.228148,0.196533,0.355362,0.193202
9,Teresa,9,-0.01095,0.077198,-0.136848,0.103116,-0.072809,0.02090047,-1.236747,0.063953,-0.323268,-1.250057,-0.326747


# Analyse only those data points which exceeds cut-offs

In [5]:
import math
n = 31 # Sample Size
p = 5  # Numper of Parameter in model including intercept

condition = [(influential_summary["student_resid"] > 3) | 
             (influential_summary["dffits"] > 2*math.sqrt(p/n)) | 
             (influential_summary["cooks_d"] > 4/n) |
             (influential_summary["dfb_RunTime"] > 2*math.sqrt(1/n)) |
             (influential_summary["dfb_Age"] > 2*math.sqrt(1/n)) |
             (influential_summary["dfb_Run_Pulse"] > 2*math.sqrt(1/n)) |
             (influential_summary["dfb_Maximum_Pulse"] > 2*math.sqrt(1/n))]
choice = ["Yes"]

influential_summary["Flag"] = np.select(condition,choice,default = "No")
influential_summary[influential_summary.Flag == "Yes"]

Unnamed: 0,Name,Observation,dfb_Intercept,dfb_RunTime,dfb_Age,dfb_Run_Pulse,dfb_Maximum_Pulse,cooks_d,standard_resid,hat_diag,dffits_internal,student_resid,dffits,Flag
1,Gracie,1,-0.250104,-0.227766,-0.181414,-0.961656,1.026926,0.330514,1.33157,0.48241,1.285522,1.352648,1.305872,Yes
2,Luanne,2,-0.212727,0.128018,0.171111,0.408361,-0.301691,0.079987,-1.206741,0.215464,-0.632405,-1.217904,-0.638256,Yes
14,Sammy,14,-0.223787,-0.204317,0.540121,-0.02178,0.065701,0.088562,2.132058,0.088767,0.66544,2.301503,0.718326,Yes


In [23]:
print(2*math.sqrt(1/n))

0.3592106040535498


In [24]:
print(-0.489736 > 0.3592106040535498)

False
