In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Multiple Linear Regression

In [2]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
y = df['target']
x = df.loc[:, df.columns != 'target']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [5]:
model = LinearRegression().fit(X_train, y_train)

In [6]:
heart_pred = model.predict(X_test)

In [7]:
# The coefficients

print('Coefficients: \n', model.coef_)

# The mean squared error

print('Mean squared error: %.2f'
      % mean_squared_error(y_test, heart_pred))

# The coefficient of determination: 1 is perfect prediction

print('Coefficient of determination: %.2f'
      % r2_score(y_test, heart_pred))

Coefficients: 
 [-0.00114757 -0.21741362  0.12279564 -0.00109366 -0.0004579   0.0671081
  0.05910295  0.00319551 -0.17699364 -0.05892023  0.05511375 -0.09127311
 -0.14520748]
Mean squared error: 0.17
Coefficient of determination: 0.32


## ANOVA

In [8]:
A = [12.6, 12, 11.8, 11.9, 13, 12.5, 14]
B = [10, 10.2, 10, 12, 14, 13]
C = [10.1, 13, 13.4, 12.9, 8.9, 10.7, 13.6, 12]

In [9]:
all_scores = A + B + C
company_names = (['A'] * len(A)) +  (['B'] * len(B)) +  (['C'] * len(C))

In [10]:
data = pd.DataFrame({'company': company_names, 'score': all_scores})

In [11]:
data.head()

Unnamed: 0,company,score
0,A,12.6
1,A,12.0
2,A,11.8
3,A,11.9
4,A,13.0


In [12]:
data.groupby('company').mean()

Unnamed: 0_level_0,score
company,Unnamed: 1_level_1
A,12.542857
B,11.533333
C,11.825


In [13]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [14]:
lm = ols('score ~ company',data=data).fit()
table = sm.stats.anova_lm(lm)
print(table)
# p-value < 0.05

            df     sum_sq   mean_sq         F    PR(>F)
company    2.0   3.606905  1.803452  0.821297  0.455683
Residual  18.0  39.525476  2.195860       NaN       NaN


In [15]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In heart.csv example:

In [16]:
lm = ols('target ~ age + sex + cp + trestbps + chol + fbs',data=df).fit()
table = sm.stats.anova_lm(lm)
print(table)

             df     sum_sq    mean_sq          F        PR(>F)
age         1.0   3.819244   3.819244  22.255890  3.683448e-06
sex         1.0   6.972822   6.972822  40.632740  7.031728e-10
cp          1.0  12.188938  12.188938  71.028627  1.556923e-15
trestbps    1.0   1.126777   1.126777   6.566067  1.088890e-02
chol        1.0   0.239409   0.239409   1.395106  2.384929e-01
fbs         1.0   0.005953   0.005953   0.034689  8.523768e-01
Residual  296.0  50.795373   0.171606        NaN           NaN


In [17]:
lm = ols('target ~ chol',data=df).fit()
table = sm.stats.anova_lm(lm)
print(table)

             df     sum_sq   mean_sq         F   PR(>F)
chol        1.0   0.546007  0.546007  2.202983  0.13879
Residual  301.0  74.602508  0.247849       NaN      NaN


## Multicollinearity

In [18]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [19]:
X = df.assign(const=1)

In [20]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,const
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,1


In [21]:
# VIF threshold above 10

pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

age           1.443937
sex           1.231356
cp            1.397152
trestbps      1.180747
chol          1.152971
fbs           1.087698
restecg       1.066721
thalach       1.653567
exang         1.440147
oldpeak       1.744666
slope         1.662325
ca            1.290729
thal          1.191528
target        2.072754
const       212.998773
dtype: float64