In [1]:
import numpy as np
import pandas as pd
import matplotlib
# %matplotlib notebook
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# One shared estimator for the whole notebook: later cells call linmod.fit()
# on it again and again, so it always reflects the most recently executed fit.
linmod = LinearRegression()

In [2]:
# Tab-delimited text file, addressed relative to the notebook's directory.
datafile = '../../dataset/ch09/bodyfat.txt'
bodyfat = pd.read_csv(filepath_or_buffer=datafile, delimiter='\t')

In [3]:
# Peek at the first five rows to check that the file parsed as expected.
bodyfat.head(n=5)

Unnamed: 0,Density,Pct.BF,Age,Weight,Height,Neck,Chest,Abdomen,Waist,Hip,Thigh,Knee,Ankle,Bicep,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,33.543307,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,32.677165,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,34.606299,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,34.015748,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,184.25,71.25,34.4,97.3,100.0,39.370079,101.9,63.2,42.2,24.0,32.2,27.7,17.7


### Simple Regression: % Body Fat ~ Waist Size

In [None]:
# Response variable for the regressions below: the 'Pct.BF' column.
y = bodyfat.loc[:, 'Pct.BF']

In [None]:
# Single predictor for the first simple regression: the 'Waist' column.
x = bodyfat.loc[:, 'Waist']

In [None]:
# The estimator is fed a 2-D design matrix, so the single predictor becomes
# one column of shape (n, 1).
x = np.reshape(np.array(x), (-1, 1))
# fit() hands back the estimator itself, so fitting and predicting chain.
y_hat = linmod.fit(x, y).predict(x)
r = linmod.score(x, y)  # score() of the fit, reported below as R squared
print(f'R_squared: {r * 100:.1f}%')

In [None]:
# Observations as a scatter, with the fitted regression line drawn on top.
plt.figure()
plt.xlabel('Waist')
plt.ylabel('% body fat')
plt.scatter(x, y)
plt.plot(x, y_hat, color='r', lw=3);

In [None]:
# Residuals of the simple Waist regression, plotted against the fitted values.
residues = y - y_hat
# Open a dedicated figure, as the other plotting cells do; without it the
# scatter is drawn onto whatever figure is still current (the fit plot above)
# whenever the backend does not start a fresh figure per cell.
plt.figure()
plt.scatter(y_hat, residues, color='r');
plt.xlabel('Estimated %$BodyFat$')
plt.ylabel('Residues')
plt.title('Simple Regression: %BodyFat ~ Waist');

In [None]:
# Swap the predictor: the next simple regression uses the 'Height' column.
x = bodyfat.loc[:, 'Height']

In [None]:
# Same recipe as for Waist: one-column design matrix, fit, predict, score.
x = np.reshape(np.array(x), (-1, 1))
y_hat = linmod.fit(x, y).predict(x)  # fit() returns the estimator itself
r = linmod.score(x, y)
print(f'R_squared: {r * 100:.1f}%')

In [None]:
# Observations as a scatter, with the fitted regression line drawn on top.
plt.figure()
plt.xlabel('Height')
plt.ylabel('% body fat')
plt.scatter(x, y)
plt.plot(x, y_hat, color='r', lw=3);

In [None]:
# Residuals of the simple Height regression, plotted against the fitted values.
residues = y - y_hat
# Dedicated figure so the scatter is not drawn onto the fit plot above when
# the backend keeps the previous figure current.
plt.figure()
plt.scatter(y_hat, residues, color='r');
plt.xlabel('Estimated %$BodyFat$')
plt.ylabel('Residues')
# The model in this section is %BodyFat ~ Height; the title previously said
# "Waist", carried over from the copied Waist cell.
plt.title('Simple Regression: %BodyFat ~ Height');

In [None]:
# Distribution of the response variable.
# A dedicated figure keeps this histogram from being overlaid on an earlier
# plot when the backend does not start a new figure per cell.
plt.figure()
plt.hist(bodyfat['Pct.BF'])
plt.xlabel('% body fat')
plt.ylabel('Count')
plt.title("% body fat");

In [None]:
# Distribution of the Waist predictor.
# A dedicated figure keeps this histogram from being overlaid on an earlier
# plot when the backend does not start a new figure per cell.
plt.figure()
plt.hist(bodyfat['Waist'])
plt.xlabel('Waist')
plt.ylabel('Count')
plt.title("Waist");

In [None]:
# Two narrow Waist bands (each 0.5 wide, lower bound inclusive, upper bound
# exclusive) so that Height can be examined with Waist held roughly constant.
thin = bodyfat[bodyfat['Waist'].ge(32) & bodyfat['Waist'].lt(32.5)]
thick = bodyfat[bodyfat['Waist'].ge(36) & bodyfat['Waist'].lt(36.5)]

In [None]:
# % body fat against Height, restricted to the `thin` Waist band.
# NOTE: this rebinds x, y and y_hat to the subset, not the full data set.
y = thin['Pct.BF']
x = np.reshape(np.array(thin['Height']), (-1, 1))
y_hat = linmod.fit(x, y).predict(x)  # fit() returns the estimator itself

plt.figure()
plt.xlabel('Height')
plt.ylabel('% body fat')
plt.scatter(x, y)
plt.plot(x, y_hat, color='r', lw=3);

In [None]:
# % body fat against Height, restricted to the `thick` Waist band.
# NOTE: this rebinds x, y and y_hat to the subset, not the full data set.
y = thick['Pct.BF']
x = np.reshape(np.array(thick['Height']), (-1, 1))
y_hat = linmod.fit(x, y).predict(x)  # fit() returns the estimator itself

plt.figure()
plt.xlabel('Height')
plt.ylabel('% body fat')
plt.scatter(x, y)
plt.plot(x, y_hat, color='r', lw=3);

### Multiple Regression:  % Body Fat ~ (Waist Size, Height)

In [None]:
# Back to the full data set: the response again, and a two-column design
# matrix with Waist first and Height second.
y = bodyfat.loc[:, 'Pct.BF']
x = bodyfat.loc[:, ['Waist', 'Height']]

In [None]:
# Fit on both predictors at once; keep fitted values and residuals for the
# diagnostic plot that follows.
y_hat = linmod.fit(x, y).predict(x)  # fit() returns the estimator itself
r = linmod.score(x, y)
residues = y - y_hat
print(f'R_squared: {r * 100:.1f}%')

In [None]:
# coef_ follows the column order of the design matrix: Waist, then Height.
print(f'Waist coef: {linmod.coef_[0]:.4f}',
      f'Height coef: {linmod.coef_[1]:.4f}',
      sep='\n')

In [None]:
# Residuals of the two-predictor model against its fitted values.
plt.figure()
plt.title('Multiple Regression: %BodyFat ~ Waist, Height')
plt.xlabel('Estimated %$BodyFat$')
plt.ylabel('Residues')
plt.scatter(y_hat, residues, color='r');

In [None]:
# 3-D view of the two-predictor fit: observations as points, fitted plane as
# a surface. Relies on linmod still holding the (Waist, Height) fit and on y
# being the full 'Pct.BF' column.
x1 = np.array(bodyfat['Waist'])
x2 = np.array(bodyfat['Height'])

# Evaluate the plane on a small, regular, ordered grid spanning the observed
# range. Feeding the raw observation vectors to meshgrid yields an n-by-n mesh
# in arbitrary order, which is both expensive and not the ordered grid that
# plot_surface is designed to connect into a surface.
X1, X2 = np.meshgrid(np.linspace(x1.min(), x1.max(), 20),
                     np.linspace(x2.min(), x2.max(), 20))

x1_bar = x1.mean()
x2_bar = x2.mean()
y_bar = y.mean()
# A least-squares fit with an intercept passes through the point of means, so
# the plane can be written in centred form without using the intercept.
fitted = linmod.coef_[0] * (X1 - x1_bar) + linmod.coef_[1] * (X2 - x2_bar) + y_bar

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(x1, x2, np.array(y));
# Translucent surface so the observations on either side stay visible.
ax.plot_surface(X1, X2, fitted, color='r', alpha=0.5);
ax.set_xlabel('Waist')
ax.set_ylabel('Height')
ax.set_zlabel('% body fat');

In [None]:
# Partial-regression construction by hand: remove the linear effect of Waist
# from both the response and Height, then regress residuals on residuals.
waist_col = np.reshape(np.array(bodyfat['Waist']), (-1, 1))

# Each fit() returns the estimator, so the matching predict() can be chained.
fat_residues = y - linmod.fit(waist_col, y).predict(waist_col)
height_residues = bodyfat['Height'] - linmod.fit(waist_col, bodyfat['Height']).predict(waist_col)

# Final fit of the cell; linmod is left holding this residual regression.
height_res_col = np.reshape(np.array(height_residues), (-1, 1))
linmod.fit(height_res_col, fat_residues)

plt.scatter(height_residues, fat_residues)
plt.plot(height_residues, linmod.predict(height_res_col), 'r')
plt.xlabel('$Height$ residues after accounting for $Waist$')
plt.ylabel('%$BodyFat$ residues after accounting for $Waist$')
plt.title(f'Linear Coefficient: {linmod.coef_[0]:.4f}');


**Note**: 

The linear coefficient computed this way is the same as the *Height* coefficient obtained in the multiple regression above.


In [None]:
# Same partial-regression plot produced by statsmodels: the response against
# 'Height' with 'Waist' as the other regressor, without per-point labels.
sm.graphics.plot_partregress(
    endog=y,
    exog_i='Height',
    exog_others=['Waist'],
    data=bodyfat,
    obs_labels=False,
);

### Multiple Regression: % Body Fat ~ (Waist Size, ...)

In [None]:
# List the available columns before picking further predictors.
bodyfat.columns

In [None]:
# Two-predictor model with Weight in place of Height, plus its residual plot.
# Uses y as left by the preceding cells.
x = bodyfat.loc[:, ['Waist', 'Weight']]
y_hat = linmod.fit(x, y).predict(x)  # fit() returns the estimator itself
r = linmod.score(x, y)
residues = y - y_hat
print(f'R_squared: {r * 100:.1f}%')

plt.figure()
plt.title('Multiple Regression: %BodyFat ~ Waist, Weight')
plt.xlabel('y_hat')
plt.ylabel('residues')
plt.scatter(y_hat, residues, color='r');

In [None]:
# Residuals of the response (ey) and of Height (ex2) after each has been
# regressed on Waist alone.
x1 = np.reshape(np.array(bodyfat['Waist']), (-1, 1))
x2 = bodyfat['Height']
y = bodyfat['Pct.BF']
# fit() returns the estimator itself, so each predict() chains onto its fit.
ey = y - linmod.fit(x1, y).predict(x1)
ex2 = x2 - linmod.fit(x1, x2).predict(x1)

In [None]:
# Regress the Waist-adjusted response (ey) on the Waist-adjusted Height (ex2)
# and draw the fitted line over the residual scatter.
linmod.fit(np.array(ex2).reshape(-1,1), ey)
# Dedicated figure so the plot is not drawn onto an earlier one when the
# backend keeps the previous figure current.
plt.figure()
plt.scatter(ex2, ey)
plt.plot(ex2, linmod.predict(np.array(ex2).reshape(-1,1)), 'r-')
plt.xlabel('Height (linear component of waist removed)')
# Trailing semicolon, as in the other plot cells, stops the cell from echoing
# the Text(...) repr of the label.
plt.ylabel('% body fat (linear component of waist removed)');


In [None]:
# Score of the residual-on-residual regression fitted in the previous cell.
ex2_col = np.reshape(np.array(ex2), (-1, 1))
r = linmod.score(ex2_col, ey)
print(f'R_squared: {r * 100:.1f}%')