In [184]:
# Load dataset

import pandas as pd

df = pd.read_csv(filepath_or_buffer='galton_height_data.csv')

# Show the frame exactly as it was read from disk
df

Unnamed: 0.1,Unnamed: 0,height,father,mother,gender,kids
0,0,73.2,78.5,67.0,M,4
1,1,69.2,78.5,67.0,F,4
2,2,69.0,78.5,67.0,F,4
3,3,69.0,78.5,67.0,F,4
4,4,73.5,75.5,66.5,M,4
...,...,...,...,...,...,...
893,893,68.5,68.5,65.0,M,8
894,894,67.7,68.5,65.0,M,8
895,895,64.0,68.5,65.0,F,8
896,896,63.5,68.5,65.0,F,8


In [185]:
# Drop the first column, but only while it is still the unnamed column that
# came in from the CSV (pandas labels header-less columns 'Unnamed: N').
# A positional, in-place drop would remove 'height' if this cell were
# executed a second time.
first_col = df.columns[0]
if str(first_col).startswith('Unnamed'):
    df = df.drop(columns=first_col)

df

Unnamed: 0,height,father,mother,gender,kids
0,73.2,78.5,67.0,M,4
1,69.2,78.5,67.0,F,4
2,69.0,78.5,67.0,F,4
3,69.0,78.5,67.0,F,4
4,73.5,75.5,66.5,M,4
...,...,...,...,...,...
893,68.5,68.5,65.0,M,8
894,67.7,68.5,65.0,M,8
895,64.0,68.5,65.0,F,8
896,63.5,68.5,65.0,F,8


In [186]:
# Test 1: Assuming the number of children in the family does not affect the height of the child

In [187]:
# Drop the 'kids' column.
# Rebinding `df` (instead of inplace=True) and ignoring an already-missing
# column keeps this cell safe to re-run without a KeyError.
df = df.drop(columns=['kids'], errors='ignore')

df

Unnamed: 0,height,father,mother,gender
0,73.2,78.5,67.0,M
1,69.2,78.5,67.0,F
2,69.0,78.5,67.0,F
3,69.0,78.5,67.0,F
4,73.5,75.5,66.5,M
...,...,...,...,...
893,68.5,68.5,65.0,M
894,67.7,68.5,65.0,M
895,64.0,68.5,65.0,F
896,63.5,68.5,65.0,F


In [188]:
# Label encode the 'gender' column
#
# Codes follow descending frequency: value_counts() sorts by count, so the
# most common label becomes 0, the next one 1, and so on.
g = df['gender'].value_counts()
gender = g.index
gender_codes = {label: code for code, label in enumerate(gender)}

# Assign the mapped column back to the frame. Calling
# df['gender'].replace(..., inplace=True) mutates a temporary Series, which
# pandas warns about and which leaves `df` unchanged under Copy-on-Write.
df['gender'] = df['gender'].map(gender_codes)

df

Unnamed: 0,height,father,mother,gender
0,73.2,78.5,67.0,0
1,69.2,78.5,67.0,1
2,69.0,78.5,67.0,1
3,69.0,78.5,67.0,1
4,73.5,75.5,66.5,0
...,...,...,...,...
893,68.5,68.5,65.0,0
894,67.7,68.5,65.0,0
895,64.0,68.5,65.0,1
896,63.5,68.5,65.0,1


In [189]:
# Predictors: both parents' heights plus the encoded gender
features = ['father', 'mother', 'gender']
X = df[features]

# Target: the child's height
y = df['height']

In [190]:
# Preview the target column ('height') before modelling
y

0      73.2
1      69.2
2      69.0
3      69.0
4      73.5
       ... 
893    68.5
894    67.7
895    64.0
896    63.5
897    63.0
Name: height, Length: 898, dtype: float64

In [191]:
# Preview the selected feature columns (still unscaled at this point)
X

Unnamed: 0,father,mother,gender
0,78.5,67.0,0
1,78.5,67.0,1
2,78.5,67.0,1
3,78.5,67.0,1
4,75.5,66.5,0
...,...,...,...
893,68.5,65.0,0
894,68.5,65.0,0
895,68.5,65.0,1
896,68.5,65.0,1


In [192]:
# Normalise features to the [0, 1] range

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling alone is sufficient: its result is unaffected by a prior
# per-column shift and rescale, so running StandardScaler first yielded the
# same values while suggesting the features were standardised.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set minima/maxima influence the training features — consider fitting
# on the training rows only.
X = MinMaxScaler().fit_transform(X)

In [193]:
# Splitting the dataset into separate train and test sets (70-30)

from sklearn.model_selection import train_test_split


# test_size=0.3 holds out 30% of the rows; random_state=1 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [194]:
# Sanity check: training features should have 3 columns (father, mother, gender)
X_train.shape

(628, 3)

In [195]:
# Sanity check: held-out features should have the same 3 columns
X_test.shape

(270, 3)

In [196]:
# Sanity check: one target value per training row
y_train.shape

(628,)

In [197]:
# Sanity check: one target value per held-out row
y_test.shape

(270,)

In [198]:
# Use multiple algorithms to train models

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Each estimator is fitted on the training rows and scored with .score(),
# which for scikit-learn regressors is the coefficient of determination (R^2)
# on the held-out rows — not a classification accuracy.
print("Accuracy Scores (Train-Test Split):")

# Seed for the estimators that draw random numbers while fitting, so the
# reported scores are the same on every run.
MODEL_SEED = 1

lin_model = LinearRegression()
lasso_model = Lasso()
elastic_model = ElasticNet()
dec_model = DecisionTreeRegressor(random_state=MODEL_SEED)
knr_model = KNeighborsRegressor()
gbr_model = GradientBoostingRegressor(random_state=MODEL_SEED)
rfr_model = RandomForestRegressor(random_state=MODEL_SEED)

# (printed label, estimator) pairs, in reporting order
split_models = [
    ("Linear Regression", lin_model),
    ("Lasso", lasso_model),
    ("ElasticNet", elastic_model),
    ("Decision Tree Regressor", dec_model),
    ("K-Neighbors Regressor", knr_model),
    ("Gradient Boosting Regressor", gbr_model),
    ("Random Forest Regressor", rfr_model),
]

for label, model in split_models:
    model.fit(X_train, y_train)
    print(f"{label}:", model.score(X_test, y_test).round(3))

Accuracy Scores (Train-Test Split):
Linear Regression: 0.651
Lasso: 0.185
ElasticNet: 0.178
Decision Tree Regressor: 0.589
K-Neighbors Regressor: 0.6
Gradient Boosting Regressor: 0.661
Random Forest Regressor: 0.62


In [199]:
# Score each model with 10-fold cross validation.
# cross_val_score uses the estimator's own .score() by default, i.e. R^2 for
# these regressors, even though the printed label says "accuracy".
# NOTE(review): an integer cv on a regressor means consecutive, unshuffled
# folds; the displayed frame looks grouped by family, so folds may not be
# representative — consider a shuffled KFold.

print("Accuracy Scores (10-fold Cross Validation):")

# Seed for the estimators that draw random numbers while fitting, so the
# reported scores are the same on every run.
MODEL_SEED = 1

lin_model = LinearRegression()
lasso_model = Lasso()
elastic_model = ElasticNet()
dec_model = DecisionTreeRegressor(random_state=MODEL_SEED)
knr_model = KNeighborsRegressor()
gbr_model = GradientBoostingRegressor(random_state=MODEL_SEED)
rfr_model = RandomForestRegressor(random_state=MODEL_SEED)

# (printed label, estimator) pairs, in reporting order
cv_models = [
    ("Linear Regression (CV)", lin_model),
    ("Lasso", lasso_model),
    ("ElasticNet (CV)", elastic_model),
    ("Decision Tree Regressor (CV)", dec_model),
    ("K-Neighbors Regressor (CV)", knr_model),
    ("Gradient Boosting Regressor", gbr_model),
    ("Random Forest Regressor (CV)", rfr_model),
]

cv_scores = []
for label, model in cv_models:
    # cross_val_score fits clones and leaves `model` itself unfitted, so this
    # explicit fit is what keeps the named estimators usable afterwards
    # (lin_model is used for prediction further down).
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X, y, cv=10)
    cv_scores.append(scores)
    print("%s: %0.2f accuracy with a standard deviation of %0.2f" % (label, scores.mean(), scores.std()))

# Keep the per-model score arrays available under their original names
(lin_scores, lasso_scores, elastic_scores, dec_scores,
 knr_scores, gbr_scores, rfr_scores) = cv_scores

Accuracy Scores (10-fold Cross Validation):
Linear Regression (CV): 0.59 accuracy with a standard deviation of 0.05
Lasso: 0.08 accuracy with a standard deviation of 0.14
ElasticNet (CV): 0.07 accuracy with a standard deviation of 0.14
Decision Tree Regressor (CV): 0.41 accuracy with a standard deviation of 0.09
K-Neighbors Regressor (CV): 0.50 accuracy with a standard deviation of 0.07
Gradient Boosting Regressor: 0.58 accuracy with a standard deviation of 0.04
Random Forest Regressor (CV): 0.52 accuracy with a standard deviation of 0.05


In [200]:
# Using Linear Regressor-trained model to predict height

lin_pred = lin_model.predict(X_test)

# Round with pandas rather than np.round: no numpy import is visible in this
# notebook, so the np.* call raises NameError on a freshly started kernel.
ser_lin_pred = pd.Series(lin_pred).round(1).rename("Predicted")

# y_test keeps the row labels it had in `df`; reset them so the subtraction
# below aligns row-by-row with the 0..n-1 indexed predictions.
ser_y_test = pd.Series(y_test).reset_index(drop=True).rename("Actual")

difference = (ser_y_test - ser_lin_pred).rename('Actual - Prediction')

# All three Series share the same 0..n-1 index, so concat lines them up as columns
prediction = pd.concat([ser_y_test, ser_lin_pred, difference], axis=1)
prediction

Unnamed: 0,Actual,Predicted,Actual - Prediction
0,65.5,65.5,0.0
1,71.5,69.0,2.5
2,64.5,65.4,-0.9
3,63.5,62.5,1.0
4,67.0,68.9,-1.9
...,...,...,...
265,67.7,70.4,-2.7
266,67.0,68.4,-1.4
267,65.0,61.9,3.1
268,70.0,68.3,1.7


In [201]:
# Test 2: Taking into consideration the number of children in the family

In [202]:
# Reload the CSV and discard its first column in one chained expression,
# so Test 2 starts again from a frame that still contains 'kids'
df = (
    pd.read_csv('galton_height_data.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

df

Unnamed: 0,height,father,mother,gender,kids
0,73.2,78.5,67.0,M,4
1,69.2,78.5,67.0,F,4
2,69.0,78.5,67.0,F,4
3,69.0,78.5,67.0,F,4
4,73.5,75.5,66.5,M,4
...,...,...,...,...,...
893,68.5,68.5,65.0,M,8
894,67.7,68.5,65.0,M,8
895,64.0,68.5,65.0,F,8
896,63.5,68.5,65.0,F,8


In [203]:
# Label encode the 'gender' column
#
# Codes follow descending frequency: value_counts() sorts by count, so the
# most common label becomes 0, the next one 1, and so on.
g = df['gender'].value_counts()
gender = g.index
gender_codes = {label: code for code, label in enumerate(gender)}

# Assign the mapped column back to the frame. Calling
# df['gender'].replace(..., inplace=True) mutates a temporary Series, which
# pandas warns about and which leaves `df` unchanged under Copy-on-Write.
df['gender'] = df['gender'].map(gender_codes)

df

Unnamed: 0,height,father,mother,gender,kids
0,73.2,78.5,67.0,0,4
1,69.2,78.5,67.0,1,4
2,69.0,78.5,67.0,1,4
3,69.0,78.5,67.0,1,4
4,73.5,75.5,66.5,0,4
...,...,...,...,...,...
893,68.5,68.5,65.0,0,8
894,67.7,68.5,65.0,0,8
895,64.0,68.5,65.0,1,8
896,63.5,68.5,65.0,1,8


In [204]:
# Predictors: parents' heights, encoded gender, and the number of children
features = ['father', 'mother', 'gender', 'kids']
X = df[features]

# Target: the child's height
y = df['height']

In [205]:
# Preview the target column ('height') before modelling
y

0      73.2
1      69.2
2      69.0
3      69.0
4      73.5
       ... 
893    68.5
894    67.7
895    64.0
896    63.5
897    63.0
Name: height, Length: 898, dtype: float64

In [206]:
# Preview the selected feature columns, now including 'kids' (still unscaled)
X

Unnamed: 0,father,mother,gender,kids
0,78.5,67.0,0,4
1,78.5,67.0,1,4
2,78.5,67.0,1,4
3,78.5,67.0,1,4
4,75.5,66.5,0,4
...,...,...,...,...
893,68.5,65.0,0,8
894,68.5,65.0,0,8
895,68.5,65.0,1,8
896,68.5,65.0,1,8


In [207]:
# Normalise features to the [0, 1] range

# Min-max scaling alone is sufficient: its result is unaffected by a prior
# per-column shift and rescale, so running StandardScaler first yielded the
# same values while suggesting the features were standardised.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set minima/maxima influence the training features — consider fitting
# on the training rows only.
X = MinMaxScaler().fit_transform(X)

In [208]:
# Hold out 30% of the rows for testing; random_state=1 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [209]:
# Sanity check: training features should now have 4 columns ('kids' included)
X_train.shape

(628, 4)

In [210]:
# Sanity check: held-out features should have the same 4 columns
X_test.shape

(270, 4)

In [211]:
# Sanity check: one target value per training row
y_train.shape

(628,)

In [212]:
# Sanity check: one target value per held-out row
y_test.shape

(270,)

In [213]:
# Use multiple algorithms to train models
#
# Each estimator is fitted on the training rows and scored with .score(),
# which for scikit-learn regressors is the coefficient of determination (R^2)
# on the held-out rows — not a classification accuracy.

print("Accuracy Scores (Train-Test Split):")

# Seed for the estimators that draw random numbers while fitting, so the
# reported scores are the same on every run.
MODEL_SEED = 1

lin_model = LinearRegression()
lasso_model = Lasso()
elastic_model = ElasticNet()
dec_model = DecisionTreeRegressor(random_state=MODEL_SEED)
knr_model = KNeighborsRegressor()
gbr_model = GradientBoostingRegressor(random_state=MODEL_SEED)
rfr_model = RandomForestRegressor(random_state=MODEL_SEED)

# (printed label, estimator) pairs, in reporting order
split_models = [
    ("Linear Regression", lin_model),
    ("Lasso", lasso_model),
    ("ElasticNet", elastic_model),
    ("Decision Tree Regressor", dec_model),
    ("K-Neighbors Regressor", knr_model),
    ("Gradient Boosting Regressor", gbr_model),
    ("Random Forest Regressor", rfr_model),
]

for label, model in split_models:
    model.fit(X_train, y_train)
    print(f"{label}:", model.score(X_test, y_test).round(3))

Accuracy Scores (Train-Test Split):
Linear Regression: 0.652
Lasso: 0.185
ElasticNet: 0.178
Decision Tree Regressor: 0.618
K-Neighbors Regressor: 0.638
Gradient Boosting Regressor: 0.669
Random Forest Regressor: 0.649


In [214]:
# Score each model with 10-fold cross validation.
# cross_val_score uses the estimator's own .score() by default, i.e. R^2 for
# these regressors, even though the printed label says "accuracy".
# NOTE(review): an integer cv on a regressor means consecutive, unshuffled
# folds; the displayed frame looks grouped by family, so folds may not be
# representative — consider a shuffled KFold.

print("Accuracy Scores (10-fold Cross Validation):")

# Seed for the estimators that draw random numbers while fitting, so the
# reported scores are the same on every run.
MODEL_SEED = 1

lin_model = LinearRegression()
lasso_model = Lasso()
elastic_model = ElasticNet()
dec_model = DecisionTreeRegressor(random_state=MODEL_SEED)
knr_model = KNeighborsRegressor()
gbr_model = GradientBoostingRegressor(random_state=MODEL_SEED)
rfr_model = RandomForestRegressor(random_state=MODEL_SEED)

# (printed label, estimator) pairs, in reporting order
cv_models = [
    ("Linear Regression (CV)", lin_model),
    ("Lasso", lasso_model),
    ("ElasticNet (CV)", elastic_model),
    ("Decision Tree Regressor (CV)", dec_model),
    ("K-Neighbors Regressor (CV)", knr_model),
    ("Gradient Boosting Regressor", gbr_model),
    ("Random Forest Regressor (CV)", rfr_model),
]

cv_scores = []
for label, model in cv_models:
    # cross_val_score fits clones and leaves `model` itself unfitted, so this
    # explicit fit is what keeps the named estimators usable afterwards
    # (lin_model is used for prediction further down).
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X, y, cv=10)
    cv_scores.append(scores)
    print("%s: %0.2f accuracy with a standard deviation of %0.2f" % (label, scores.mean(), scores.std()))

# Keep the per-model score arrays available under their original names
(lin_scores, lasso_scores, elastic_scores, dec_scores,
 knr_scores, gbr_scores, rfr_scores) = cv_scores

Accuracy Scores (10-fold Cross Validation):
Linear Regression (CV): 0.59 accuracy with a standard deviation of 0.05
Lasso: 0.08 accuracy with a standard deviation of 0.14
ElasticNet (CV): 0.07 accuracy with a standard deviation of 0.14
Decision Tree Regressor (CV): 0.36 accuracy with a standard deviation of 0.13
K-Neighbors Regressor (CV): 0.46 accuracy with a standard deviation of 0.10
Gradient Boosting Regressor: 0.57 accuracy with a standard deviation of 0.05
Random Forest Regressor (CV): 0.50 accuracy with a standard deviation of 0.08


In [215]:
# Using Linear Regressor-trained model to predict height

lin_pred = lin_model.predict(X_test)

# Round with pandas rather than np.round: no numpy import is visible in this
# notebook, so the np.* call raises NameError on a freshly started kernel.
ser_lin_pred = pd.Series(lin_pred).round(1).rename("Predicted")

# y_test keeps the row labels it had in `df`; reset them so the subtraction
# below aligns row-by-row with the 0..n-1 indexed predictions.
ser_y_test = pd.Series(y_test).reset_index(drop=True).rename("Actual")

difference = (ser_y_test - ser_lin_pred).rename('Actual - Prediction')

# All three Series share the same 0..n-1 index, so concat lines them up as columns
prediction = pd.concat([ser_y_test, ser_lin_pred, difference], axis=1)
prediction

Unnamed: 0,Actual,Predicted,Actual - Prediction
0,65.5,65.4,0.1
1,71.5,69.1,2.4
2,64.5,65.4,-0.9
3,63.5,62.5,1.0
4,67.0,68.8,-1.8
...,...,...,...
265,67.7,70.6,-2.9
266,67.0,68.4,-1.4
267,65.0,61.8,3.2
268,70.0,68.4,1.6
