In [15]:
# Load dataset

import pandas as pd

# Read the Galton height CSV from the working directory into a DataFrame.
DATA_FILE = 'galton_height_data.csv'
df = pd.read_csv(DATA_FILE)

# Display data
df

Unnamed: 0.1,Unnamed: 0,height,father,mother,gender,kids
0,0,73.2,78.5,67.0,M,4
1,1,69.2,78.5,67.0,F,4
2,2,69.0,78.5,67.0,F,4
3,3,69.0,78.5,67.0,F,4
4,4,73.5,75.5,66.5,M,4
...,...,...,...,...,...,...
893,893,68.5,68.5,65.0,M,8
894,894,67.7,68.5,65.0,M,8
895,895,64.0,68.5,65.0,F,8
896,896,63.5,68.5,65.0,F,8


In [16]:
# Drop the first column
#
# The CSV carries a leftover index column that pandas labels "Unnamed: 0".
# It is selected by name rather than by position, and df is rebound instead
# of mutated in place, so the cell is idempotent: running it a second time
# no longer removes 'height' (which becomes the first column after the first
# run).

df = df.drop(columns=[col for col in df.columns if str(col).startswith('Unnamed')])

df

Unnamed: 0,height,father,mother,gender,kids
0,73.2,78.5,67.0,M,4
1,69.2,78.5,67.0,F,4
2,69.0,78.5,67.0,F,4
3,69.0,78.5,67.0,F,4
4,73.5,75.5,66.5,M,4
...,...,...,...,...,...
893,68.5,68.5,65.0,M,8
894,67.7,68.5,65.0,M,8
895,64.0,68.5,65.0,F,8
896,63.5,68.5,65.0,F,8


In [17]:
# Test 1: Assuming the number of children in the family does not affect the probability of gender

In [18]:
# Drop the 'kids' column
#
# Test 1 assumes family size carries no information about gender, so the
# column is removed. errors='ignore' together with rebinding (instead of
# inplace=True) makes the cell safe to re-run: a second execution is a no-op
# rather than a KeyError.

df = df.drop(columns=['kids'], errors='ignore')

df

Unnamed: 0,height,father,mother,gender
0,73.2,78.5,67.0,M
1,69.2,78.5,67.0,F
2,69.0,78.5,67.0,F
3,69.0,78.5,67.0,F
4,73.5,75.5,66.5,M
...,...,...,...,...
893,68.5,68.5,65.0,M
894,67.7,68.5,65.0,M
895,64.0,68.5,65.0,F
896,63.5,68.5,65.0,F


In [19]:
# # Label encode the 'gender' column

# g = df['gender'].value_counts()
# gender = g.index

# for i in range(len(gender)):
#     df['gender'].replace(gender[i], i, inplace = True)
    
# df

In [20]:
# Predictors: the child's height together with both parents' heights.
features = ['height', 'father', 'mother']
X = df[features]

# Target: the 'gender' label column.
y = df['gender']

In [21]:
# Display the target labels (the 'gender' column selected from df).
y

0      M
1      F
2      F
3      F
4      M
      ..
893    M
894    M
895    F
896    F
897    F
Name: gender, Length: 898, dtype: object

In [33]:
# Count the class samples

from collections import Counter

# Tally how many rows carry each gender label to check the class balance.
class_counts = Counter(y)
class_counts

Counter({'M': 465, 'F': 433})

In [22]:
# Display the feature matrix (the 'height', 'father' and 'mother' columns).
X

Unnamed: 0,height,father,mother
0,73.2,78.5,67.0
1,69.2,78.5,67.0
2,69.0,78.5,67.0
3,69.0,78.5,67.0
4,73.5,75.5,66.5
...,...,...,...
893,68.5,68.5,65.0
894,67.7,68.5,65.0
895,64.0,68.5,65.0
896,63.5,68.5,65.0


In [23]:
# # Normalise and Standardise Features

# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler

# X = StandardScaler().fit_transform(X)
# X = MinMaxScaler().fit_transform(X)

In [24]:
# Splitting the dataset into separate train and test sets (70-30)
#
# test_size=0.3 holds out 30% of the rows for testing; random_state pins the
# shuffle so the same split is produced on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [25]:
# Sanity check: (rows, columns) of the training features.
X_train.shape

(628, 3)

In [26]:
# Sanity check: (rows, columns) of the held-out test features.
X_test.shape

(270, 3)

In [27]:
# Sanity check: number of training labels (should match the X_train row count).
y_train.shape

(628,)

In [28]:
# Sanity check: number of test labels (should match the X_test row count).
y_test.shape

(270,)

In [29]:
# Use multiple classification algorithms
#
# Fit seven scikit-learn classifiers on the training split and report each
# one's accuracy on the held-out test split. Every model keeps its own
# module-level name (log_model, lda_model, ...) and is left fitted on
# X_train / y_train after this cell.

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
# The tree and the forest draw random numbers while fitting; seeding them
# makes the reported accuracies repeatable from run to run.
cart_model = DecisionTreeClassifier(random_state=1)
gnb_model = GaussianNB()
svm_model = SVC(gamma='auto')
rfc_model = RandomForestClassifier(random_state=1)

# (printed label, estimator) pairs, in reporting order.
labelled_models = [
    ("Logistic Regression", log_model),
    ("Linear Discriminant Analysis", lda_model),
    ("K Nearest Neighbor", knn_model),
    ("Classification & Regression Tree", cart_model),
    ("Gaussian Naive Bayes", gnb_model),
    ("Support-Vector Machine", svm_model),
    ("Random Forest Classifier", rfc_model),
]

print("Accuracy Scores (Train-Test Split):")

for label, model in labelled_models:
    model.fit(X_train, y_train)
    # score() on a classifier is mean accuracy on the given data.
    print(label + ":", model.score(X_test, y_test).round(3))

Accuracy Scores (Train-Test Split):
Logistic Regression: 0.896
Linear Discriminant Analysis: 0.889
K Nearest Neighbor: 0.863
Classification & Regression Tree: 0.874
Gaussian Naive Bayes: 0.874
Support-Vector Machine: 0.885
Random Forest Classifier: 0.907


In [30]:
# Accuracy Score for 10-fold Cross Validation
#
# Each classifier is evaluated with 10-fold cross-validation on the full
# X / y data. cross_val_score fits a fresh clone of the estimator on every
# fold, so no separate fit on X_train is needed beforehand; the models fitted
# in the train/test cell are left untouched by this cell.


def report_cv_accuracy(label, estimator, features, target, folds=10):
    """Cross-validate ``estimator`` and print its mean score and spread.

    Parameters
    ----------
    label : str
        Name printed at the start of the result line.
    estimator : scikit-learn classifier
        Unfitted estimator; it is cloned and fitted once per fold.
    features, target
        Feature matrix and label vector that are split into folds.
    folds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray
        The per-fold scores (the estimator's default scorer, i.e. accuracy
        for these classifiers).
    """
    fold_scores = cross_val_score(estimator, features, target, cv=folds)
    print("%s: %0.2f accuracy with a standard deviation of %0.2f"
          % (label, fold_scores.mean(), fold_scores.std()))
    return fold_scores


print("Accuracy Scores (10-fold Cross Validation):")

log_scores = report_cv_accuracy("Logistic Regression", LogisticRegression(), X, y)
lda_scores = report_cv_accuracy("Linear Discriminant Analysis", LinearDiscriminantAnalysis(), X, y)
knn_scores = report_cv_accuracy("K Nearest Neighbor", KNeighborsClassifier(), X, y)
# random_state pins the tree's and the forest's internal randomness so the
# cross-validated scores are repeatable.
cart_scores = report_cv_accuracy("Classification & Regression Tree", DecisionTreeClassifier(random_state=1), X, y)
gnb_scores = report_cv_accuracy("Gaussian Naive Bayes", GaussianNB(), X, y)
svm_scores = report_cv_accuracy("Support-Vector Machine", SVC(gamma='auto'), X, y)
rfc_scores = report_cv_accuracy("Random Forest Classifier", RandomForestClassifier(random_state=1), X, y)

Accuracy Scores (10-fold Cross Validation):
Logistic Regression: 0.88 accuracy with a standard deviation of 0.03
Linear Discriminant Analysis: 0.88 accuracy with a standard deviation of 0.03
K Nearest Neighbor: 0.85 accuracy with a standard deviation of 0.06
Classification & Regression Tree: 0.78 accuracy with a standard deviation of 0.08
Gaussian Naive Bayes: 0.85 accuracy with a standard deviation of 0.04
Support-Vector Machine: 0.85 accuracy with a standard deviation of 0.02
Random Forest Classifier: 0.80 accuracy with a standard deviation of 0.09
