# Height vs Weight

Welcome to the live coding session for Core Workshop 4: Polynomial Regression.

Today we are working with a toy CSV dataset taken from Kaggle: [https://www.kaggle.com/sakshamjn/heightvsweight-for-linear-polynomial-regression](https://www.kaggle.com/sakshamjn/heightvsweight-for-linear-polynomial-regression), which records the **weight** and **height** of people (the values are made up). The data has already been cleaned.

---

#### In this session we use this data to build a `polynomial regression` model that predicts **height** from a given **weight**, using k-fold cross-validation to find the optimal hyperparameter (the polynomial degree)

# Explore

In [None]:
# Repeat previous processing
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the toy dataset from the local data folder
df = pd.read_csv("./data/HeightVsWeight.csv")

# The first column is the feature (weight) and the remaining column(s) the
# target (height); slicing instead of indexing keeps both as DataFrames
x, y = df.iloc[:, :1], df.iloc[:, 1:]

# Scatter plot of the raw data
plt.figure(figsize=(15, 6))
plt.scatter(x, y, color="purple")
plt.ylabel("Height")
plt.xlabel("Weight")
plt.title("Height VS Weight")


# Split using k-folds

In [None]:
# We are going to split the training data into 5 folds, which is a common choice for small datasets

from sklearn.model_selection import KFold, train_test_split

# Hold out 20% of the data as the final test set (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=1
)

# 5-fold splitter; it yields positional row indices rather than the rows themselves
kf5 = KFold(n_splits=5)

# Keep every [train indices, validation indices] pair so the folds can be reused later
t_v_indexes = [list(fold) for fold in kf5.split(x_train)]

# Show which rows of the training set land in each fold
for train_index, validation_index in t_v_indexes:
    print("Train: {} | Validate: {}".format(train_index, validation_index))


# Training (Hyperparameter optimization)

In [None]:
# For each candidate hyperparameter value we iterate through every fold and average the validation scores to find the best hyperparameter
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression


# Try polynomial degrees 1..n. 20 is plenty here, because the dataset
# doesn't seem to be very tortuous
n = 20

# Maps each degree to its average validation score across the folds
hyper_average = {}

# For each degree setting
for degree in range(1, n + 1):

    # Build the respective pipeline and reset the running total.
    # (The total is deliberately not called `sum`, which would shadow the
    # builtin for every cell executed afterwards.)
    score_total = 0
    pipeline = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())

    # For each fold's train index and validation index
    for train_index, validation_index in t_v_indexes:

        # The fold indices are positional, hence .iloc
        cur_train_fold_x = x_train.iloc[train_index]
        cur_train_fold_y = y_train.iloc[train_index]

        cur_validation_fold_x = x_train.iloc[validation_index]
        cur_validation_fold_y = y_train.iloc[validation_index]

        # fit() re-estimates the pipeline from scratch, so reusing the same
        # object across folds does not leak information between them
        pipeline.fit(cur_train_fold_x, cur_train_fold_y)

        # NOTE(review): for a regression pipeline, score() is documented by
        # scikit-learn as R^2 rather than a classification accuracy
        score_total += pipeline.score(cur_validation_fold_x, cur_validation_fold_y)

    # Store the average validation score for this degree
    hyper_average[degree] = score_total / len(t_v_indexes)

for degree, average in hyper_average.items():
    print("For degree {}, the average accuracy score is {}.".format(degree, round(average, 4)))

In [None]:
# Pick the degree (dictionary key) whose average validation score is highest
best_degree = max(hyper_average, key=lambda degree: hyper_average[degree])
print(
    "The best prediction degree is {}, with average accuracy of {}".format(
        best_degree, hyper_average[best_degree]
    )
)


In [None]:
# Just as the pipeline simplified model building, this whole process can be simplified with sklearn's cross_val_score function, which performs k-fold cross-validation when cv is an integer
from sklearn.model_selection import cross_val_score
import numpy as np

# Highest polynomial degree to try; degrees 1..n are evaluated
n = 20

# hyper_average[k] holds the mean cross-validation score for degree k + 1
hyper_average = []

for degree in range(1, n + 1):
    pipeline = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    # cv=5 requests 5 folds; the per-fold scores are averaged into one number
    hyper_average.append(np.mean(cross_val_score(pipeline, x_train, y_train, cv=5)))

# best_degree is the 0-based list position of the best score, NOT the degree
# itself (unlike the dictionary version, which is keyed by degree).
# The actual degree is best_degree + 1.
best_degree = hyper_average.index(max(hyper_average))
print("The best prediction degree is {}, with average accuracy of {}".format(best_degree+1, hyper_average[best_degree]))

# Plot score against degree. The x axis is derived from n so that the plot
# stays consistent with hyper_average if the number of degrees is changed.
plt.figure(figsize=(15, 6))
plt.plot(np.arange(1, n + 1), hyper_average, color="purple")
plt.xlabel("Model Complexity (degree)")
plt.ylabel("Accuracy Score")
plt.title("Accuracy score from different degrees")
plt.xlim(1, n)
# Zoom in on the top of the score range, where the curves differ
plt.ylim(0.990, 1)


## Conclusion: our best degree is 11

# Training (Parameter optimization)

In [None]:
import random

# Train the final model on the whole training set with the selected degree.
# best_degree is the 0-based index produced by the cross_val_score cell,
# so the actual polynomial degree is best_degree + 1.
model = make_pipeline(PolynomialFeatures(degree=best_degree+1), LinearRegression())
model.fit(x_train, y_train)

print("Model training completed")

# Draw a random integer weight between 10 and 80 (inclusive) to try the model on
rand = np.array([[random.randint(10, 80)]])

# The model was fitted on a DataFrame, so give the sample the same column
# name(s); predicting on a bare array makes scikit-learn warn that the input
# has no valid feature names.
sample = pd.DataFrame(rand, columns=x_train.columns)

# predict() returns a 2-D array here because the target was a DataFrame;
# [0][0] extracts the single predicted value.
print("Trying to predict a person with a weight of {} kg will have a height of {} cm".format(rand[0][0], round(model.predict(sample)[0][0], 2)))

In [None]:
# What does the fitted curve look like?

# Order the test set by weight so the curve is drawn from left to right,
# then realign the targets to the same row order
x_test = x_test.sort_values(by="Weight")
y_test = y_test.reindex(x_test.index)

# Predicted heights for the sorted test weights
test_predictions = model.predict(x_test)

# Full dataset as a scatter plot with the model's curve drawn on top
plt.figure(figsize=(15, 6))
plt.scatter(x, y, color="purple")
plt.plot(x_test, test_predictions, color="black", linewidth=5)
plt.ylabel("Height")
plt.xlabel("Weight")
plt.title("Height VS Weight")

# Score

In [None]:
# Evaluate the final model on the held-out test split
accuracy_score = model.score(x_test, y_test)
rounded_score = round(accuracy_score, 4)
print("Model Accuracy: {}".format(rounded_score))

# Author's note: the score is almost 100% every time, which is the best we
# can do while still avoiding overfitting