# Supervised Machine Learning - Regression
Sumudu Tennakoon, PhD

To learn more about Python, refer to the following website:

* Python : www.python.org

To learn more about the Python packages we explore in this notebook, refer to the following websites:

* NumPy : www.numpy.org
* Matplotlib : www.matplotlib.org
* Pandas : https://pandas.pydata.org
* Scikit-Learn : https://scikit-learn.org/
* Seaborn: https://seaborn.pydata.org/
* Statsmodels : https://www.statsmodels.org

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Polynomial Regression

## Generate Dataset

In [None]:
# Seed NumPy's global random generator so that "Restart Kernel -> Run All"
# reproduces the same synthetic dataset (including any noise drawn afterwards
# from the same global generator).
np.random.seed(42)

# Input feature: 25 samples drawn from a normal distribution (mean 0, std 1).
x = np.random.normal(0, 1, 25)
print(F"x = {x}")

In [None]:
# Noise-free cubic signal: 3.14 + 0.5*x + 2*x^2 - 1.5*x^3
signal = 3.14 + 0.5*x + 2*(x**2) - 1.5*(x**3)

# Gaussian noise (mean -1, standard deviation 1), one draw per sample
noise = np.random.normal(-1, 1, 25)

y = signal + noise
print(F"y = {y}")

In [None]:
# Collect the feature and the noisy target into a two-column DataFrame.
data = pd.DataFrame({"x": x, "y": y})
data

In [None]:
# Scatter plot of the synthetic target against the feature.
data.plot.scatter(x="x", y="y")

In [None]:

from sklearn.linear_model import LinearRegression

# Baseline: fit a straight line using x as the only feature.
X = data.loc[:, ['x']]   # one-column feature frame
y = data.loc[:, 'y']     # target series

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Keep the in-sample predictions next to the observations.
data['y_pred'] = y_pred

data.head()

In [None]:
# Observations as a scatter, straight-line predictions overlaid in red.
f, ax0 = plt.subplots()

data.plot.scatter(x='x', y='y', ax=ax0)
data.plot.line(x='x', y='y_pred', c='red', ax=ax0)

plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand x into polynomial terms up to degree 3; the four resulting columns
# are stored as x0..x3.
polynomial_features = PolynomialFeatures(degree=3)

cubic_columns = ['x0', 'x1', 'x2', 'x3']
data[cubic_columns] = polynomial_features.fit_transform(data[["x"]])

data[['x'] + cubic_columns + ['y']].head()

In [None]:
# Refit the linear model, this time on the degree-3 polynomial features.
X = data.loc[:, ['x0', 'x1', 'x2', 'x3']]
y = data.loc[:, 'y']

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# 'y_pred' now holds the predictions of the polynomial model.
data['y_pred'] = y_pred

data.loc[:, ['x0', 'x1', 'x2', 'x3', 'y', 'y_pred']].head()

In [None]:
# Overlay the polynomial predictions on the observations. The rows are drawn
# in DataFrame order (x is not sorted here), so the red line connects the
# points in that order.
f, ax0 = plt.subplots()

data.plot.scatter(x='x', y='y', ax=ax0)
data.plot.line(x='x', y='y_pred', c='red', ax=ax0)

plt.show()

In [None]:
f, ax0 = plt.subplots()

data.plot.scatter(x='x', y='y', ax=ax0)
(
    data
    .sort_values(by='x')  # sort data points so the curve is drawn left to right
    .plot.line(x='x', y='y_pred', c='red', ax=ax0)
)

plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the predictions stored in 'y_pred' against the observed 'y'
# (in-sample: the same rows were used for fitting).
mse = mean_squared_error(y_true=data['y'], y_pred=data['y_pred'])
rmse = np.sqrt(mse)  # same units as y
r2 = r2_score(y_true=data['y'], y_pred=data['y_pred'])

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

In [None]:
# Degree = 2

polynomial_features = PolynomialFeatures(degree=2)

# Polynomial terms up to degree 2 are stored in x0, x1, x2.
quadratic_columns = ['x0', 'x1', 'x2']
data[quadratic_columns] = polynomial_features.fit_transform(data[["x"]])

X = data[quadratic_columns]
y = data['y']

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Stored under its own name so it can be compared with the other fits.
data['y_pred_d2'] = y_pred

# Observations as a scatter, quadratic fit overlaid in red.
f, ax0 = plt.subplots()

data.plot.scatter(x='x', y='y', ax=ax0)
(
    data
    .sort_values(by='x')  # sort data points so the curve is drawn left to right
    .plot.line(x='x', y='y_pred_d2', c='red', ax=ax0)
)

plt.show()

# Evaluate
mse = mean_squared_error(y_true=data['y'], y_pred=data['y_pred_d2'])
rmse = np.sqrt(mse)
r2 = r2_score(y_true=data['y'], y_pred=data['y_pred_d2'])

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

In [None]:
# Degree = 10

polynomial_features = PolynomialFeatures(degree=10)

# Polynomial terms up to degree 10 are stored in x0 .. x10 (11 columns).
degree_10_columns = [f'x{power}' for power in range(11)]
data[degree_10_columns] = polynomial_features.fit_transform(data[["x"]])

X = data[degree_10_columns]
y = data['y']

model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Stored under its own name so it can be compared with the other fits.
data['y_pred_d10'] = y_pred

# Observations as a scatter, degree-10 fit overlaid in red.
f, ax0 = plt.subplots()

data.plot(x='x', y='y', kind='scatter', ax=ax0)
# Sort data points by x so the curve is drawn left to right.
data.sort_values(by='x').plot(x='x', y='y_pred_d10', kind='line', c='red', ax=ax0)

plt.show()

# Evaluate
mse = mean_squared_error(data['y'], data['y_pred_d10'])
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

# Score the degree-10 predictions. The previous code scored 'y_pred' (the
# degree-3 fit), so the reported R2 did not belong to this model.
r2 = r2_score(data['y'], data['y_pred_d10'])
print("R2:", r2)

In [None]:
# Side-by-side view of the observed y and the stored predictions:
# 'y_pred' (degree-3 fit), 'y_pred_d2' (degree 2) and 'y_pred_d10' (degree 10).
data[['x', 'y', 'y_pred', 'y_pred_d2', 'y_pred_d10']].head()

# Overfitting and Underfitting

In [None]:
f, ax0 = plt.subplots()

data.plot.scatter(x='x', y='y', ax=ax0)

# Sort once by x, then draw the three fitted curves on the same axes:
# degree 2 (red), degree 10 (magenta) and 'y_pred' (green, thicker line).
ordered = data.sort_values(by='x')
ordered.plot.line(x='x', y='y_pred_d2', c='red', ax=ax0)
ordered.plot.line(x='x', y='y_pred_d10', c='magenta', ax=ax0)
ordered.plot.line(x='x', y='y_pred', c='green', linewidth=2, ax=ax0)

# Linear Regression

In [None]:
# URL of the red-wine quality CSV hosted by the UCI Machine Learning Repository
file_name = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

# Load CSV File (fields are separated by ';'). This rebinds `data`, replacing
# the synthetic polynomial dataset used above.
data = pd.read_csv(file_name, sep=';')
# Show 20 randomly chosen rows (no random_state, so the rows differ between runs)
data.sample(20)

In [None]:
# Summary of the columns: names, non-null counts and dtypes
data.info()

In [None]:
# Descriptive statistics for every column, transposed to one row per column.
data.describe(include='all').T

In [None]:
# Add a row identifier derived from the index (index value + 1)
data['id'] = data.index+1
data.head()

In [None]:
# List the column names of the dataset
data.columns

In [None]:
# Keep the columns in a fixed order: 'id' first, the target 'quality' last.
column_order = [
    'id',
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
    'quality',
]
data = data[column_order]

data.head()

In [None]:
# Pairwise scatter plots of all measurements and the quality score ('id' is left out).
pairplot_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]
sns.pairplot(data[pairplot_columns])
plt.show()

In [None]:
# Pairwise correlations between all measurements and the quality score ('id' is left out).
correlation_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]
correlation_matrix = data[correlation_columns].corr()
correlation_matrix

In [None]:
# Heatmap of the absolute correlations, so strong negative and strong positive
# relationships are shown with the same intensity
sns.heatmap(correlation_matrix.abs())

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Candidate predictor columns (every measurement except the target).
X = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

# Target column, kept as a list so that data[y] is a one-column DataFrame.
y = ['quality']

# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data[X],
    data[y],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features
X_train.head()

In [None]:
# Preview the first rows of the training target
y_train.head()

In [None]:
# Create the linear regression estimator (not fitted yet) and show its configuration
model = linear_model.LinearRegression()
print(model)

In [None]:
# Column holding the observed target, and the column name used to store predictions
y_actual = 'quality'
y_predict = 'prected_quality'
# Correlation of every variable with the target, sorted in ascending order
correlation_matrix[y_actual].sort_values()

In [None]:
# Select variables
X = ['alcohol']

# Fit on the selected training column(s) against the observed quality
model.fit(X_train[X], y_train[y_actual])

# Find model parameters
coefficients, intercept = model.coef_, model.intercept_

coefficient_table = pd.DataFrame(data={'features':X, 'coefficients':coefficients})
print(coefficient_table)
print('\n') # Add new line to print
print(F"Intercept = {intercept}")

In [None]:
# Work on a copy of the test targets. `result = y_test` would only create a
# second name for the same object, so the columns added below would also
# appear in y_test (and pandas may emit SettingWithCopyWarning because y_test
# is itself a slice produced by train_test_split).
result = y_test.copy()
result[y_predict] = model.predict(X_test[X])

# Absolute error of each test prediction
result['abs_difference'] = (result[y_actual] - result[y_predict]).abs()
result[[y_actual, y_predict, 'abs_difference']]

In [None]:
# Summary statistics of the absolute prediction errors
result['abs_difference'].describe()

In [None]:
# Error metrics of the stored predictions against the observed target.
mse = mean_squared_error(y_true=result[y_actual], y_pred=result[y_predict])
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_true=result[y_actual], y_pred=result[y_predict])

print("MSE:", mse)
print("RMSE:", rmse)
print("R2:", r2)

## 2nd Iteration

In [None]:
# Column holding the observed target, and the column name used to store predictions
y_actual = 'quality'
y_predict = 'prected_quality'
# Correlation of every variable with the target, sorted in ascending order
correlation_matrix[y_actual].sort_values()

In [None]:
# Select variables
X = ['alcohol', 'volatile acidity']

# Fit
model.fit(X_train[X], y_train[y_actual])

# Find model parameters
coefficients = model.coef_
intercept = model.intercept_

print(pd.DataFrame(data={'features':X, 'coefficients':coefficients}))
print('\n') # Add new line to print
print(F"Intercept = {intercept}")

# Work on a copy so that adding the prediction column does not modify y_test
# itself (plain assignment would only alias it, and writing to that slice
# may raise SettingWithCopyWarning).
result = y_test.copy()
result[y_predict] = model.predict(X_test[X])

# Evaluate on the held-out test rows
mse = mean_squared_error(result[y_actual], result[y_predict])
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(result[y_actual], result[y_predict])
print("R2:", r2)

## 3rd Iteration

In [None]:
# Column holding the observed target, and the column name used to store predictions
y_actual = 'quality'
y_predict = 'prected_quality'
# Absolute correlation of every variable with the target, sorted ascending
# (the strongest relationships, regardless of sign, come last)
correlation_matrix[y_actual].abs().sort_values()

In [None]:
# Variable names ordered by increasing absolute correlation with the target
correlation_matrix[y_actual].abs().sort_values().index

In [None]:
# Select variables
X = ['alcohol', 'volatile acidity', 'sulphates']

# Fit
model.fit(X_train[X], y_train[y_actual])

# Find model parameters
coefficients = model.coef_
intercept = model.intercept_

print(pd.DataFrame(data={'features':X, 'coefficients':coefficients}))
print('\n') # Add new line to print
print(F"Intercept = {intercept}")

# Work on a copy so that adding the prediction column does not modify y_test
# itself (plain assignment would only alias it, and writing to that slice
# may raise SettingWithCopyWarning).
result = y_test.copy()
result[y_predict] = model.predict(X_test[X])

# Evaluate on the held-out test rows
mse = mean_squared_error(result[y_actual], result[y_predict])
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(result[y_actual], result[y_predict])
print("R2:", r2)

In [None]:
# Select variables
X = ['fixed acidity',
       'chlorides', 'density', 'total sulfur dioxide', 'citric acid',
       'sulphates', 'volatile acidity', 'alcohol']

# Fit
model.fit(X_train[X], y_train[y_actual])

# Find model parameters
coefficients = model.coef_
intercept = model.intercept_

print(pd.DataFrame(data={'features':X, 'coefficients':coefficients}))
print('\n') # Add new line to print
print(F"Intercept = {intercept}")

# Work on a copy so that adding the prediction column does not modify y_test
# itself (plain assignment would only alias it, and writing to that slice
# may raise SettingWithCopyWarning).
result = y_test.copy()
result[y_predict] = model.predict(X_test[X])

# Evaluate on the held-out test rows
mse = mean_squared_error(result[y_actual], result[y_predict])
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(result[y_actual], result[y_predict])
print("R2:", r2)

# Normalization

In [None]:
# Min-max scale every training feature: (value - min) / (max - min).
train_min = X_train.min()
train_max = X_train.max()

for column in X_train.columns:
  print(F"min({column}): {train_min[column]}")
  print(F"max({column}): {train_max[column]}")

# Build the scaled frame in one vectorised step and rebind the name, instead
# of assigning column by column into the slice returned by train_test_split
# (which can raise SettingWithCopyWarning). The values are the same as the
# per-column computation.
X_train = (X_train - train_min) / (train_max - train_min)

In [None]:
# Scale the test features with the TRAINING minimum and maximum. Scaling the
# test set with its own min/max would map the same raw value to different
# scaled values in train and test, so the fitted coefficients would not apply
# to the test features.
#
# The statistics are taken from the unscaled rows of `data` that belong to the
# training split, so they stay correct even if X_train has already been
# normalised in place, and re-running this cell gives the same result.
feature_columns = X_test.columns
train_raw = data.loc[X_train.index, feature_columns]
train_min = train_raw.min()
train_max = train_raw.max()

for column in feature_columns:
  print(F"train min({column}): {train_min[column]}")
  print(F"train max({column}): {train_max[column]}")

# Scaled test values may fall slightly outside [0, 1] when a test row lies
# outside the range seen in training; that is expected.
X_test = (data.loc[X_test.index, feature_columns] - train_min) / (train_max - train_min)

In [None]:
# Select variables: every column of the training features
X = X_train.columns

# Fit
model.fit(X_train[X], y_train[y_actual])

# Find model parameters
coefficients = model.coef_
intercept = model.intercept_

print(pd.DataFrame(data={'features':X, 'coefficients':coefficients}))
print('\n') # Add new line to print
print(F"Intercept = {intercept}")

# Work on a copy so that adding the prediction column does not modify y_test
# itself (plain assignment would only alias it, and writing to that slice
# may raise SettingWithCopyWarning).
result = y_test.copy()
result[y_predict] = model.predict(X_test[X])

# Evaluate on the held-out test rows
mse = mean_squared_error(result[y_actual], result[y_predict])
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(result[y_actual], result[y_predict])
print("R2:", r2)

<hr>
Last update 2021-10-16 by Sumudu Tennakoon

<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.