# MAE
https://en.wikipedia.org/wiki/Mean_absolute_error

# MSE
https://en.wikipedia.org/wiki/Mean_squared_error

# R2
https://condor.depaul.edu/sjost/it223/documents/correlation.htm#:~:text=The%20R%2Dsquared%20value%2C%20denoted,Perfect%20positive%20linear%20association.
https://en.wikipedia.org/wiki/Coefficient_of_determination

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Column labels for the housing data, in file order (the file is read with
# header=None, so the labels have to be supplied here).
column_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]

# Fields are separated by one or more spaces; a regex separator needs the
# python parsing engine.
df = pd.read_csv(
    r"housing.data",
    sep=" +",
    engine="python",
    header=None,
    names=column_names,
)
df

In [None]:
# Keep only the two columns used by the single-variable regression:
# LSTAT (later used as the feature) and MEDV (later used as the target).
df = df[["LSTAT", "MEDV"]]

IQR METHOD

In [None]:
# IQR method: per-column first and third quartile, and the spread between them.
Q1, Q3 = (df.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

Removing outliers

In [None]:
# Tukey fences: anything further than 1.5 * IQR outside the quartiles is an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Cell-wise outlier mask; a row is dropped as soon as any of its columns is flagged.
outlier_condition = df.lt(lower_fence) | df.gt(upper_fence)
df = df.loc[~outlier_condition.any(axis=1)]

In [None]:
# Feature and target as 2-D column arrays of shape (n_samples, 1), the layout
# passed to the scikit-learn estimators and scalers further down.
X = df["LSTAT"].to_numpy().reshape(-1, 1)
y = df["MEDV"].to_numpy().reshape(-1, 1)

In [None]:
# Scatter plot of the cleaned data: LSTAT on the x axis, MEDV on the y axis.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(X, y, color="black")

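The commented-out pipeline below is equivalent only for R^2, not for MAE and MSE: it standardizes X but leaves y in its original units, and MAE/MSE are expressed in the units of y, whereas R^2 does not depend on the scale of the data.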

Data standardization; StandardScaler(), transform()
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
#
# lin_reg = make_pipeline(StandardScaler(), LinearRegression())
# lin_reg.fit(X_train, y_train)
# lin_reg.score(X_train, y_train)


# Split FIRST, then scale: the scalers must learn mean/std from the training
# rows only, otherwise statistics of the test rows leak into preprocessing and
# the test metrics become optimistic.
# A fixed random_state makes the split (and every metric below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Separate, named scalers for feature and target so that neither is overwritten
# and both stay available (e.g. for inverse_transform back to original units).
x_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train = x_scaler.transform(X_train)
X_test = x_scaler.transform(X_test)
y_train = y_scaler.transform(y_train)
y_test = y_scaler.transform(y_test)

# Keep X and y standardized as well (with the training statistics), so any
# code that relies on them being scaled after this cell keeps working.
X = x_scaler.transform(X)
y = y_scaler.transform(y)

# Ordinary least squares on the standardized training data; the last
# expression shows the training R^2 as the cell output.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg.score(X_train, y_train)

In [None]:
# Training points (black) with the fitted regression line (red) drawn on top.
train_fit = lin_reg.predict(X_train)

plt.figure(figsize=(10, 5))
plt.scatter(X_train, y_train, color="black")
plt.plot(X_train, train_fit, color="red")

In [None]:
# Predict once per split and reuse the predictions for all three metrics.
y_train_pred = lin_reg.predict(X_train)
y_test_pred = lin_reg.predict(X_test)

# MAE (mean absolute error)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"MAE: trainVal = {round(mae_train, 2)}, testVal = {round(mae_test, 2)}")

# MSE (mean squared error)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f"MSE: trainVal = {round(mse_train, 2)}, testVal = {round(mse_test, 2)}")

# R2 (coefficient of determination)
# https://condor.depaul.edu/sjost/it223/documents/correlation.htm#:~:text=The%20R%2Dsquared%20value%2C%20denoted,Perfect%20positive%20linear%20association.
# https://en.wikipedia.org/wiki/Coefficient_of_determination
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f"R^2:  trainVal = {round(r2_train, 2)}, testVal = {round(r2_test, 2)}")

No data scaling and no outlier removal; otherwise the same process.

In [None]:
# Marker in the notebook output: the cells after this one repeat the
# experiment as a second version.
print("2nd Version")

In [None]:
# Column labels for the housing data, in file order (read with header=None).
column_names1 = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]

# Load the second data file and keep only the LSTAT and MEDV columns.
df1 = pd.read_csv(
    r"housing1.data",
    sep=" +",
    engine="python",
    header=None,
    names=column_names1,
)
df1 = df1[["LSTAT", "MEDV"]]

# Feature and target as column arrays of shape (n_samples, 1).
X1 = df1["LSTAT"].to_numpy().reshape(-1, 1)
y1 = df1["MEDV"].to_numpy().reshape(-1, 1)

# Scatter plot of the data exactly as loaded (no scaling, no outlier filter).
fig1, ax1 = plt.subplots(figsize=(10, 5))
ax1.scatter(X1, y1, color="black")

In [None]:
# Same split settings as the first version (30% test share) so the two
# versions differ only in preprocessing; a fixed random_state makes the split
# and the metrics reproducible between runs.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.3, random_state=42
)

# Ordinary least squares on the raw (unscaled, unfiltered) training data; the
# last expression shows the training R^2 as the cell output.
lin_reg1 = LinearRegression()
lin_reg1.fit(X_train1, y_train1)
lin_reg1.score(X_train1, y_train1)

In [None]:
# Training points (black) of the second version with its regression line (red).
train_fit1 = lin_reg1.predict(X_train1)

plt.figure(figsize=(10, 5))
plt.scatter(X_train1, y_train1, color="black")
plt.plot(X_train1, train_fit1, color="red")

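As expected, MAE and MSE are much worse here, while R2 is still fairly good. R2 is scale-invariant (it is normalized by the variance of the target), so it does not depend on the units of the data. MAE and MSE are expressed in the units of the target; earlier they came out on a comparable scale only because we had standardized the data by hand. R2 has this normalization built into its definition.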

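Note: R2 can never exceed 1 (100%). On test data it can, however, be negative when the model predicts worse than simply using the mean of the target.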

In [None]:
# Predict once per split and reuse the predictions for all three metrics.
y_train1_pred = lin_reg1.predict(X_train1)
y_test1_pred = lin_reg1.predict(X_test1)

# MAE (mean absolute error)
mae_train1 = mean_absolute_error(y_train1, y_train1_pred)
mae_test1 = mean_absolute_error(y_test1, y_test1_pred)
print(f"MAE: trainVal1 = {round(mae_train1, 2)}, testVal1 = {round(mae_test1, 2)}")

# MSE (mean squared error)
mse_train1 = mean_squared_error(y_train1, y_train1_pred)
mse_test1 = mean_squared_error(y_test1, y_test1_pred)
print(f"MSE: trainVal1 = {round(mse_train1, 2)}, testVal1 = {round(mse_test1, 2)}")

# R2 (coefficient of determination)
r2_train1 = r2_score(y_train1, y_train1_pred)
r2_test1 = r2_score(y_test1, y_test1_pred)
print(f"R^2:  trainVal1 = {round(r2_train1, 2)}, testVal1 = {round(r2_test1, 2)}")