In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, mean_absolute_percentage_error

In [70]:
# Fixed seed so the train/test split and the SGD fit are identical on every
# run (Restart Kernel -> Run All reproduces the same numbers).
RANDOM_STATE = 42


def report_model(banner, model, feature_names, X_eval, y_true):
    """Print coefficients, intercept and error metrics for a fitted model.

    Parameters
    ----------
    banner : str
        Heading printed before the report.
    model : fitted estimator
        Must expose ``coef_``, ``intercept_`` and ``predict``.
    feature_names : index-like
        Row labels for the coefficient table, one per coefficient.
    X_eval : array-like
        Features to predict on; must be scaled the same way as the data the
        model was fitted on.
    y_true : array-like
        Ground-truth targets matching ``X_eval``.

    Returns
    -------
    tuple
        ``(coefficients, predictions)``: the coefficient table as a DataFrame
        and the model's predictions for ``X_eval``.
    """
    print(banner)

    # Once the model is trained we can read its coefficients
    coefficients = pd.DataFrame(model.coef_, feature_names, columns=['Coefficient'])
    print("Coefficients:")
    print(coefficients)
    print(f"\nIntercepts: {model.intercept_} \n\n")

    predictions = model.predict(X_eval)
    print('Mean Absolute Error:', mean_absolute_error(y_true, predictions))
    print('Mean Squared Error:', mean_squared_error(y_true, predictions))
    print('Root Mean Squared Error:', root_mean_squared_error(y_true, predictions))
    print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_true, predictions))
    return coefficients, predictions


# Load dataset
dataset = pd.read_csv('./Q1/house_price.csv')

# Check for missing values: fail early with a clear message instead of
# letting the estimators raise a less specific error later on.
if dataset.isna().any().any():
    raise ValueError(
        "house_price.csv contains missing values; clean or impute them before training"
    )

# Extract our features and label
X = dataset.iloc[:, :-1]  # anything but the last column
y = dataset.iloc[:, -1]   # our label is in the last column

# Divide our data (seeded, so the same rows land in the test set every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Normalize data: fit the scaler on the training split only, then apply the
# same transformation to the test split to avoid leaking test statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model for Linear Regression
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

coeff_lr, y_pred_lr = report_model(
    "************* MODEL USING LINEAR REGRESSION ************* \n\n",
    model_lr, X.columns, X_test, y_test,
)

# Model for SGDRegressor (seeded: SGD shuffles the training data each epoch)
model_sgd = SGDRegressor(random_state=RANDOM_STATE)
model_sgd.fit(X_train, y_train)

coeff_sgd, y_pred_sgd = report_model(
    "\n\n************* MODEL USING SGDRegressor ************* \n\n",
    model_sgd, X.columns, X_test, y_test,
)

# Kept so any later cell that still reads `y_pred` sees the same value as
# before (the most recent predictions, i.e. the SGD model's).
y_pred = y_pred_sgd

"""
MAE, MSE, RMSE and MAPE are all useful formulas for measuring the error of our model;
however, each of them provides a different way to understand the behaviour of the model,
and we need to know which one gives us the better understanding for our goal.

MAE is simple and tells us how far off our predictions are on average. For example,
if our prediction is 10 and our MAE is 5, the real value is typically about 5 away, i.e. roughly between
5 and 15 (an average, not a guaranteed bound). It is easy to understand and robust to outliers, but it treats every
unit of error equally, so it is a poor fit for problems where large errors are much more costly than small ones.

MSE exaggerates large errors because each error is squared before averaging. For example, an error of 5 contributes
25 to the MSE, while an error of 10 contributes 100: doubling the error quadruples the penalty. Bigger
errors are therefore penalized far more than small ones, which is good for optimization but makes MSE sensitive to outliers.

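Finally, RMSE is the square root of MSE, so it offers some of the exaggeration of MSE and some of the readability of MAE:
it still penalizes bigger errors more than small ones, but it is expressed in the same units as the target, which makes
it easier to interpret than MSE.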

MAPE expresses the absolute error as a percentage, which is a more intuitive way to read it: instead of saying that the
prediction is off by 1000, we can say that it is off by 20%. This is easier to compare because the error is always
relative to the actual (true) value. Note that scikit-learn returns it as a fraction, so 0.1676 means 16.76%.
"""

************* MODEL USING LINEAR REGRESSION ************* 


Coefficients:
           Coefficient
size     108575.891912
bedroom   -3600.971569

Intercepts: 349283.6756756757 


Mean Absolute Error: 46822.58381892502
Mean Squared Error: 3003136784.1917143
Root Mean Squared Error: 54800.883060327724
Mean Absolute Percentage Error: 0.167617856439434


************* MODEL USING SGDRegressor ************* 


Coefficients:
           Coefficient
size     107494.781518
bedroom   -2500.644218

Intercepts: [349199.60675343] 


Mean Absolute Error: 47208.39506593478
Mean Squared Error: 3041021742.723039
Root Mean Squared Error: 55145.45985593954
Mean Absolute Percentage Error: 0.1685304210666396
