# Supervised Learning Model

In [39]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error



# Load the raw housing data set into a DataFrame.
df = pd.read_csv("Melbourne_housing_FULL.csv")

# Remove the columns that are not used as model features. The names are
# written exactly as the script has always referenced them (including the
# "Lattitude" / "Longtitude" spellings).
df.drop(
    columns=[
        "Address",
        "Method",
        "SellerG",
        "Date",
        "Postcode",
        "Lattitude",
        "Longtitude",
        "Regionname",
        "Propertycount",
    ],
    inplace=True,
)



# Keep only complete rows: a row is dropped when any of its columns is missing.
df = df.dropna(axis="index", how="any")

# One-hot encode the listed columns with pd.get_dummies so that non-numeric
# data is converted to numeric values.
categorical_columns = ["Suburb", "CouncilArea", "Type"]
df = pd.get_dummies(df, columns=categorical_columns)



# Dependent variable (target): the Price column.
y = df["Price"]
# Independent variables (features): every column except Price.
X = df.drop(columns=["Price"])



# Split the data into 70% training / 30% testing segments with scikit-learn.
# random_state pins the shuffle so the same rows land in each segment on every
# run; without it the split (and any error measured on it) changes per run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=0
)



# Select the algorithm and configure its hyperparameters.
# With max_features=0.6 only a subset of the features is considered at each
# split, which makes training stochastic; random_state is fixed so that
# fitting the same data yields the same model on every run.
model = ensemble.GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    # Deliberately left at 30: the notes after the evaluation step use this
    # value as the example of a setting that causes overfitting.
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss="huber",
    random_state=0,
)



# Train the model: fit() links the training data to the configured algorithm.
model.fit(X_train, y_train)

# Evaluate on both segments: predict() produces the model's estimates, and
# mean absolute error measures how far they are from the actual values.
train_predictions = model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_predictions)
print("Training Set Mean Absolute Error: %.2f" % mae_train)

test_predictions = model.predict(X_test)
mae_test = mean_absolute_error(y_test, test_predictions)
print("Test Set Mean Absolute Error: %.2f" % mae_test)
# a large discrepancy between the training and test errors is usually an indicator of overfitting
# an example of this issue here is max_depth being set to 30

Training Set Mean Absolute Error: 28759.44
Test Set Mean Absolute Error: 165777.61
