In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import joblib

# Location of the raw listings file; kept in one place so it is easy to repoint.
DATA_PATH = '~/Downloads/archive/Melbourne_housing_FULL.csv'

# Columns that are not used as model inputs.
# NOTE(review): some public releases of this dataset spell the coordinate
# columns 'Lattitude'/'Longtitude' -- confirm these names against the CSV header.
UNUSED_COLUMNS = [
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Latitude',
    'Longitude',
    'Regionname',
    'Propertycount',
]

# Read in data from CSV
raw_df = pd.read_csv(DATA_PATH)

# Drop the unused columns in a single call, then remove every row that still
# contains a missing value. `raw_df` itself is not modified, so re-running this
# step always starts from the data exactly as it was loaded.
df = raw_df.drop(columns=UNUSED_COLUMNS).dropna(axis=0)

# One-hot encode the categorical columns and remove the target column
# ('Price') so it cannot leak into the model inputs.
features_df = (
    pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])
    .drop(columns=['Price'])
)

# Create X (inputs) and y (target) arrays.
# The float dtype is requested explicitly: pandas >= 2.0 produces boolean dummy
# columns, and converting a frame that mixes bool and numeric columns without a
# dtype yields an object array instead of a numeric matrix.
X = features_df.to_numpy(dtype=float)
y = df['Price'].to_numpy()

# Split data into training (70%) and test (30%) sets; the fixed random_state
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Gradient boosting regressor configuration:
#   n_estimators=150      number of boosting stages (trees)
#   learning_rate=0.1     shrinkage applied to each tree's contribution
#   max_depth=30          maximum depth of each tree
#   min_samples_split=4   minimum samples required to split a node
#   min_samples_leaf=6    minimum samples required in a leaf
#   max_features=0.6      fraction of features considered at each split
#   loss='huber'          Huber loss function
#   random_state=0        fixes the random feature selection done at each split
#                         (max_features < 1.0) so repeated runs give the same model
#
# NOTE(review): the output recorded with this cell shows a training MAE of
# 29967.87 against a test MAE of 160900.82. A gap that large suggests the model
# is overfitting; consider trying much shallower trees (smaller max_depth).
model = ensemble.GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
    random_state=0,
)

# Train the model
model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, 'house_trained_model.pkl')

# Evaluate the results: mean absolute error (MAE) on the data the model was
# fitted on, and on the held-out test set. Comparing the two numbers shows how
# well the model generalizes beyond its training data.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % train_mae)

test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % test_mae)

Training Set Mean Absolute Error: 29967.87
Test Set Mean Absolute Error: 160900.82
