In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import tree



In [None]:
# Load the Perth houses dataset from the project's Resources folder
csv_path = Path("Resources/all_perth_310121.csv")
perth_houses_df = pd.read_csv(csv_path)

# Preview the first rows of the DataFrame
perth_houses_df.head()

### Data Preprocessing and Cleaning

In [None]:
# Summary statistics, transposed so each original column is shown as a row
perth_houses_df.describe().transpose()

In [None]:
# Print each column's name, non-null count, and dtype
perth_houses_df.info()

In [None]:
# Count the missing values in every column
perth_houses_df.isna().sum()

In [None]:
# Impute missing values:
#   GARAGE     -> 0 (a missing value is assumed to mean the house has no garage)
#   BUILD_YEAR -> the median build year of the rows that do have one
build_year_median = perth_houses_df['BUILD_YEAR'].median()
perth_houses_df['GARAGE'] = perth_houses_df['GARAGE'].fillna(0)
perth_houses_df['BUILD_YEAR'] = perth_houses_df['BUILD_YEAR'].fillna(build_year_median)

# Inspect the last five rows after imputation
perth_houses_df.tail(5)

In [None]:
# GARAGE and BUILD_YEAR no longer contain NaN, so cast both to integer
for int_column in ('GARAGE', 'BUILD_YEAR'):
    perth_houses_df[int_column] = perth_houses_df[int_column].astype(int)

perth_houses_df.tail()

In [None]:
# Frequency of each distinct garage count across all houses
perth_houses_df.loc[:, "GARAGE"].value_counts()

In [None]:
# Disabled experiment: keep only the GARAGE values that occur more than once in the data
# value_counts = perth_houses_df['GARAGE'].value_counts()
# filter_classification = value_counts[value_counts >1]
# print(filter_classification)

In [None]:
# Frequency of each distinct bedroom count across all houses
perth_houses_df.loc[:, "BEDROOMS"].value_counts()


In [None]:
# Frequency of each distinct bathroom count across all houses
perth_houses_df.loc[:, "BATHROOMS"].value_counts()


In [None]:
# Parse DATE_SOLD into datetimes. The format string ends with a carriage
# return, so the raw values are expected to look like "MM-YYYY\r".
sold_dates = pd.to_datetime(perth_houses_df['DATE_SOLD'], format='%m-%Y\r')
perth_houses_df['DATE_SOLD'] = sold_dates

# Derive separate MONTH_SOLD and YEAR_SOLD columns from the parsed dates
perth_houses_df['MONTH_SOLD'] = sold_dates.dt.month
perth_houses_df['YEAR_SOLD'] = sold_dates.dt.year
perth_houses_df.head()


In [None]:
# Columns that are not used as model features
unused_columns = [
    'ADDRESS',
    'CBD_DIST',
    'NEAREST_STN_DIST',
    'POSTCODE',
    'DATE_SOLD',
    'LATITUDE',
    'LONGITUDE',
    'NEAREST_SCH_DIST',
    'NEAREST_SCH_RANK',
]

# drop() returns a new frame; perth_houses_df itself is left untouched
perth_houses_cleaned_df = perth_houses_df.drop(columns=unused_columns)
perth_houses_cleaned_df.head()

In [None]:
# Confirm that no missing values remain in the cleaned frame
perth_houses_cleaned_df.isna().sum()

In [None]:
# Print each remaining column's name, non-null count, and dtype
perth_houses_cleaned_df.info()

In [None]:
# One-hot encode the categorical columns. Each source column is replaced by
# indicator columns named "<COLUMN>_<value>", appended after the remaining
# columns in the order the source columns are listed here.
categorical_columns = ['SUBURB', 'NEAREST_STN', 'NEAREST_SCH']
perth_houses_cleaned_df = pd.get_dummies(
    perth_houses_cleaned_df,
    columns=categorical_columns,
)

perth_houses_cleaned_df.head()


### Separate Features and Target

In [None]:
# Target: the PRICE column
y = perth_houses_cleaned_df.loc[:, "PRICE"]

# Features: every remaining column
X = perth_houses_cleaned_df.drop("PRICE", axis=1)

In [None]:
# Preview the first five target values
y.head(5)

In [None]:
# Preview the first five feature rows
X.head(5)

In [None]:
# Plot the distribution of the target values (PRICE) as a 10-bin histogram
plt.hist(y, bins=10, edgecolor='black')
plt.xlabel('House Prices $m')
plt.ylabel('Frequency')
plt.title('Histogram of Perth House Prices')

# Save BEFORE showing: once plt.show() has displayed the figure it is no
# longer the current figure, so a savefig() call placed after it writes a
# blank image instead of the histogram.
plt.savefig("Images/Perth_House_Prices_Histogram.png")
plt.show()


### Split Data into Training and Test Datasets

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# (rows, columns) of the training features
X_train.shape

### Create a Linear Regression Model with the Cleaned Data

In [None]:
# Instantiate the Linear Regression model and fit it on the training data
# (fit() returns the fitted estimator itself)
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test features
lr_predict = linear_regression_model.predict(X_test)
lr_predict

In [None]:
 # Score the model
# score() on a LinearRegression returns the coefficient of determination (R^2)
train_score = linear_regression_model.score(X_train, y_train)
test_score = linear_regression_model.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Scatter the actual test targets (x) against the linear model's
# predictions (y), then write the figure to disk
fig, ax = plt.subplots()
ax.scatter(y_test, lr_predict)
fig.savefig("Images/ScatterPlot.png")


### Model 2: Decision Tree

In [None]:
# Standardise the features. The scaler is fitted on the training set only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Fit the Decision Tree

# Create the Decision Tree classifier
model= tree.DecisionTreeClassifier()

# Fit the model
model= model.fit(X_train_scaled, y_train)

In [None]:
# Make Predictions using the Tree Model
tree_predict= model.predict(X_test_scaled)

In [None]:
# Calculate the confusion matrix
con_max= confusion_matrix(y_test, tree_predict)
con_max_df= pd.DataFrame(con_max, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

#Calculate the Accuracy Score
acc_score= accuracy_score(y_test, tree_predict)


In [None]:
# Display Results
print("Confusion Matrix")
display(con_max_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, tree_predict))