In [258]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

# level definition
# low, medium, high = 1, 2, 3

In [259]:
# Prepare data

# Load the three per-LGA tables and key each one by (LGA, Year) right away.
houses_by_lga = pd.read_csv("Houses by LGA.csv").set_index(["LGA", "Year"])
offences_by_lga = pd.read_csv("Offences By LGA NEW.csv").set_index(["LGA", "Year"])
egm_by_lga = pd.read_csv("EGM-New-Format.csv").set_index(["LGA", "Year"])

# Inner joins: only (LGA, Year) pairs found in all three tables are kept.
# The two EGM columns are then given descriptive names.
merged_data = (
    houses_by_lga
    .merge(offences_by_lga, how="inner", on=["LGA", "Year"])
    .merge(egm_by_lga, how="inner", on=["LGA", "Year"])
    .rename(columns={"total dollars": "Total Loss On EGM", "rate": "EGM Rate"})
)

merged_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted House Price,Type A Percentage,Type B Percentage,Type C Percentage,Type D Percentage,Type E Percentage,Offence Rate,Total Loss On EGM,EGM Rate
LGA,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Yarra,2016,1278619.71,9.86,69.94,6.33,6.51,7.10,15074.17,3.299235e+07,401.04
Moonee Valley,2016,1014690.20,12.11,67.44,5.70,5.18,9.41,7827.72,7.540108e+07,651.95
Brimbank,2016,538872.26,13.47,59.99,7.65,6.13,12.63,9375.09,1.430457e+08,689.02
Port Phillip,2016,1564173.34,11.26,69.41,5.90,6.31,6.84,12520.83,2.809594e+07,322.63
Ballarat,2016,334852.68,15.16,61.04,4.03,5.67,13.86,11679.23,5.461125e+07,466.93
...,...,...,...,...,...,...,...,...,...,...
Melbourne,2020,1437999.93,12.28,48.29,8.97,8.05,8.44,21397.63,6.055093e+07,498.27
Frankston,2020,660194.72,15.29,40.30,8.99,7.12,19.05,10397.28,4.630308e+07,350.90
Nillumbik,2020,950270.23,16.40,50.02,6.73,7.35,15.26,3555.55,2.778951e+07,378.91
Whittlesea,2020,624857.68,16.84,49.33,8.47,4.57,14.82,7268.01,6.327701e+07,378.91


In [260]:
# Per-year summary of the weighted house price: the mean, then the .333 and
# .667 quantiles. Each table is followed by one blank line.
print(
    merged_data["Weighted House Price"].groupby("Year").mean(),
    merged_data["Weighted House Price"].groupby("Year").quantile(.333),
    merged_data["Weighted House Price"].groupby("Year").quantile(.667),
    sep="\n\n",
    end="\n\n",
)

Year
2016    9.043720e+05
2017    1.019331e+06
2018    1.013338e+06
2019    9.732211e+05
2020    1.038691e+06
Name: Weighted House Price, dtype: float64

Year
2016    583393.30992
2017    667420.37968
2018    692980.90856
2019    653007.79368
2020    692429.00360
Name: Weighted House Price, dtype: float64

Year
2016    9.439010e+05
2017    1.070509e+06
2018    1.070244e+06
2019    1.025545e+06
2020    1.100004e+06
Name: Weighted House Price, dtype: float64



In [261]:
# Per-year cut-offs for the price levels: the .333 quantile is the lower
# threshold and the .667 quantile the upper one, computed within each year.
low_weighted_house_price = (
    merged_data["Weighted House Price"].groupby(level="Year").quantile(q=.333)
)
high_weighted_house_price = (
    merged_data["Weighted House Price"].groupby(level="Year").quantile(q=.667)
)

# Show which years have a threshold available.
print(low_weighted_house_price.index.get_level_values("Year"))

def weighted_house_price_level_generator(year, price):
    """Classify a weighted house price as "Low", "Medium" or "High" for a year.

    The cut-offs are read from the module-level Series
    ``low_weighted_house_price`` and ``high_weighted_house_price``, both
    indexed by "Year".

    Parameters
    ----------
    year
        Year whose thresholds should be used; compared with ``==`` against
        the "Year" index values of the threshold Series.
    price
        Weighted house price to classify.

    Returns
    -------
    str
        "Low" when ``price`` is below the lower threshold, "Medium" when it is
        at or above the lower threshold and below the upper one, and "High"
        in every other case (this includes a price that fails both
        comparisons, such as NaN).

    Raises
    ------
    IndexError
        If either threshold Series has no entry for ``year``.
    """
    lower_year_mask = low_weighted_house_price.index.get_level_values("Year") == year
    upper_year_mask = high_weighted_house_price.index.get_level_values("Year") == year

    # Take the first (expected: only) threshold matching the requested year.
    lower_cutoff = low_weighted_house_price[lower_year_mask].values[0]
    upper_cutoff = high_weighted_house_price[upper_year_mask].values[0]

    if price < lower_cutoff:
        return "Low"
    if lower_cutoff <= price < upper_cutoff:
        return "Medium"
    return "High"


Index([2016, 2017, 2018, 2019, 2020], dtype='int64', name='Year')


In [262]:
# Label every (LGA, Year) row with its price level for that year.
#
# The whole column is built in a single assignment. Pre-filling it with
# np.nan creates a float64 column, and writing the string labels into it
# cell by cell is an incompatible-dtype assignment that pandas has deprecated
# (it emits a FutureWarning). A single assignment also avoids the row-by-row
# iterrows()/.loc round trips.
merged_data["Weighted House Price Level"] = [
    weighted_house_price_level_generator(year, price)
    for year, price in zip(
        merged_data.index.get_level_values("Year"),
        merged_data["Weighted House Price"],
    )
]

merged_data

  merged_data.loc[index, "Weighted House Price Level"] = weighted_house_price_level_generator(


Unnamed: 0_level_0,Unnamed: 1_level_0,Weighted House Price,Type A Percentage,Type B Percentage,Type C Percentage,Type D Percentage,Type E Percentage,Offence Rate,Total Loss On EGM,EGM Rate,Weighted House Price Level
LGA,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Yarra,2016,1278619.71,9.86,69.94,6.33,6.51,7.10,15074.17,3.299235e+07,401.04,High
Moonee Valley,2016,1014690.20,12.11,67.44,5.70,5.18,9.41,7827.72,7.540108e+07,651.95,High
Brimbank,2016,538872.26,13.47,59.99,7.65,6.13,12.63,9375.09,1.430457e+08,689.02,Low
Port Phillip,2016,1564173.34,11.26,69.41,5.90,6.31,6.84,12520.83,2.809594e+07,322.63,High
Ballarat,2016,334852.68,15.16,61.04,4.03,5.67,13.86,11679.23,5.461125e+07,466.93,Low
...,...,...,...,...,...,...,...,...,...,...,...
Melbourne,2020,1437999.93,12.28,48.29,8.97,8.05,8.44,21397.63,6.055093e+07,498.27,High
Frankston,2020,660194.72,15.29,40.30,8.99,7.12,19.05,10397.28,4.630308e+07,350.90,Low
Nillumbik,2020,950270.23,16.40,50.02,6.73,7.35,15.26,3555.55,2.778951e+07,378.91,Medium
Whittlesea,2020,624857.68,16.84,49.33,8.47,4.57,14.82,7268.01,6.327701e+07,378.91,Low


In [263]:
# Knn

# Year-based split: fit on 2016, 2018 and 2019; hold out 2017 and 2020.
train = pd.concat([
    merged_data[merged_data.index.get_level_values("Year") == 2016],
    merged_data[merged_data.index.get_level_values("Year") == 2018],
    merged_data[merged_data.index.get_level_values("Year") == 2019],
])

test = pd.concat([
    merged_data[merged_data.index.get_level_values("Year") == 2017],
    merged_data[merged_data.index.get_level_values("Year") == 2020],
])

# Predictor columns and the target column.
X_COLS = [
    "Offence Rate",
    "EGM Rate"
]

y_COL = ["Weighted House Price Level"]


X_train = train[X_COLS]
y_train = train[y_COL]

X_test = test[X_COLS]
y_test = test[y_COL]

# NOTE(review): the two predictors appear to be on different numeric scales,
# so the distance used by KNN may be dominated by one of them -- consider
# standardising the features before fitting (TODO confirm).
knn = KNeighborsClassifier(n_neighbors=3)

# y_train / y_test are single-column DataFrames (selected with a list of
# column names). scikit-learn expects a 1-D target, so flatten them at the
# call sites; passing the column vector directly triggers a
# DataConversionWarning.
knn.fit(X_train, y_train.to_numpy().ravel())

# Calculate accuracy score for the test data
y_pred = knn.predict(X_test)
accuracy = knn.score(X_test, y_test.to_numpy().ravel())

accuracy

  return self._fit(X, y)


0.696969696969697