<a href="https://colab.research.google.com/github/VineetSivadasan/houses/blob/main/Intro_to_Ensemble_Models_and_Regression_(KNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal as mvn
import seaborn as sns
from geopy import Nominatim

# Load the cleaned housing table from Google Drive and preview it.
# The path is the Colab Drive mount point, so Drive must already be mounted
# at /content/drive before this cell runs.
HOUSES_CSV = '/content/drive/MyDrive/Colab Notebooks/houses_cleaned.csv'
houses_cleaned = pd.read_csv(HOUSES_CSV)
houses_cleaned

In [None]:
# (number of rows, number of columns) of the loaded dataset
houses_cleaned.shape

In [None]:
# dtype of every column; object columns hold strings or mixed values
houses_cleaned.dtypes

In [None]:
# True if at least one cell anywhere in the frame is missing (NaN/None)
checking_NaN = houses_cleaned.isnull().values.any() 
checking_NaN

In [None]:
# Select the rows in which any cell equals the literal string 'None'.
# This is a text placeholder, so the isnull() check above does not see it.
# Only the shape of the selection is displayed, not the rows themselves.
print_None_rows = houses_cleaned[houses_cleaned.eq('None').any(axis=1)]  
print_None_rows.shape

In [None]:
# Remove rows whose HOA value is greater than the cut-off.
# One threshold drives both selections so they are exact complements: the
# previous `> 550` / `< 551` pair put fractional values such as 550.5 into
# both the "large" set and the kept set.
# Rows with a missing HOA satisfy neither comparison, so they are still
# dropped from the kept frame, exactly as before.
HOA_MAX = 550
print_large_rows = houses_cleaned[houses_cleaned["HOA"] > HOA_MAX]
houses_cleaned = houses_cleaned[houses_cleaned["HOA"] <= HOA_MAX]
houses_cleaned



In [None]:
# Show only the columns that contain the literal string "None" somewhere:
# build a per-column boolean flag, then keep the entries that are True.
column_has_none = houses_cleaned.eq("None").any()
column_has_none[column_has_none]

In [None]:
# Correlation matrix of the numeric feature columns.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.corr() no
# longer drops non-numeric columns silently and instead raises as soon as
# the frame contains an object/string column.
corr_matrix = houses_cleaned.corr(numeric_only=True)
corr_matrix.style.background_gradient(cmap='coolwarm')


In [None]:
# Draw the correlation matrix computed above as a seaborn heatmap.
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, ax=ax)

In [None]:
# Allocate an (n_rows, 2) float array of zeros, one row per row of the
# filtered houses_cleaned frame (the whole dataset, not a training subset).
X=np.zeros((len(houses_cleaned),2))
X

In [None]:
# Fill X from two columns of houses_cleaned, selected by position.
# NOTE(review): positions 3 and 4 are assumed to be longitude and latitude,
# as the inline comments say - confirm against the CSV header.
X[:,0]=houses_cleaned.iloc[:,3] #longitude
X[:,1]=(houses_cleaned.iloc[:,4]) #latitude
X

In [None]:
X[np.argmax(X[:,0])][0], X[np.argmax(X[:,0])][1] #find longitude and latitude where max value for longitude is

In [None]:
X[np.argmax(X[:,1])][0], X[np.argmax(X[:,1])][1] #find longitude and latitude where max value for latitude is

In [None]:
# Scatter of every listing by coordinate, with three rows highlighted:
#   red    - row with the largest longitude
#   yellow - row with the largest latitude
#   brown  - row 2220 of X (hard-coded position)
fig, ax = plt.subplots()
idx_max_lon = np.argmax(X[:, 0])
idx_max_lat = np.argmax(X[:, 1])
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[idx_max_lon, 0], X[idx_max_lon, 1], color="red")
ax.scatter(X[idx_max_lat, 0], X[idx_max_lat, 1], color="yellow")
ax.scatter(X[2220, 0], X[2220, 1], color="brown")
ax.set_xlabel("longitude")
ax.set_ylabel("latitude")

In [None]:
# Take the column at position 15 of houses_cleaned and cast it to int.
# NOTE(review): position 15 is assumed to be the "HOA" column that the
# filter cell addresses by name - confirm against the CSV header.
HOA_data = (houses_cleaned.iloc[:,15]).astype(int)
print(HOA_data)

In [314]:
# Keep only HOA values inside the range [0, 20000]. No grouping happens
# here: each distinct integer HOA value is later used as its own class.
HOA_data2 = HOA_data[(HOA_data >= 0) & (HOA_data <=20000)]
# The coordinate matrix X has one row per row of houses_cleaned. If this
# filter removed anything, the labels would no longer line up with X row
# for row, so fail loudly instead of training on shifted labels.
if len(HOA_data2) != len(HOA_data):
  raise ValueError(
    "HOA range filter dropped %d row(s); X must be filtered the same way "
    "before fitting" % (len(HOA_data) - len(HOA_data2))
  )

In [None]:
# y holds the range-filtered HOA values; they are the KNN class labels
y = HOA_data2
y

In [None]:
# Convert the HOA Series to a plain numpy array (drops the pandas index)
y = y.to_numpy()
y

In [None]:
# number of labels
y.shape

In [None]:
# element type of the label array
y.dtype

In [320]:
# Cast the labels to int; np.bincount in the classifier indexes by label value
y = y.astype(int)

In [None]:
# largest label value
y.max()

In [None]:
# smallest label value
y.min()

In [None]:
# Frequency histogram of the raw values in column 15 of houses_cleaned
# (the column used as HOA throughout the notebook).
fig, ax = plt.subplots()
ax.hist(houses_cleaned.iloc[:, 15])
ax.set_xlabel("HOA value")
ax.set_ylabel("frequency")

In [329]:
class KNNClassifier():
  """Distance-weighted k-nearest-neighbours classifier.

  Labels must be non-negative integers because votes are tallied with
  np.bincount, which indexes its result by label value.
  """

  def fit(self,X,y):
    """Store the training set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,), non-negative integer labels

    Raises
    ------
    ValueError
      If X is not 2-D, y is not 1-D, or their lengths differ. A silent
      length mismatch would pair neighbours with the wrong labels.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    if X.ndim != 2:
      raise ValueError("X must be 2-D (n_samples, n_features)")
    if y.ndim != 1 or len(y) != len(X):
      raise ValueError("y must be 1-D with one label per row of X")
    self.X=X
    self.y=y
  
  def predict(self, X,K, epsilon =1e-3):
    """Predict a label for every row of X.

    Parameters
    ----------
    X : array-like, shape (n_queries, n_features)
    K : int, number of neighbours that vote (must be >= 1)
    epsilon : float, added to each distance so an exact match (distance 0)
      gets a large finite weight instead of a division by zero

    Returns
    -------
    numpy.ndarray of float, shape (n_queries,), holding the winning labels.
    On a tie np.argmax picks the smallest label.

    Raises
    ------
    ValueError
      If K is smaller than 1.
    """
    if K < 1:
      raise ValueError("K must be at least 1")
    X = np.asarray(X, dtype=float)
    N=len(X)
    y_hat = np.zeros(N)

    for i in range(N):
      # squared Euclidean distance from query i to every training row
      dist2 = np.sum((self.X-X[i])**2, axis=1)
      # indices of the K closest training rows
      idxt = np.argsort(dist2)[:K]
      # inverse-distance weights: closer neighbours count more
      gamma_k = 1/(np.sqrt(dist2[idxt])+epsilon)
      # weighted vote per label; the label with the largest total wins
      y_hat[i]=np.bincount(self.y[idxt], weights=gamma_k).argmax()
    return y_hat


In [608]:
def accuracy(y, y_hat):
  """Return the fraction of positions at which y and y_hat agree.

  Both inputs are converted to arrays and flattened before comparing.
  Without that, two Python lists compare as whole objects (giving 0 or 1),
  and an (N, 1) column against an (N,) vector broadcasts to an N x N
  comparison.

  Raises
  ------
  ValueError
    If the two inputs do not hold the same number of elements.
  """
  y = np.asarray(y).ravel()
  y_hat = np.asarray(y_hat).ravel()
  if y.size != y_hat.size:
    raise ValueError("y and y_hat must contain the same number of elements")
  return np.mean(y == y_hat)

In [None]:
accuracy(y, y_hat)

In [330]:
knn = KNNClassifier()

In [331]:
knn.fit(X,y)

In [None]:
# predict 
y_hat = knn.predict(X,200)
y_hat

In [None]:
# plot of longitude and latitude vs predicted HOA 
plt.figure()
plt.plot(X,y_hat)
plt.xlabel("longitude and latitude")
plt.ylabel("predicted HOA")

In [None]:
#plot of predicted y values against frequency
plt.figure()
plt.hist((y_hat),200)
plt.ylabel("frequency")
plt.xlabel("predicted HOA")

# Adding the new classified "longitude and latitude as a function of HOA" column to the houses dataset

In [None]:
# Z is another name for the y_hat array (no copy is made).
# The astype(int) result is only displayed; it is not assigned, so Z
# keeps y_hat's float dtype.
Z = y_hat
Z.astype(int)

In [None]:
# Transposing a 1-D array leaves it unchanged; the shape stays (n_rows,)
Z = Z.T
Z.shape

In [370]:
# turn the cleaned houses dataframe to a numpy array (column names are lost from here on)
houses_cleaned_classified = houses_cleaned.to_numpy()

In [None]:
# (n_rows, n_columns) of the array before the new column is added
houses_cleaned_classified.shape

In [None]:
# Append the KNN-predicted HOA class as one extra right-most column.
# np.atleast_2d(Z).T turns the 1-D vector into an (n_rows, 1) column so that
# hstack can join it to the 2-D data array.
houses_cleaned_classified_new = np.hstack((houses_cleaned_classified, np.atleast_2d(Z).T))
houses_cleaned_classified_new

In [None]:
# one column more than houses_cleaned_classified.shape above
houses_cleaned_classified_new.shape

In [None]:
# Convert the joined array back into a dataframe. No column names are
# passed, so the columns are labelled 0, 1, 2, ... by position.
houses_cleaned_classified_new_df = pd.DataFrame(data =houses_cleaned_classified_new)
houses_cleaned_classified_new_df.head()

## Splitting the newly combined dataset into training, validation and test sets

In [375]:
# constructing training, validation and test sets
def data_train_validate_test(df, training_set_percent=0.8, validation_set_percent=0.1, seed=None, verbose=False):
  """Randomly split a dataframe into training, validation and test frames.

  Parameters
  ----------
  df : pandas.DataFrame
    Data to split; rows are selected by position, so any index works.
  training_set_percent : float
    Fraction of rows (0..1) placed in the training frame.
  validation_set_percent : float
    Fraction of rows (0..1) placed in the validation frame. Whatever is
    left after training and validation becomes the test frame.
  seed : int or None
    Passed to np.random.seed, so the global NumPy random state is reseeded
    on every call. The same seed reproduces the same split.
  verbose : bool
    When True, print the permutation and the two split boundaries.

  Returns
  -------
  (training, validation, test) : tuple of pandas.DataFrame

  Raises
  ------
  ValueError
    If a fraction lies outside [0, 1] or the two fractions sum to more than 1.
  """
  if not 0 <= training_set_percent <= 1 or not 0 <= validation_set_percent <= 1:
    raise ValueError("split fractions must lie between 0 and 1")
  # small tolerance so sums such as 0.7 + 0.3 are not rejected by float rounding
  if training_set_percent + validation_set_percent > 1 + 1e-9:
    raise ValueError("training and validation fractions must not sum to more than 1")

  np.random.seed(seed)  #same seed -> same permutation on every call
  length = len(df)  #number of rows in the dataset
  # Shuffle row POSITIONS, not index labels: the rows are picked with iloc
  # below, which is positional. Shuffling df.index only worked when the
  # index happened to be 0..n-1 (and gives the same order in that case).
  permutation = np.random.permutation(length)
  training_set_end = int(training_set_percent * length)  #size of the training set
  validation_set_end = int(validation_set_percent * length) + training_set_end  #size of the validation set + training set
  if verbose:
    print(permutation)
    print(training_set_end)
    print(validation_set_end)
  training = df.iloc[permutation[:training_set_end]]  #first block of shuffled rows
  validation = df.iloc[permutation[training_set_end:validation_set_end]]  #next block
  test = df.iloc[permutation[validation_set_end:]]  #everything that is left
  return training, validation, test

In [None]:
houses_cleaned_classified_new_df.head()



In [None]:
training, validation, test = data_train_validate_test(houses_cleaned_classified_new_df)

In [None]:
# (rows, columns) of each of the three splits
training.shape, validation.shape, test.shape

In [None]:
# Convert each split to a numpy array so that columns can be picked by
# integer position in the cells below.
training_numpy = training.to_numpy()
training_numpy

In [None]:
test_numpy = test.to_numpy()
test_numpy

In [None]:
validation_numpy = validation.to_numpy()
validation_numpy

In [None]:
# Regression targets for the three splits: column 15 (the HOA column), kept
# two-dimensional as (n_rows, 1) by indexing with a list, and cast to float.
HOA_COLUMN = [15]
Y_MULTI, Y_MULTI_TEST, Y_MULTI_VALIDATION = (
    split_array[:, HOA_COLUMN].astype(float)
    for split_array in (training_numpy, test_numpy, validation_numpy)
)

Y_MULTI #training y, i.e. the HOA column

In [None]:
X_MULTI = training_numpy[:,[0,1,2,5,6,7,8,9,10,11,13,16]].astype(float) #multivariate linear regression
X_MULTI_TEST = test_numpy[:,[0,1,2,5,6,7,8,9,10,11,13,16]].astype(float) #multivariate linear regression
X_MULTI_VALIDATION = validation_numpy[:,[0,1,2,5,6,7,8,9,10,11,13,16]].astype(float) #multivariate linear regression

X_MULTI #training X, i.e. all columns except longitude, latitude (which is being replaced with the new classified column) and HOA column

In [None]:
X_MULTI.shape

In [568]:
def OLS(Y,Y_hat, N):
  """Half mean squared error: sum((Y - Y_hat)**2) / (2 * N).

  Parameters
  ----------
  Y : array-like, observed values
  Y_hat : array-like, predicted values
  N : int, number of samples used as the divisor

  When Y and Y_hat hold the same number of elements but differ in shape
  (for example (N, 1) against (N,)), Y_hat is reshaped to match Y. NumPy
  would otherwise broadcast the pair to an N x N difference matrix. Scalar
  inputs still broadcast as usual.
  """
  Y = np.asarray(Y, dtype=float)
  Y_hat = np.asarray(Y_hat, dtype=float)
  if Y.shape != Y_hat.shape and Y.size == Y_hat.size:
    Y_hat = Y_hat.reshape(Y.shape)
  return (1/(2*N)*np.sum((Y-Y_hat)**2))

def R2(Y, Y_hat):
  """Coefficient of determination: 1 - SS_residual / SS_total.

  Parameters
  ----------
  Y : array-like, observed values
  Y_hat : array-like, predicted values

  When Y and Y_hat hold the same number of elements but differ in shape
  (for example (N, 1) against (N,)), Y_hat is reshaped to match Y. NumPy
  would otherwise broadcast the pair to an N x N difference matrix and
  return a meaningless score. Scalar inputs still broadcast as usual.
  If Y is constant, SS_total is zero and the result is not finite.
  """
  Y = np.asarray(Y, dtype=float)
  Y_hat = np.asarray(Y_hat, dtype=float)
  if Y.shape != Y_hat.shape and Y.size == Y_hat.size:
    Y_hat = Y_hat.reshape(Y.shape)
  return (1-(np.sum((Y-Y_hat)**2)/np.sum((Y-np.mean(Y))**2)))

In [569]:
class LinearRegression():
  """Ordinary least-squares linear regression without an intercept term.

  The fitted weights are stored in self.w; predictions are X @ self.w.
  """

  def fit(self,X,y):
    """Fit the weights that minimise ||X @ w - y||^2.

    np.linalg.lstsq is used instead of solving the normal equations
    (X.T @ X) w = X.T @ y. The normal-equation solve raises on collinear
    features and loses precision when the feature columns have very
    different scales. lstsq returns the same weights for well-conditioned
    full-rank X, and the minimum-norm solution when X is rank deficient.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,) or (n_samples, n_targets)
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    self.w = np.linalg.lstsq(X, y, rcond=None)[0]
  
  def predict(self,X):
    """Return X @ self.w for the rows of X; fit() must have been called first."""
    return np.matmul(X, self.w)

In [570]:
# Create the regression model
lr1=LinearRegression()

In [571]:
# Fit the weights on the training split only
lr1.fit(X_MULTI,Y_MULTI)

In [572]:
y_hat2=lr1.predict(X_MULTI) #predicted HOA values in the training set

In [573]:
y_hat2_test=lr1.predict(X_MULTI_TEST)  #predicted HOA values in the test set

In [574]:
y_hat2_validation=lr1.predict(X_MULTI_VALIDATION)   #predicted HOA values in the validation set

In [None]:
# R^2 on the training split
R2(Y_MULTI,y_hat2)

In [None]:
# R^2 on the test split
R2(Y_MULTI_TEST,y_hat2_test)

In [None]:
# R^2 on the validation split
R2(Y_MULTI_VALIDATION,y_hat2_validation)

In [None]:
# Last feature column of X_MULTI: the KNN-classified longitude/latitude
# column (original column 16)
X_MULTI[:,11]

In [None]:
plt.figure()
plt.scatter(X_MULTI[:,11],y_hat2)
plt.xlabel("classified longitude and latitude by HOA")
plt.ylabel("predicted HOA")

In [None]:
plt.figure()
plt.scatter(X_MULTI[:,11],Y_MULTI)
plt.xlabel("classified longitude and latitude by HOA")
plt.ylabel("actual HOA")

In [None]:
plt.figure()
plt.scatter(X_MULTI[:,2],y_hat2, color ="red")
plt.xlabel("zip code")
plt.ylabel("predicted HOA")

In [None]:
plt.figure()
plt.scatter(X_MULTI[:,2],Y_MULTI, color ="red")
plt.xlabel("zip code")
plt.ylabel(" HOA")

In [None]:
# Overlay several feature columns against the predicted HOA on one axis.
# NOTE(review): the indices below are positions inside X_MULTI, which was
# built from original columns [0,1,2,5,6,7,8,9,10,11,13,16]. Positions 5-10
# therefore hold original columns 7, 8, 9, 10, 11 and 13. The feature names
# in the trailing comments look like they follow the ORIGINAL numbering -
# confirm before relying on them.
fig = plt.figure()
ax1 = fig.add_subplot(111)
#ax1.scatter(X_MULTI[:,0],y_hat2, color = "red")  #MLS vs predicted HOA
#ax1.scatter(X_MULTI[:,1],y_hat2, color = "orange")  #sold price vs predicted HOA
#ax1.scatter(X_MULTI[:,2],y_hat2, color = "yellow")  #zipcode vs predicted HOA
#ax1.scatter(X_MULTI[:,5],y_hat2, color = "purple")  #lot_acres vs predicted HOA
ax1.scatter(X_MULTI[:,6],y_hat2, color = "brown")  #taxes vs predicted HOA
ax1.scatter(X_MULTI[:,7],y_hat2, color = "violet")  #year_built vs predicted HOA
#ax1.scatter(X_MULTI[:,8],y_hat2, color = "pink")  #bedrooms vs predicted HOA
ax1.scatter(X_MULTI[:,9],y_hat2, color = "black")  #bathrooms vs predicted HOA
ax1.scatter(X_MULTI[:,10],y_hat2, color = "grey")  #labelled "bathrooms" a second time in the original - TODO confirm which feature this is
#ax1.scatter(X_MULTI[:,11],y_hat2, color ="blue") #combined column of longitude and latitude against HOA vs predicted HOA


plt.xlabel("feature value")
plt.ylabel("predicted HOA")

# plt.show was referenced without parentheses, so it was never called
plt.show()

In [None]:
# Pairwise scatter/histogram grid over the columns of the training split.
# NOTE(review): training comes from a dataframe rebuilt from a numpy array
# without column names, so the panels are labelled by integer position -
# confirm the columns still have numeric dtypes for seaborn to plot.
sns.pairplot(training)