In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import multivariate_normal as mvn

import warnings
warnings.filterwarnings("ignore")

import geopy
from geopy.geocoders import Nominatim

In [2]:
# read the data file
ncdata1 = pd.read_csv("~/Desktop/Work/ML_EIT/Data/raw_house_data.csv")
ncdata1.head()

Unnamed: 0,MLS,sold_price,zipcode,longitude,latitude,lot_acres,taxes,year_built,bedrooms,bathrooms,sqrt_ft,garage,kitchen_features,fireplaces,floor_covering,HOA
0,21530491,5300000.0,85637,-110.3782,31.356362,2154.0,5272.0,1941,13,10,10500.0,0.0,"Dishwasher, Freezer, Refrigerator, Oven",6,"Mexican Tile, Wood",0.0
1,21529082,4200000.0,85646,-111.045371,31.594213,1707.0,10422.36,1997,2,2,7300.0,0.0,"Dishwasher, Garbage Disposal",5,"Natural Stone, Other",0.0
2,3054672,4200000.0,85646,-111.040707,31.594844,1707.0,10482.0,1997,2,3,,,"Dishwasher, Garbage Disposal, Refrigerator",5,"Natural Stone, Other: Rock",
3,21919321,4500000.0,85646,-111.035925,31.645878,636.67,8418.58,1930,7,5,9019.0,4.0,"Dishwasher, Double Sink, Pantry: Butler, Refri...",4,"Ceramic Tile, Laminate, Wood",
4,21306357,3411450.0,85750,-110.813768,32.285162,3.21,15393.0,1995,4,6,6396.0,3.0,"Dishwasher, Garbage Disposal, Refrigerator, Mi...",5,"Carpet, Concrete",55.0


In [3]:
print(ncdata1.shape)
ncdata1.info()

(5000, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MLS               5000 non-null   int64  
 1   sold_price        5000 non-null   float64
 2   zipcode           5000 non-null   int64  
 3   longitude         5000 non-null   float64
 4   latitude          5000 non-null   float64
 5   lot_acres         4990 non-null   float64
 6   taxes             5000 non-null   float64
 7   year_built        5000 non-null   int64  
 8   bedrooms          5000 non-null   int64  
 9   bathrooms         5000 non-null   object 
 10  sqrt_ft           5000 non-null   object 
 11  garage            5000 non-null   object 
 12  kitchen_features  5000 non-null   object 
 13  fireplaces        5000 non-null   object 
 14  floor_covering    5000 non-null   object 
 15  HOA               5000 non-null   object 
dtypes: float64(5), int64(4), object

In [4]:
# Data variables
ncdata1.columns

Index(['MLS', 'sold_price', 'zipcode', 'longitude', 'latitude', 'lot_acres',
       'taxes', 'year_built', 'bedrooms', 'bathrooms', 'sqrt_ft', 'garage',
       'kitchen_features', 'fireplaces', 'floor_covering', 'HOA'],
      dtype='object')

In [5]:
# Drop the 'MLS", kitchen_features', 'floor_covering'
ncdata1 = ncdata1.drop(columns = ['MLS', 'kitchen_features', 'floor_covering',"fireplaces","HOA"])
# Data shape (number of rwos and columns)
print(ncdata1.shape)

(5000, 11)


In [6]:
ncdata1.columns

Index(['sold_price', 'zipcode', 'longitude', 'latitude', 'lot_acres', 'taxes',
       'year_built', 'bedrooms', 'bathrooms', 'sqrt_ft', 'garage'],
      dtype='object')

In [7]:
# Replacing any special characters to NaN values in the entire data frame
ncdata1 = ncdata1.replace(r'^\s*$',np.nan, regex = True)
# Replace "none' to "NaN" values in the data frame

col_none = list(ncdata1.columns)
ncdata1 = ncdata1[col_none].replace(['None'],'NaN')
ncdata1.replace(0., np.nan, inplace= True)

In [8]:
# convert columns: object to float  
cols = ['sqrt_ft', 'garage', 'bathrooms']
ncdata1[cols] = ncdata1[cols].astype('float')
ncdata1.dtypes

sold_price    float64
zipcode         int64
longitude     float64
latitude      float64
lot_acres     float64
taxes         float64
year_built      int64
bedrooms        int64
bathrooms     float64
sqrt_ft       float64
garage        float64
dtype: object

In [9]:
#since all the columns with missing data were skewed
ncdata1['lot_acres'] = ncdata1['lot_acres'].fillna(ncdata1['lot_acres'].mode()[0])
ncdata1['sqrt_ft'] = ncdata1['sqrt_ft'].fillna(ncdata1['sqrt_ft'].mode()[0])
# ncdata1['fireplaces'] = ncdata1['fireplaces'].fillna(ncdata1['fireplaces'].mode()[0])
ncdata1['garage'] = ncdata1['garage'].fillna(ncdata1['garage'].mode()[0])
ncdata1['bathrooms'] = ncdata1['bathrooms'].fillna(ncdata1['bathrooms'].mode()[0])

In [10]:
# checking the outliers
print("Badrooms (min, max)   : ",min(ncdata1.bedrooms),max(ncdata1.bedrooms))
print("Bathrooms (min, max)  : ",min(ncdata1.bathrooms),max(ncdata1.bathrooms))
print("Square_foot (min, max): ",min(ncdata1.sqrt_ft),max(ncdata1.sqrt_ft))
print("Year built (min, max) : ",min(ncdata1.year_built),max(ncdata1.year_built))
print("garage (min, max) : ",min(ncdata1.garage),max(ncdata1.garage))

Badrooms (min, max)   :  1 36
Bathrooms (min, max)  :  1.0 36.0
Square_foot (min, max):  1100.0 22408.0
Year built (min, max) :  0 2019
garage (min, max) :  0.0 30.0


In [11]:
#Removing the outlier rows in the datasets
# print(bdata[bdata.bedrooms > 30])
ncdata1.drop(ncdata1[ncdata1.bedrooms > 30].index, inplace = True) 
ncdata1.drop(ncdata1[ncdata1.bathrooms > 30].index, inplace = True) 
ncdata1.drop(ncdata1[ncdata1.year_built <= 0].index, inplace = True) 
ncdata1.drop(ncdata1[ncdata1.year_built >= 2020].index, inplace = True)
print(ncdata1.shape)

(4991, 11)


In [12]:
# checking the outliers
print("Badrooms (min, max)   : ",min(ncdata1.bedrooms),max(ncdata1.bedrooms))
print("Bathrooms (min, max)  : ",min(ncdata1.bathrooms),max(ncdata1.bathrooms))
print("Square_foot (min, max): ",min(ncdata1.sqrt_ft),max(ncdata1.sqrt_ft))
print("Year built (min, max) : ",min(ncdata1.year_built),max(ncdata1.year_built))

Badrooms (min, max)   :  1 19
Bathrooms (min, max)  :  1.0 18.0
Square_foot (min, max):  1100.0 12808.0
Year built (min, max) :  1893 2019


In [13]:
# Replace Nan to mean value (important)
# ncdata.fillna(ncdata.mean())
ncdata1.fillna(ncdata1.median(), inplace=True)

In [14]:
ncdata1.isnull().sum()

sold_price    0
zipcode       0
longitude     0
latitude      0
lot_acres     0
taxes         0
year_built    0
bedrooms      0
bathrooms     0
sqrt_ft       0
garage        0
dtype: int64

In [15]:
# Picking the lon/latitude values near place
geolocator=Nominatim(user_agent='myapp')

In [16]:
# from geopy.geocoders import Nominatim
# address="10441 E Port Townsend St Tucson, AZ 85747"
address="4650, West Lone Ridge Place, Marana, Pima County, Arizona, 85658"
# address="5000, East Via Estancia, Catalina, Pima County, Arizona, 85739"
# address = "4650, West Lone Ridge Place, Marana, Pima County, Arizona, 85658"
# find the lon/lat using address
mlocation = geolocator.geocode(address)
mlat, mlon = mlocation.latitude, mlocation.longitude
print(mlocation.latitude, mlocation.longitude)

32.464260544113415 -111.06449135902294


In [17]:
indx = np.where(((ncdata1.latitude >= (mlat-0.2)) & (ncdata1.latitude < (mlat+0.2))) &((ncdata1.longitude > (mlon-0.3)) & (ncdata1.longitude < (mlon+0.3))))[0]
print(len(indx))

# Picking the the selected region indices and write into another Table 
ncdata2 = ncdata1.iloc[indx,:]
ncdata2.shape

3783


(3783, 11)

In [18]:
ncdata2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3783 entries, 4 to 4998
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sold_price  3783 non-null   float64
 1   zipcode     3783 non-null   int64  
 2   longitude   3783 non-null   float64
 3   latitude    3783 non-null   float64
 4   lot_acres   3783 non-null   float64
 5   taxes       3783 non-null   float64
 6   year_built  3783 non-null   int64  
 7   bedrooms    3783 non-null   int64  
 8   bathrooms   3783 non-null   float64
 9   sqrt_ft     3783 non-null   float64
 10  garage      3783 non-null   float64
dtypes: float64(8), int64(3)
memory usage: 354.7 KB


In [19]:
# Normalize entire data frame except long and latitude columns: Panda
def normalize(dataset):
    dataNorm=((dataset-dataset.min())/(dataset.max()-dataset.min()))
#     print("Minimum: ",dataset.min())
#     dataNorm["longitude"] = dataset["longitude"]
#     dataNorm["latitude"] = dataset["latitude"]
    return dataNorm

In [20]:
ncdata =normalize(ncdata2)
print(ncdata.shape)
# ncdata.sample(5)
ncdata[0:5]

(3783, 11)


Unnamed: 0,sold_price,zipcode,longitude,latitude,lot_acres,taxes,year_built,bedrooms,bathrooms,sqrt_ft,garage
4,0.913218,0.962121,0.883988,0.064159,0.030977,0.00126,0.730337,0.333333,0.5,0.498579,0.1
5,0.864662,0.719697,0.655114,0.229899,0.016023,0.002276,0.775281,0.222222,0.3,0.543849,0.1
7,0.639098,0.962121,0.772337,0.206889,0.010196,0.001772,0.910112,0.555556,0.7,0.754872,0.133333
8,1.0,0.719697,0.651419,0.243763,0.065158,0.002054,0.808989,0.444444,0.6,0.381039,0.1
9,0.864662,0.962121,0.826827,0.194509,0.034084,0.00155,0.865169,0.444444,0.5,0.507105,0.1


In [21]:
# revert the normalize entire data frame except long and latitude columns: Panda
# pass normalized data and original data(ncdata2 in this file)
def revertnormalize(normdata,orgdata):
    revNorm = ((normdata)*(orgdata.max()-orgdata.min())) + orgdata.min()
#     print("Minimum: ",datasett.min())
#     revNorm["longitude"] = datasett["longitude"]
#     revNorm["latitude"] = datasett["latitude"]
    return revNorm

In [22]:
# ncdata2[0:5]

In [23]:
revncdata =revertnormalize(ncdata,ncdata2)
print(revncdata.shape)
# revncdata.sample(5)
revncdata[0:5]

(3783, 11)


Unnamed: 0,sold_price,zipcode,longitude,latitude,lot_acres,taxes,year_built,bedrooms,bathrooms,sqrt_ft,garage
4,3411450.0,85750.0,-110.813768,32.285162,3.21,15393.0,1995.0,4.0,6.0,6396.0,3.0
5,3250000.0,85718.0,-110.910593,32.33909,1.67,27802.84,1999.0,3.0,4.0,6842.0,3.0
7,2500000.0,85750.0,-110.861002,32.331603,1.07,21646.0,2011.0,6.0,8.0,8921.0,4.0
8,3700000.0,85718.0,-110.912156,32.343601,6.73,25094.39,2002.0,5.0,7.0,5238.0,3.0
9,3250000.0,85750.0,-110.83795,32.327575,3.53,18936.11,2007.0,5.0,6.0,6480.0,3.0


In [24]:
# selecting the X and y values
cols = ['longitude','latitude','bedrooms','bathrooms','sqrt_ft']
sel_feat = ncdata2[cols]
sel_price = ncdata2['sold_price']
print(sel_feat.shape)

##Actual selected features
sel_feat[0:5]

(3783, 5)


Unnamed: 0,longitude,latitude,bedrooms,bathrooms,sqrt_ft
4,-110.813768,32.285162,4,6.0,6396.0
5,-110.910593,32.33909,3,4.0,6842.0
7,-110.861002,32.331603,6,8.0,8921.0
8,-110.912156,32.343601,5,7.0,5238.0
9,-110.83795,32.327575,5,6.0,6480.0


In [25]:
## Normalizing the selected features
norm_feat =normalize(sel_feat)
print(norm_feat.shape)
norm_feat[0:5]

norm_price = normalize(sel_price)
norm_price[0:5]

(3783, 5)


4    0.913218
5    0.864662
7    0.639098
8    1.000000
9    0.864662
Name: sold_price, dtype: float64

In [26]:
# Reverting the normalized features
revrt_feat = revertnormalize(norm_feat,sel_feat)
print(revrt_feat.shape)
# revrt_feat.sample(5)
revrt_feat[0:5]

(3783, 5)


Unnamed: 0,longitude,latitude,bedrooms,bathrooms,sqrt_ft
4,-110.813768,32.285162,4.0,6.0,6396.0
5,-110.910593,32.33909,3.0,4.0,6842.0
7,-110.861002,32.331603,6.0,8.0,8921.0
8,-110.912156,32.343601,5.0,7.0,5238.0
9,-110.83795,32.327575,5.0,6.0,6480.0


In [27]:
colls = ['longitude','latitude','bedrooms','bathrooms','sqrt_ft']
features = norm_feat[colls].to_numpy()
prices = norm_price.to_numpy(dtype='int64')
# prices2 = ncdata1["sold_price"]
print(prices)
print(features[0:10,2])

[0 0 0 ... 0 0 0]
[0.33333333 0.22222222 0.55555556 0.44444444 0.44444444 0.44444444
 0.33333333 0.22222222 0.22222222 0.22222222]


In [28]:
# without using sklearn function
import math 
nn = math.ceil(len(features)*0.7)

X_train = features[:nn,:]
X_test= features[nn:,:]
y_train = prices[:nn]
y_test = prices[nn:]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2649, 5) (1134, 5) (2649,) (1134,)


In [29]:
class KNNRegressor():
    def fit(self,X,y):
        self.X = X
        self.y = y
        
    def predict(self,X, K, epsilon=1e-03):
        N = len(X)
        y_hat = np.zeros(N)
        
        for i in range(N):
            dist2 = np.sum((self.X-X[i])**2,axis=1)
            idxt = np.argsort(dist2)[:K]
            gamma_k = 1/(np.sqrt(dist2[idxt]) + epsilon)
            y_hat[i] = gamma_k.dot(self.y[idxt])/gamma_k.sum() #traget value with the height weight
             
        return y_hat

In [30]:
knnreg = KNNRegressor()

In [31]:
knnreg.fit(X_train,y_train)

In [32]:
y_hat = knnreg.predict(X_test,100, epsilon=1e-3)

In [33]:
def accuracy(y,y_hat):
    return np.mean(y == y_hat)

In [34]:
accuracy(y_test,y_hat)

0.9982363315696648

## Single Home predictor

In [35]:
print(mlon,mlat)

bedrms= 3.0
bathrooms= 1.0
sqrft=2400.0

# c_data =np.array([[mlon,mlat,bedrms,bathrooms,sqrft]])
cdata =[mlon,mlat,bedrms,bathrooms,sqrft]
print(cdata)

# Normalize entire data frame except long and latitude columns: Panda
def normalize1(dataset,orgdata):
    dataNorm=((dataset-orgdata.min())/(orgdata.max()-orgdata.min()))
    return dataNorm

cdatanorm = normalize1(cdata,sel_feat)

val = knnreg.predict(cdatanorm,100, epsilon=1e-3)
print("Predicted normalized price: ",val[0])

val1 = np.mean(val)
prce = revertnormalize(val[0],sel_price)
print("Estimated Price: ",prce)

-111.06449135902294 32.464260544113415
[-111.06449135902294, 32.464260544113415, 3.0, 1.0, 2400.0]
Predicted normalized price:  0.0
Estimated Price:  375000.0
