## Housing Project
#### Data from Kaggle.com

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [56]:
# Load the housing listings CSV into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv("American_Housing_Data_20231209.csv")

In [57]:
# Inspect the raw frame: column names, dtypes, and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39981 entries, 0 to 39980
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Zip Code                 39981 non-null  int64  
 1   Price                    39981 non-null  float64
 2   Beds                     39981 non-null  int64  
 3   Baths                    39981 non-null  int64  
 4   Living Space             39981 non-null  int64  
 5   Address                  39981 non-null  object 
 6   City                     39981 non-null  object 
 7   State                    39981 non-null  object 
 8   Zip Code Population      39981 non-null  int64  
 9   Zip Code Density         39981 non-null  float64
 10  County                   39981 non-null  object 
 11  Median Household Income  39979 non-null  float64
 12  Latitude                 39981 non-null  float64
 13  Longitude                39981 non-null  float64
dtypes: float64(5), int64(5

In [58]:
# List the exact column labels (several contain spaces) before selecting any.
df.columns

Index(['Zip Code', 'Price', 'Beds', 'Baths', 'Living Space', 'Address', 'City',
       'State', 'Zip Code Population', 'Zip Code Density', 'County',
       'Median Household Income', 'Latitude', 'Longitude'],
      dtype='object')

In [59]:
# Columns excluded from the cleaned frame.
cols_to_drop = ['Zip Code', 'Address', 'State']

# Keep only the labels that are actually present, so the drop below cannot
# raise a KeyError if one of them is already missing from `df`.
existing_cols = [name for name in cols_to_drop if name in df.columns]

# `drop` returns a new frame; the raw `df` is left untouched.
df_clean = df.drop(columns=existing_cols)

In [60]:
# Confirm which columns remain after the drop and check their non-null counts.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39981 entries, 0 to 39980
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Price                    39981 non-null  float64
 1   Beds                     39981 non-null  int64  
 2   Baths                    39981 non-null  int64  
 3   Living Space             39981 non-null  int64  
 4   City                     39981 non-null  object 
 5   Zip Code Population      39981 non-null  int64  
 6   Zip Code Density         39981 non-null  float64
 7   County                   39981 non-null  object 
 8   Median Household Income  39979 non-null  float64
 9   Latitude                 39981 non-null  float64
 10  Longitude                39981 non-null  float64
dtypes: float64(5), int64(4), object(2)
memory usage: 3.4+ MB


In [61]:
# Preview the first three rows of the cleaned frame.
df_clean.head(3)

Unnamed: 0,Price,Beds,Baths,Living Space,City,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,3999000.0,2,3,1967,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
1,3999000.0,2,3,1967,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
2,1650000.0,1,1,718,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601


In [62]:
# Split the cleaned frame into the prediction target and the features:
#   y -> the 'Price' column
#   X -> every remaining column
y = df_clean['Price']
X = df_clean.drop(columns='Price')

In [63]:
# Separate the feature columns into numerical and categorical groups, then
# one-hot encode the categorical ones.
#
# Both column lists and the encoded frame are derived from `df_clean`, not from
# `X` itself, so re-running this cell always gives the same result. Reading from
# an already-encoded `X` would leave `X_cat_cols` empty and could pick up the
# integer dummy columns as "numerical" features.
X_features = df_clean.drop('Price', axis=1)
X_num_cols = X_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
X_cat_cols = X_features.select_dtypes(include=['object', 'category']).columns.tolist()

# drop_first=True omits one indicator level per categorical column;
# dtype=int stores the indicators as integers rather than booleans.
X = pd.get_dummies(X_features, columns=X_cat_cols, drop_first=True, dtype=int)

In [64]:
# Show the feature names that were selected as numerical.
X_num_cols

['Beds',
 'Baths',
 'Living Space',
 'Zip Code Population',
 'Zip Code Density',
 'Median Household Income',
 'Latitude',
 'Longitude']

In [65]:
# Preview the first three rows of the feature frame after one-hot encoding.
X.head(3)

Unnamed: 0,Beds,Baths,Living Space,Zip Code Population,Zip Code Density,Median Household Income,Latitude,Longitude,City_Aledo,City_Apache Junction,...,County_Travis,County_Tulsa,County_Union,County_Virginia Beach,County_Wagoner,County_Wake,County_Washington,County_Wayne,County_Williamson,County_Wyandotte
0,2,3,1967,29563,20967.9,370046.0,40.72001,-74.00472,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,3,1967,29563,20967.9,370046.0,40.72001,-74.00472,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,718,29815,23740.9,249880.0,40.73407,-74.00601,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Scaling

def check_feature_range(col):
    """Return the ``(min, max)`` of a column, ignoring missing values.

    Parameters
    ----------
    col : pandas.Series or array-like
        Values to summarise.

    Returns
    -------
    tuple
        ``(minimum, maximum)`` as plain Python scalars where possible. If there
        are no non-missing values, both entries are NaN.

    Notes
    -----
    Python's built-in ``min``/``max`` give order-dependent results when NaN is
    present (a leading NaN makes both come back as NaN). The pandas reductions
    used here skip NaN and run vectorised instead of element by element.
    """
    values = pd.Series(col)
    bounds = (values.min(), values.max())
    # Convert NumPy scalars to built-in numbers so the tuple prints as
    # "(1, 54)" rather than with NumPy scalar reprs.
    return tuple(b.item() if hasattr(b, "item") else b for b in bounds)

# Print one line per feature: its name, two spaces, then its (min, max) tuple.
for col in X.columns:
    print(f"{col}  {check_feature_range(X[col])}")

Beds  (1, 54)
Baths  (1, 66)
Living Space  (2, 74340)
Zip Code Population  (0, 116469)
Zip Code Density  (0.0, 58289.6)
Median Household Income  (27475.0, 900203.0)
Latitude  (25.72983, 47.74237)
Longitude  (-122.82687, -73.70451)
City_Aledo  (0, 1)
City_Apache Junction  (0, 1)
City_Arlington  (0, 1)
City_Arverne  (0, 1)
City_Astoria  (0, 1)
City_Atascosa  (0, 1)
City_Atlanta  (0, 1)
City_Atlantic Beach  (0, 1)
City_Austin  (0, 1)
City_Baltimore  (0, 1)
City_Bayside  (0, 1)
City_Bellerose  (0, 1)
City_Bellevue  (0, 1)
City_Bennington  (0, 1)
City_Berkeley  (0, 1)
City_Beverly Hills  (0, 1)
City_Blacklick  (0, 1)
City_Boerne  (0, 1)
City_Broken Arrow  (0, 1)
City_Bronx  (0, 1)
City_Brooklyn  (0, 1)
City_Brownsburg  (0, 1)
City_Burbank  (0, 1)
City_Burleson  (0, 1)
City_Calhan  (0, 1)
City_Cambria Heights  (0, 1)
City_Campbell  (0, 1)
City_Canal Winchester  (0, 1)
City_Canoga Park  (0, 1)
City_Catonsville  (0, 1)
City_Cave Creek  (0, 1)
City_Charlotte  (0, 1)
City_Chatsworth  (0, 1)
City

In [73]:
# Min-max scale the numerical features and store the results next to the
# originals under "<name>_minmax", so raw and scaled values can be compared.
minmax_scaler = MinMaxScaler()
minmax_cols = [f"{name}_minmax" for name in X_num_cols]

# Work on a copy so the unscaled feature frame `X` stays unchanged.
# NOTE(review): the scaler is fit on every row; if a train/test split happens
# later, consider fitting on the training rows only — confirm against the
# downstream cells.
X_scaled = X.copy()
X_scaled[minmax_cols] = minmax_scaler.fit_transform(X[X_num_cols])

X_scaled.head(3)

Unnamed: 0,Beds,Baths,Living Space,Zip Code Population,Zip Code Density,Median Household Income,Latitude,Longitude,City_Aledo,City_Apache Junction,...,County_Williamson,County_Wyandotte,Beds_minmax,Baths_minmax,Living Space_minmax,Zip Code Population_minmax,Zip Code Density_minmax,Median Household Income_minmax,Latitude_minmax,Longitude_minmax
0,2,3,1967,29563,20967.9,370046.0,40.72001,-74.00472,0,0,...,0,0,0.018868,0.030769,0.026433,0.253827,0.359719,0.392529,0.680984,0.993889
1,2,3,1967,29563,20967.9,370046.0,40.72001,-74.00472,0,0,...,0,0,0.018868,0.030769,0.026433,0.253827,0.359719,0.392529,0.680984,0.993889
2,1,1,718,29815,23740.9,249880.0,40.73407,-74.00601,0,0,...,0,0,0.0,0.0,0.009632,0.255991,0.407292,0.254839,0.681622,0.993862


In [72]:
# Print one line per column of the scaled frame: name, two spaces, (min, max).
for col in X_scaled.columns:
    print(f"{col}  {check_feature_range(X_scaled[col])}")

Beds  (1, 54)
Baths  (1, 66)
Living Space  (2, 74340)
Zip Code Population  (0, 116469)
Zip Code Density  (0.0, 58289.6)
Median Household Income  (27475.0, 900203.0)
Latitude  (25.72983, 47.74237)
Longitude  (-122.82687, -73.70451)
City_Aledo  (0, 1)
City_Apache Junction  (0, 1)
City_Arlington  (0, 1)
City_Arverne  (0, 1)
City_Astoria  (0, 1)
City_Atascosa  (0, 1)
City_Atlanta  (0, 1)
City_Atlantic Beach  (0, 1)
City_Austin  (0, 1)
City_Baltimore  (0, 1)
City_Bayside  (0, 1)
City_Bellerose  (0, 1)
City_Bellevue  (0, 1)
City_Bennington  (0, 1)
City_Berkeley  (0, 1)
City_Beverly Hills  (0, 1)
City_Blacklick  (0, 1)
City_Boerne  (0, 1)
City_Broken Arrow  (0, 1)
City_Bronx  (0, 1)
City_Brooklyn  (0, 1)
City_Brownsburg  (0, 1)
City_Burbank  (0, 1)
City_Burleson  (0, 1)
City_Calhan  (0, 1)
City_Cambria Heights  (0, 1)
City_Campbell  (0, 1)
City_Canal Winchester  (0, 1)
City_Canoga Park  (0, 1)
City_Catonsville  (0, 1)
City_Cave Creek  (0, 1)
City_Charlotte  (0, 1)
City_Chatsworth  (0, 1)
City