## Housing Project
#### Data from Kaggle.com

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [7]:
# Load dataset
df = pd.read_csv("American_Housing_Data_20231209.csv")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39981 entries, 0 to 39980
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Zip Code                 39981 non-null  int64  
 1   Price                    39981 non-null  float64
 2   Beds                     39981 non-null  int64  
 3   Baths                    39981 non-null  int64  
 4   Living Space             39981 non-null  int64  
 5   Address                  39981 non-null  object 
 6   City                     39981 non-null  object 
 7   State                    39981 non-null  object 
 8   Zip Code Population      39981 non-null  int64  
 9   Zip Code Density         39981 non-null  float64
 10  County                   39981 non-null  object 
 11  Median Household Income  39979 non-null  float64
 12  Latitude                 39981 non-null  float64
 13  Longitude                39981 non-null  float64
dtypes: float64(5), int64(5

In [9]:
df.columns

Index(['Zip Code', 'Price', 'Beds', 'Baths', 'Living Space', 'Address', 'City',
       'State', 'Zip Code Population', 'Zip Code Density', 'County',
       'Median Household Income', 'Latitude', 'Longitude'],
      dtype='object')

In [10]:
cols_to_drop = ['Zip Code','Address', 'State']
existing_cols = [col for col in cols_to_drop if col in df.columns]
df_clean = df.drop(existing_cols, axis=1)

In [11]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39981 entries, 0 to 39980
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Price                    39981 non-null  float64
 1   Beds                     39981 non-null  int64  
 2   Baths                    39981 non-null  int64  
 3   Living Space             39981 non-null  int64  
 4   City                     39981 non-null  object 
 5   Zip Code Population      39981 non-null  int64  
 6   Zip Code Density         39981 non-null  float64
 7   County                   39981 non-null  object 
 8   Median Household Income  39979 non-null  float64
 9   Latitude                 39981 non-null  float64
 10  Longitude                39981 non-null  float64
dtypes: float64(5), int64(4), object(2)
memory usage: 3.4+ MB


In [12]:
df_clean.head(3)

Unnamed: 0,Price,Beds,Baths,Living Space,City,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,3999000.0,2,3,1967,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
1,3999000.0,2,3,1967,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
2,1650000.0,1,1,718,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601


In [13]:
# Separate Features (X) and Target (y)
X = df_clean.drop('Price', axis=1)
y = df_clean['Price']

In [14]:
# Train/Test Split (80/20)
rng = np.random.default_rng(42)  
row_positions = np.arange(X.shape[0])  # all row indices
rng.shuffle(row_positions)  # shuffle in place

split_point = int(0.8 * X.shape[0])  # 80% train, 20% test

X_train_idx, X_test_idx = row_positions[:split_point], row_positions[split_point:]
y_train_idx, y_test_idx = X_train_idx, X_test_idx  # keep y in sync

X_train, X_test = X.iloc[X_train_idx], X.iloc[X_test_idx]
y_train, y_test = y.iloc[y_train_idx], y.iloc[y_test_idx]

In [15]:
# Seperate into numerical-categporical type
X_num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
X_cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()


In [16]:
# One-hot encode categorical columns

X_train = pd.get_dummies(X_train, columns=X_cat_cols, drop_first=True, dtype=int)
X_test = pd.get_dummies(X_test, columns=X_cat_cols, drop_first=True, dtype=int)

# Align columns (in case test set misses some categories)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [17]:
# Scaling

# Checking min/max of all columns before scaling
def check_feature_range(col):
    return (min(col), max(col))

for col in X_train.columns:
    print(col + "  " + str(check_feature_range(X_train[col])))

Beds  (1, 54)
Baths  (1, 56)
Living Space  (3, 35429)
Zip Code Population  (0, 116469)
Zip Code Density  (0.0, 58289.6)
Median Household Income  (27475.0, 900203.0)
Latitude  (25.72983, 47.74237)
Longitude  (-122.82687, -73.71424)
City_Aledo  (0, 1)
City_Apache Junction  (0, 1)
City_Arlington  (0, 1)
City_Arverne  (0, 1)
City_Astoria  (0, 1)
City_Atascosa  (0, 1)
City_Atlanta  (0, 1)
City_Atlantic Beach  (0, 1)
City_Austin  (0, 1)
City_Baltimore  (0, 1)
City_Bayside  (0, 1)
City_Bellerose  (0, 1)
City_Bellevue  (0, 1)
City_Bennington  (0, 1)
City_Berkeley  (0, 1)
City_Beverly Hills  (0, 1)
City_Blacklick  (0, 1)
City_Broken Arrow  (0, 1)
City_Bronx  (0, 1)
City_Brooklyn  (0, 1)
City_Burbank  (0, 1)
City_Burleson  (0, 1)
City_Calhan  (0, 1)
City_Campbell  (0, 1)
City_Canal Winchester  (0, 1)
City_Canoga Park  (0, 1)
City_Catonsville  (0, 1)
City_Cave Creek  (0, 1)
City_Charlotte  (0, 1)
City_Chatsworth  (0, 1)
City_Chesapeake  (0, 1)
City_Chicago  (0, 1)
City_Choctaw  (0, 1)
City_Clint 

In [18]:
# Scale numerical columns using MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Scale only the original numerical columns
X_train_scaled[X_num_cols] = minmax_scaler.fit_transform(X_train_scaled[X_num_cols])
X_test_scaled[X_num_cols] = minmax_scaler.transform(X_test_scaled[X_num_cols])

In [19]:
# Check min/max of columns after scaling

print("Train ranges AFTER scaling:")
for col in X_num_cols:
    print(col, check_feature_range(X_train_scaled[col]))

Train ranges AFTER scaling:
Beds (0.0, 1.0)
Baths (0.0, 0.9999999999999999)
Living Space (0.0, 1.0)
Zip Code Population (0.0, 1.0)
Zip Code Density (0.0, 1.0)
Median Household Income (0.0, 1.0)
Latitude (0.0, 1.0)
Longitude (0.0, 1.0)


In [20]:
X_train_scaled.shape[1]

325

### Training the model 

In [None]:
# Initialize the weights
class LinearRegression:
    def __init__(self, lr=0.001, num_epochs=1000):
        self.lr = lr
        self.num_epochs = num_epochs
        self.W = None
        self.b = None

    def initialize_params(self, X):
        if isinstance(X, list):
            assert isinstance(X[0], (list, np.ndarray))
            X = np.array(X)

        self.X = X
        n_features = X.shape[1] if len(X.shape) >= 2 else 1
        self.W = np.zeros((n_features, 1))
        self.b = 0.0

    def predict(self, X):
        return np.dot(X, self.W) + self.b

    def compute_loss(self, y, y_pred):
        return np.mean((y - y_pred) ** 2)

    def compute_gradient(self, X, y):
        m,n = X.shape
        dj_dw = np.zeros((n,))
        dj_db = 0

        for i in range(m):
            err = (np.dot(X[i], self.W[i]) + self.b) - y[i]
            for j in range(n):
                dj_dw[j] += dj_dw[j] + err * X[i,j]
            dj_db = dj_db + err

        dj_dw = dj_dw/m
        dj_db = dj_db/m
        return dj_dw, dj_db
    
    def update_params(self, dj_dw, dj_db):
        self.W -= self.lr * dj_dw
        self.b -= self.lr * dj_db

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y).reshape(-1, 1)

        self.initialize_params(X)

        for _ in range(self.num_epochs):
            y_pred = self.predict(X)
            dj_dw, dj_db = self.compute_gradient(X, y, y_pred)
            self.update_params(dj_dw, dj_db)

    def get_params(self):
        return self.W, self.b



In [24]:
# Convert to numpy for custom LinearRegression
X_train_np = X_train_scaled.to_numpy()
y_train_np = y_train.to_numpy()
X_test_np  = X_test_scaled.to_numpy()
y_test_np  = y_test.to_numpy()


In [26]:
#Train
lr = LinearRegression(lr=1e-4)
lr.fit(X_train_np, y_train_np)

#Evaluate
y_pred = lr.predict(X_test_np)
mse = np.mean((y_test_np.reshape(-1,1) - y_pred)**2)
print("Test MSE:", mse)


Test MSE: nan
