### Spark mllib coding devotional

#### learning goal:
- Understand basic model building with the mllib library
- Understand key differences from Scikit-learn

#### Building an XGBoost Model: Spark vs Python

import necessary tools

In [2]:
#SCIKIT & other necessary libraries
!pip install scikit-learn seaborn matplotlib xgboost pandas numpy plotly
!pip install lets-plot

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# PYTHON - SCIKIT / Pandas

# import skikit.learn (sklearn) models, functions and preprocessing code
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier 

from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 

from sklearn import metrics 
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             mean_squared_error,
                             mean_absolute_error,
                             r2_score,
                             confusion_matrix,
                             accuracy_score,
                             precision_score)

# import the model and hyperparameter tuning functions
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePOut

# Visualization
!pip install lets-plot
from lets_plot import *
LetsPlot.setup_html()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


load dataset

In [4]:
# PYTHON - Pandas
df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
df.drop('date', axis=1, inplace=True) # remove this column, it messes with the model we are using
df = df.query('bedrooms <= 12') #there is a house that has an absurd number of rooms so we need to remove it as an outlier

In [16]:
df.columns

display(df.head(5)) # show the first 5 rows of the data

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
0,1565930130,4,3.25,3760,4675,2.0,0,0,3,8,2740,1020,2007,0,98038,47.3862,-122.048,3280,4033,429900.0
1,3279000420,3,1.75,1460,7800,1.0,0,0,2,7,1040,420,1979,0,98023,47.3035,-122.382,1310,7865,233000.0
2,194000575,4,1.0,1340,5800,1.5,0,2,3,7,1340,0,1914,0,98116,47.5658,-122.389,1900,5800,455000.0
3,2115510160,3,1.75,1440,8050,1.0,0,0,3,8,1440,0,1985,0,98023,47.3187,-122.39,1790,7488,258950.0
4,7522500005,2,1.5,1780,4750,1.0,0,0,4,7,1080,700,1947,0,98117,47.6859,-122.395,1690,5962,555000.0


Create a regression model (reg1)

In [6]:
df2 = df.copy() # don't want to mess with the original dataset
y = df2['price'] # choose a target variable from the testing dataset
X = df2.drop('price', axis=1) # remove the target from the testing dataset


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a baseline xgboost model
reg1 = XGBRegressor()
reg1.fit(X_train, y_train)

# run the data through the model and check tested vs predictions

y_pred = reg1.predict(X_test)

#check the classification report on how the model did
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))


0.8698035296154463
17579100346.97157
68997.35462109375


In [7]:
# Get feature importances
importances = reg1.feature_importances_
feature_names = X.columns

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='importance', ascending=False)

# print(importance_df) #(add this line to see exact values for the features)

features = px.bar(importance_df,
                  x='importance',
                  y='feature',
                  orientation='h',
                  title='Feature Importances in Base XGBoost Model',
                  height= 800)
features.show()

hyperparameter tuning / k-fold validation to prep for model2:

In [8]:
# PYTHON -SCIKIT

# tools used 
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 4],
    'learning_rate': [0.1, 0.15],
    'subsample': [0.8, 0.9, 1.0],
}

# Set up the grid search
grid_search = GridSearchCV(
    estimator=reg1,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error', # For regression, to minimize RMSE
    cv=3, # Cross-validation (k-fold) splitting strategy
    verbose=1,
    n_jobs=-1 # Use all available cores
)

# Fit the grid search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [9]:
# Best parameters and model performance
print("Best Hyperparameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)
print("R-squared:", grid_search.best_estimator_.score(X_test, y_test))

Best Hyperparameters: {'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8}
Best RMSE: 122088.59645082681
R-squared: 0.880816120135671


Regression model when using Hyperparameter tuning (reg2)

In [10]:
# build out the model based on the best parameters
reg2 = XGBRegressor(
                    learning_rate = 0.15,
                    max_depth = 4,
                    n_estimators = 300,
                    subsample = 1.0)

# fit the model 2 xgboost model
reg2.fit(X_train, y_train)

# run the data through the model and check tested vs predictions

y_pred = reg2.predict(X_test)

#check the classification report on how the model did
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.8696875377566029
17594761543.623188
68002.18127539063


In [11]:
# previous model scores
# 0.8772387477317761
# 16575198743.629103
# 69075.43608984374

In [12]:
import pickle

with open('xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(reg2, model_file)

In [13]:
import pickle
import base64
import io

# Convert the trained model to base64 string
bytes_pickle = io.BytesIO()
pickle.dump(reg2, bytes_pickle)
bytes_pickle.seek(0)
base64_str = base64.b64encode(bytes_pickle.read()).decode('utf-8')

# Save to text file
with open('xgb_model_string.txt', 'w') as f:
    f.write(base64_str)

print(f"Model saved to text file 'xgb_model_string.txt'")
print(f"String length: {len(base64_str)} characters")

# Optional: Display first 100 characters of the string
print(f"First 100 characters of string: {base64_str[:100]}...")

Model saved to text file 'xgb_model_string.txt'
String length: 676484 characters
First 100 characters of string: gASVQgMAAAAAAACMD3hnYm9vc3Quc2tsZWFybpSMDFhHQlJlZ3Jlc3NvcpSTlCmBlH2UKIwMbl9lc3RpbWF0b3JzlE0sAYwJb2Jq...


In [14]:
# Instead of pickle, save the model using XGBoost's native format
reg2.save_model('xgb_model.json')

# Then read the file and upload it to GitHub
with open('xgb_model.json', 'r') as f:
    model_json = f.read()

In [15]:
import pandas as pd
import numpy as np

# Load the data from GitHub
df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
df.drop('date', axis=1, inplace=True)  # remove this column, it messes with the model we are using
df = df.query('bedrooms <= 12')  # there is a house that has an absurd number of rooms so we need to remove it as an outlier

# Drop price as it's what we're trying to predict
df_features = df.drop('price', axis=1)

# Calculate statistics for each column
stats = {}

for col in df_features.columns:
    if col == 'id':  # Skip id column
        continue
        
    if df_features[col].dtype == 'object' or col in ['waterfront', 'view', 'condition', 'grade', 'zipcode']:
        # For categorical features, get unique values
        stats[col] = {
            'type': 'categorical',
            'unique_values': sorted(df_features[col].unique().tolist())
        }
    else:
        # For numerical features, get min, max, mean, std
        stats[col] = {
            'type': 'numerical',
            'min': df_features[col].min(),
            'max': df_features[col].max(),
            'mean': df_features[col].mean(),
            'std': df_features[col].std()
        }

# Print the statistics for each feature
print("Feature Statistics for Sample Data Generation:")
print("-" * 50)

for feature, feature_stats in stats.items():
    print(f"\n{feature}:")
    if feature_stats['type'] == 'categorical':
        print(f"  Type: Categorical")
        print(f"  Unique values: {feature_stats['unique_values']}")
    else:
        print(f"  Type: Numerical")
        print(f"  Min: {feature_stats['min']}")
        print(f"  Max: {feature_stats['max']}")
        print(f"  Mean: {feature_stats['mean']:.2f}")
        print(f"  Std: {feature_stats['std']:.2f}")


Feature Statistics for Sample Data Generation:
--------------------------------------------------

bedrooms:
  Type: Numerical
  Min: 0
  Max: 11
  Mean: 3.37
  Std: 0.91

bathrooms:
  Type: Numerical
  Min: 0.0
  Max: 8.0
  Mean: 2.12
  Std: 0.77

sqft_living:
  Type: Numerical
  Min: 290
  Max: 13540
  Mean: 2079.91
  Std: 919.45

sqft_lot:
  Type: Numerical
  Min: 520
  Max: 1164794
  Mean: 15015.78
  Std: 39702.29

floors:
  Type: Numerical
  Min: 1.0
  Max: 3.5
  Mean: 1.50
  Std: 0.54

waterfront:
  Type: Categorical
  Unique values: [0, 1]

view:
  Type: Categorical
  Unique values: [0, 1, 2, 3, 4]

condition:
  Type: Categorical
  Unique values: [1, 2, 3, 4, 5]

grade:
  Type: Categorical
  Unique values: [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

sqft_above:
  Type: Numerical
  Min: 290
  Max: 9410
  Mean: 1790.27
  Std: 829.59

sqft_basement:
  Type: Numerical
  Min: 0
  Max: 4820
  Mean: 289.64
  Std: 441.54

yr_built:
  Type: Numerical
  Min: 1900
  Max: 2015
  Mean: 1971.10

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, rand, round as spark_round
import random
import pandas as pd
import numpy as np

def create_neighborhood(
    name,               # Neighborhood name (string)
    size,               # Number of houses (int)
    bed_range,          # (min, max) bedrooms (tuple of ints)
    bath_range,         # (min, max) bathrooms (tuple of floats)
    sqft_range,         # (min, max) square footage (tuple of ints)
    grade_range,        # (min, max) grade (tuple of ints)
    waterfront_pct,     # Percentage of waterfront properties (int, 0-100)
    view_range,         # (min, max) view rating (tuple of ints)
    lot_multiplier,     # Multiplier for lot size relative to house (float)
    zipcode,            # Neighborhood zipcode (string)
    coordinates         # (lat, long) for neighborhood (tuple of floats)
):
    """
    Creates a PySpark DataFrame of houses for a specific neighborhood.
    
    Returns:
        A PySpark DataFrame with the generated houses
    """
    # For simpler random generation
    min_bed, max_bed = bed_range
    min_bath, max_bath = bath_range
    min_sqft, max_sqft = sqft_range
    min_grade, max_grade = grade_range
    min_view, max_view = view_range
    lat, long = coordinates
    
    # Create pandas dataframe first (easier for random generation)
    data = []
    
    for _ in range(size):
        # Generate basic features
        bedrooms = random.randint(min_bed, max_bed)
        
        # Bathroom options (typical increments of 0.25, 0.5, etc.)
        bath_options = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]
        bathrooms = random.choice([b for b in bath_options if min_bath <= b <= max_bath])
        
        # Square footage
        sqft_living = random.randint(min_sqft, max_sqft)
        
        # Lot size based on multiplier
        sqft_lot = int(sqft_living * lot_multiplier * random.uniform(0.9, 1.1))
        
        # Floor options (1, 1.5, 2, 2.5, 3)
        floors = random.choice([1.0, 1.5, 2.0, 2.5, 3.0])
        
        # Waterfront based on percentage
        waterfront = 1 if random.random() < (waterfront_pct / 100) else 0
        
        # View rating
        view = random.randint(min_view, max_view)
        
        # Condition rating (1-5, mostly 3-5 for populated areas)
        condition = random.randint(3, 5)
        
        # Grade
        grade = random.randint(min_grade, max_grade)
        
        # Above ground and basement
        if floors < 1.5:
            sqft_above = sqft_living
            sqft_basement = 0
        else:
            sqft_above = int(sqft_living * random.uniform(0.6, 0.8))
            sqft_basement = sqft_living - sqft_above
        
        # Year built (most between 1970-2015)
        yr_built = random.randint(1970, 2015)
        
        # Year renovated (0 means no renovation)
        yr_renovated = 0
        if yr_built < 2000 and random.random() < 0.3:
            yr_renovated = random.randint(yr_built + 10, 2023)
        
        # Small variations in lat/long to spread houses out
        house_lat = lat + random.uniform(-0.01, 0.01)
        house_long = long + random.uniform(-0.01, 0.01)
        
        # Living area and lot 15 years ago (slightly smaller)
        sqft_living15 = int(sqft_living * random.uniform(0.85, 0.95))
        sqft_lot15 = int(sqft_lot * random.uniform(0.9, 1.0))
        
        # Generate a unique ID
        id_num = str(random.randint(1000000000, 9999999999))
        
        # Compile house data
        house_data = {
            'id': id_num,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sqft_living': sqft_living,
            'sqft_lot': sqft_lot,
            'floors': floors,
            'waterfront': waterfront,
            'view': view,
            'condition': condition,
            'grade': grade,
            'sqft_above': sqft_above,
            'sqft_basement': sqft_basement,
            'yr_built': yr_built,
            'yr_renovated': yr_renovated,
            'zipcode': zipcode,
            'lat': house_lat,
            'long': house_long,
            'sqft_living15': sqft_living15,
            'sqft_lot15': sqft_lot15
        }
        
        data.append(house_data)
    
    # Convert to pandas DataFrame
    pdf = pd.DataFrame(data)
    
    # Convert to Spark DataFrame
    sdf = spark.createDataFrame(pdf)
    
    # Add additional information for tracking
    sdf = sdf.withColumn("neighborhood", lit(name))
    
    return sdf

In [18]:
def create_evergreen_estates():
    """Luxury Mansion Neighborhood - Team: Exalted"""
    return create_neighborhood(
        name="Evergreen Estates",
        size=25,
        bed_range=(4, 6),
        bath_range=(3.5, 7.0),
        sqft_range=(4000, 9000),
        grade_range=(10, 13),
        waterfront_pct=40,
        view_range=(2, 4),
        lot_multiplier=3.0, # Very large lots
        zipcode="98039",
        coordinates=(47.6288, -122.2313)
    )

In [21]:
import random
import csv
import os

def generate_houses(
    neighborhood,       # Name of the neighborhood
    size,               # Number of houses
    bed_range,          # (min, max) bedrooms
    bath_range,         # (min, max) bathrooms
    sqft_range,         # (min, max) square footage
    grade_range,        # (min, max) grade
    waterfront_pct,     # Percentage of waterfront properties
    view_range,         # (min, max) view rating
    lot_multiplier,     # Multiplier for lot size
    zipcode,            # Zipcode
    coordinates         # (lat, long)
):
    """Generate houses for a neighborhood with specified characteristics."""
    houses = []
    
    # Extract ranges
    min_bed, max_bed = bed_range
    min_bath, max_bath = bath_range
    min_sqft, max_sqft = sqft_range
    min_grade, max_grade = grade_range
    min_view, max_view = view_range
    lat, long = coordinates
    
    # Bathroom options (standard increments)
    bath_options = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 
                   3.25, 3.5, 3.75, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]
    valid_bath_options = [b for b in bath_options if min_bath <= b <= max_bath]
    
    # Floor options
    floor_options = [1.0, 1.5, 2.0, 2.5, 3.0]
    
    for i in range(size):
        # Generate ID
        id_num = str(random.randint(1000000000, 9999999999))
        
        # Generate basic features
        bedrooms = random.randint(min_bed, max_bed)
        bathrooms = random.choice(valid_bath_options)
        sqft_living = random.randint(min_sqft, max_sqft)
        
        # Lot size based on multiplier (with some randomness)
        sqft_lot = int(sqft_living * lot_multiplier * random.uniform(0.9, 1.1))
        
        # Floor options
        floors = random.choice(floor_options)
        
        # Waterfront based on percentage
        waterfront = 1 if random.random() < (waterfront_pct / 100) else 0
        
        # View rating
        view = random.randint(min_view, max_view)
        
        # Condition rating (1-5, mostly 3-5 for populated areas)
        condition = random.randint(3, 5)
        
        # Grade
        grade = random.randint(min_grade, max_grade)
        
        # Above ground and basement
        if floors < 1.5:
            sqft_above = sqft_living
            sqft_basement = 0
        else:
            sqft_above = int(sqft_living * random.uniform(0.6, 0.8))
            sqft_basement = sqft_living - sqft_above
        
        # Year built (most between 1970-2015)
        yr_built = random.randint(1970, 2015)
        
        # Year renovated (0 means no renovation)
        yr_renovated = 0
        if yr_built < 2000 and random.random() < 0.3:
            yr_renovated = random.randint(yr_built + 10, 2023)
        
        # Small variations in lat/long to spread houses out
        house_lat = lat + random.uniform(-0.01, 0.01)
        house_long = long + random.uniform(-0.01, 0.01)
        
        # Living area and lot 15 years ago (slightly smaller)
        sqft_living15 = int(sqft_living * random.uniform(0.85, 0.95))
        sqft_lot15 = int(sqft_lot * random.uniform(0.9, 1.0))
        
        # Add house to result
        house = {
            'id': id_num,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sqft_living': sqft_living,
            'sqft_lot': sqft_lot,
            'floors': floors,
            'waterfront': waterfront,
            'view': view,
            'condition': condition,
            'grade': grade,
            'sqft_above': sqft_above,
            'sqft_basement': sqft_basement,
            'yr_built': yr_built,
            'yr_renovated': yr_renovated,
            'zipcode': zipcode,
            'lat': house_lat,
            'long': house_long,
            'sqft_living15': sqft_living15,
            'sqft_lot15': sqft_lot15,
            'neighborhood': neighborhood
        }
        houses.append(house)
    
    return houses

# Generate all neighborhoods
all_houses = []

# 1. Evergreen Estates (Luxury Mansions)
all_houses.extend(generate_houses(
    neighborhood="Evergreen Estates",
    size=25,
    bed_range=(4, 6),
    bath_range=(3.5, 7.0),
    sqft_range=(4000, 9000),
    grade_range=(10, 13),
    waterfront_pct=40,
    view_range=(2, 4),
    lot_multiplier=3.0,
    zipcode="98039",
    coordinates=(47.6288, -122.2313)
))

# 2. Cedar Grove (Government Housing)
all_houses.extend(generate_houses(
    neighborhood="Cedar Grove",
    size=200,
    bed_range=(2, 3),
    bath_range=(1.0, 1.5),
    sqft_range=(800, 1200),
    grade_range=(5, 6),
    waterfront_pct=0,
    view_range=(0, 1),
    lot_multiplier=1.2,
    zipcode="98178",
    coordinates=(47.4924, -122.2359)
))

# 3. Greenfield Commons (Middle Class Neighborhood 1)
all_houses.extend(generate_houses(
    neighborhood="Greenfield Commons",
    size=250,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1600, 2200),
    grade_range=(7, 8),
    waterfront_pct=0,
    view_range=(0, 2),
    lot_multiplier=2.0,
    zipcode="98052",
    coordinates=(47.6769, -122.1069)
))

# 4. Parkside Meadows (Middle Class Neighborhood 2)
all_houses.extend(generate_houses(
    neighborhood="Parkside Meadows",
    size=200,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1800, 2400),
    grade_range=(7, 9),
    waterfront_pct=0,
    view_range=(0, 3),
    lot_multiplier=2.5,
    zipcode="98074",
    coordinates=(47.6157, -122.0355)
))

# 5. Shoreline Terrace (Beachfront Community)
all_houses.extend(generate_houses(
    neighborhood="Shoreline Terrace",
    size=100,
    bed_range=(2, 4),
    bath_range=(1.5, 2.5),
    sqft_range=(1500, 2500),
    grade_range=(7, 9),
    waterfront_pct=100,
    view_range=(3, 4),
    lot_multiplier=1.8,
    zipcode="98166",
    coordinates=(47.4435, -122.3569)
))

# Write to CSV
csv_file = "seattle_housing_neighborhoods.csv"
fieldnames = ['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
             'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
             'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
             'sqft_living15', 'sqft_lot15', 'neighborhood']

with open(csv_file, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_houses)

print(f"Generated {len(all_houses)} houses across 5 neighborhoods")
print(f"Data saved to {csv_file}")

# Sample output - display first 5 houses from each neighborhood
neighborhoods = ["Evergreen Estates", "Cedar Grove", "Greenfield Commons", 
                "Parkside Meadows", "Shoreline Terrace"]

for neighborhood in neighborhoods:
    houses = [h for h in all_houses if h["neighborhood"] == neighborhood]
    print(f"\n{neighborhood} - Sample houses (first 5 of {len(houses)}):")
    for house in houses[:5]:
        print(f"  {house['bedrooms']} bed, {house['bathrooms']} bath, {house['sqft_living']} sqft, Grade: {house['grade']}, Waterfront: {house['waterfront']}")(5)import random
import csv
import os

def generate_houses(
    neighborhood,       # Name of the neighborhood
    size,               # Number of houses
    bed_range,          # (min, max) bedrooms
    bath_range,         # (min, max) bathrooms
    sqft_range,         # (min, max) square footage
    grade_range,        # (min, max) grade
    waterfront_pct,     # Percentage of waterfront properties
    view_range,         # (min, max) view rating
    lot_multiplier,     # Multiplier for lot size
    zipcode,            # Zipcode
    coordinates         # (lat, long)
):
    """Generate houses for a neighborhood with specified characteristics."""
    houses = []
    
    # Extract ranges
    min_bed, max_bed = bed_range
    min_bath, max_bath = bath_range
    min_sqft, max_sqft = sqft_range
    min_grade, max_grade = grade_range
    min_view, max_view = view_range
    lat, long = coordinates
    
    # Bathroom options (standard increments)
    bath_options = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 
                   3.25, 3.5, 3.75, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]
    valid_bath_options = [b for b in bath_options if min_bath <= b <= max_bath]
    
    # Floor options
    floor_options = [1.0, 1.5, 2.0, 2.5, 3.0]
    
    for i in range(size):
        # Generate ID
        id_num = str(random.randint(1000000000, 9999999999))
        
        # Generate basic features
        bedrooms = random.randint(min_bed, max_bed)
        bathrooms = random.choice(valid_bath_options)
        sqft_living = random.randint(min_sqft, max_sqft)
        
        # Lot size based on multiplier (with some randomness)
        sqft_lot = int(sqft_living * lot_multiplier * random.uniform(0.9, 1.1))
        
        # Floor options
        floors = random.choice(floor_options)
        
        # Waterfront based on percentage
        waterfront = 1 if random.random() < (waterfront_pct / 100) else 0
        
        # View rating
        view = random.randint(min_view, max_view)
        
        # Condition rating (1-5, mostly 3-5 for populated areas)
        condition = random.randint(3, 5)
        
        # Grade
        grade = random.randint(min_grade, max_grade)
        
        # Above ground and basement
        if floors < 1.5:
            sqft_above = sqft_living
            sqft_basement = 0
        else:
            sqft_above = int(sqft_living * random.uniform(0.6, 0.8))
            sqft_basement = sqft_living - sqft_above
        
        # Year built (most between 1970-2015)
        yr_built = random.randint(1970, 2015)
        
        # Year renovated (0 means no renovation)
        yr_renovated = 0
        if yr_built < 2000 and random.random() < 0.3:
            yr_renovated = random.randint(yr_built + 10, 2023)
        
        # Small variations in lat/long to spread houses out
        house_lat = lat + random.uniform(-0.01, 0.01)
        house_long = long + random.uniform(-0.01, 0.01)
        
        # Living area and lot 15 years ago (slightly smaller)
        sqft_living15 = int(sqft_living * random.uniform(0.85, 0.95))
        sqft_lot15 = int(sqft_lot * random.uniform(0.9, 1.0))
        
        # Add house to result
        house = {
            'id': id_num,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sqft_living': sqft_living,
            'sqft_lot': sqft_lot,
            'floors': floors,
            'waterfront': waterfront,
            'view': view,
            'condition': condition,
            'grade': grade,
            'sqft_above': sqft_above,
            'sqft_basement': sqft_basement,
            'yr_built': yr_built,
            'yr_renovated': yr_renovated,
            'zipcode': zipcode,
            'lat': house_lat,
            'long': house_long,
            'sqft_living15': sqft_living15,
            'sqft_lot15': sqft_lot15,
            'neighborhood': neighborhood
        }
        houses.append(house)
    
    return houses

# Generate all neighborhoods
all_houses = []

# 1. Evergreen Estates (Luxury Mansions)
all_houses.extend(generate_houses(
    neighborhood="Evergreen Estates",
    size=25,
    bed_range=(4, 6),
    bath_range=(3.5, 7.0),
    sqft_range=(4000, 9000),
    grade_range=(10, 13),
    waterfront_pct=40,
    view_range=(2, 4),
    lot_multiplier=3.0,
    zipcode="98039",
    coordinates=(47.6288, -122.2313)
))

# 2. Cedar Grove (Government Housing)
all_houses.extend(generate_houses(
    neighborhood="Cedar Grove",
    size=200,
    bed_range=(2, 3),
    bath_range=(1.0, 1.5),
    sqft_range=(800, 1200),
    grade_range=(5, 6),
    waterfront_pct=0,
    view_range=(0, 1),
    lot_multiplier=1.2,
    zipcode="98178",
    coordinates=(47.4924, -122.2359)
))

# 3. Greenfield Commons (Middle Class Neighborhood 1)
all_houses.extend(generate_houses(
    neighborhood="Greenfield Commons",
    size=250,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1600, 2200),
    grade_range=(7, 8),
    waterfront_pct=0,
    view_range=(0, 2),
    lot_multiplier=2.0,
    zipcode="98052",
    coordinates=(47.6769, -122.1069)
))

# 4. Parkside Meadows (Middle Class Neighborhood 2)
all_houses.extend(generate_houses(
    neighborhood="Parkside Meadows",
    size=200,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1800, 2400),
    grade_range=(7, 9),
    waterfront_pct=0,
    view_range=(0, 3),
    lot_multiplier=2.5,
    zipcode="98074",
    coordinates=(47.6157, -122.0355)
))

# 5. Shoreline Terrace (Beachfront Community)
all_houses.extend(generate_houses(
    neighborhood="Shoreline Terrace",
    size=100,
    bed_range=(2, 4),
    bath_range=(1.5, 2.5),
    sqft_range=(1500, 2500),
    grade_range=(7, 9),
    waterfront_pct=100,
    view_range=(3, 4),
    lot_multiplier=1.8,
    zipcode="98166",
    coordinates=(47.4435, -122.3569)
))

# Write to CSV
csv_file = "seattle_housing_neighborhoods.csv"
fieldnames = ['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
             'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
             'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
             'sqft_living15', 'sqft_lot15', 'neighborhood']

with open(csv_file, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_houses)

print(f"Generated {len(all_houses)} houses across 5 neighborhoods")
print(f"Data saved to {csv_file}")

# Sample output - display first 5 houses from each neighborhood
neighborhoods = ["Evergreen Estates", "Cedar Grove", "Greenfield Commons", 
                "Parkside Meadows", "Shoreline Terrace"]

for neighborhood in neighborhoods:
    houses = [h for h in all_houses if h["neighborhood"] == neighborhood]
    print(f"\n{neighborhood} - Sample houses (first 5 of {len(houses)}):")
    for house in houses[:5]:
        print(f"  {house['bedrooms']} bed, {house['bathrooms']} bath, {house['sqft_living']} sqft, Grade: {house['grade']}, Waterfront: {house['waterfront']}")

SyntaxError: invalid syntax (1920971931.py, line 216)

In [22]:
import random
import csv
import os

def generate_houses(
    neighborhood,       # Name of the neighborhood
    size,               # Number of houses
    bed_range,          # (min, max) bedrooms
    bath_range,         # (min, max) bathrooms
    sqft_range,         # (min, max) square footage
    grade_range,        # (min, max) grade
    waterfront_pct,     # Percentage of waterfront properties
    view_range,         # (min, max) view rating
    lot_multiplier,     # Multiplier for lot size
    zipcode,            # Zipcode
    coordinates         # (lat, long)
):
    """Generate houses for a neighborhood with specified characteristics."""
    houses = []
    
    # Extract ranges
    min_bed, max_bed = bed_range
    min_bath, max_bath = bath_range
    min_sqft, max_sqft = sqft_range
    min_grade, max_grade = grade_range
    min_view, max_view = view_range
    lat, long = coordinates
    
    # Bathroom options (standard increments)
    bath_options = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 
                   3.25, 3.5, 3.75, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]
    valid_bath_options = [b for b in bath_options if min_bath <= b <= max_bath]
    
    # Floor options
    floor_options = [1.0, 1.5, 2.0, 2.5, 3.0]
    
    for i in range(size):
        # Generate ID
        id_num = str(random.randint(1000000000, 9999999999))
        
        # Generate basic features
        bedrooms = random.randint(min_bed, max_bed)
        bathrooms = random.choice(valid_bath_options)
        sqft_living = random.randint(min_sqft, max_sqft)
        
        # Lot size based on multiplier (with some randomness)
        sqft_lot = int(sqft_living * lot_multiplier * random.uniform(0.9, 1.1))
        
        # Floor options
        floors = random.choice(floor_options)
        
        # Waterfront based on percentage
        waterfront = 1 if random.random() < (waterfront_pct / 100) else 0
        
        # View rating
        view = random.randint(min_view, max_view)
        
        # Condition rating (1-5, mostly 3-5 for populated areas)
        condition = random.randint(3, 5)
        
        # Grade
        grade = random.randint(min_grade, max_grade)
        
        # Above ground and basement
        if floors < 1.5:
            sqft_above = sqft_living
            sqft_basement = 0
        else:
            sqft_above = int(sqft_living * random.uniform(0.6, 0.8))
            sqft_basement = sqft_living - sqft_above
        
        # Year built (most between 1970-2015)
        yr_built = random.randint(1970, 2015)
        
        # Year renovated (0 means no renovation)
        yr_renovated = 0
        if yr_built < 2000 and random.random() < 0.3:
            yr_renovated = random.randint(yr_built + 10, 2023)
        
        # Small variations in lat/long to spread houses out
        house_lat = lat + random.uniform(-0.01, 0.01)
        house_long = long + random.uniform(-0.01, 0.01)
        
        # Living area and lot 15 years ago (slightly smaller)
        sqft_living15 = int(sqft_living * random.uniform(0.85, 0.95))
        sqft_lot15 = int(sqft_lot * random.uniform(0.9, 1.0))
        
        # Add house to result
        house = {
            'id': id_num,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sqft_living': sqft_living,
            'sqft_lot': sqft_lot,
            'floors': floors,
            'waterfront': waterfront,
            'view': view,
            'condition': condition,
            'grade': grade,
            'sqft_above': sqft_above,
            'sqft_basement': sqft_basement,
            'yr_built': yr_built,
            'yr_renovated': yr_renovated,
            'zipcode': zipcode,
            'lat': house_lat,
            'long': house_long,
            'sqft_living15': sqft_living15,
            'sqft_lot15': sqft_lot15,
            'neighborhood': neighborhood
        }
        houses.append(house)
    
    return houses

# Generate all neighborhoods
all_houses = []

# 1. Evergreen Estates (Luxury Mansions)
all_houses.extend(generate_houses(
    neighborhood="Evergreen Estates",
    size=25,
    bed_range=(4, 6),
    bath_range=(3.5, 7.0),
    sqft_range=(4000, 9000),
    grade_range=(10, 13),
    waterfront_pct=40,
    view_range=(2, 4),
    lot_multiplier=3.0,
    zipcode="98039",
    coordinates=(47.6288, -122.2313)
))

# 2. Cedar Grove (Government Housing)
all_houses.extend(generate_houses(
    neighborhood="Cedar Grove",
    size=200,
    bed_range=(2, 3),
    bath_range=(1.0, 1.5),
    sqft_range=(800, 1200),
    grade_range=(5, 6),
    waterfront_pct=0,
    view_range=(0, 1),
    lot_multiplier=1.2,
    zipcode="98178",
    coordinates=(47.4924, -122.2359)
))

# 3. Greenfield Commons (Middle Class Neighborhood 1)
all_houses.extend(generate_houses(
    neighborhood="Greenfield Commons",
    size=250,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1600, 2200),
    grade_range=(7, 8),
    waterfront_pct=0,
    view_range=(0, 2),
    lot_multiplier=2.0,
    zipcode="98052",
    coordinates=(47.6769, -122.1069)
))

# 4. Parkside Meadows (Middle Class Neighborhood 2)
all_houses.extend(generate_houses(
    neighborhood="Parkside Meadows",
    size=200,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1800, 2400),
    grade_range=(7, 9),
    waterfront_pct=0,
    view_range=(0, 3),
    lot_multiplier=2.5,
    zipcode="98074",
    coordinates=(47.6157, -122.0355)
))

# 5. Shoreline Terrace (Beachfront Community)
all_houses.extend(generate_houses(
    neighborhood="Shoreline Terrace",
    size=100,
    bed_range=(2, 4),
    bath_range=(1.5, 2.5),
    sqft_range=(1500, 2500),
    grade_range=(7, 9),
    waterfront_pct=100,
    view_range=(3, 4),
    lot_multiplier=1.8,
    zipcode="98166",
    coordinates=(47.4435, -122.3569)
))

# Write to CSV
csv_file = "seattle_housing_neighborhoods.csv"
fieldnames = ['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
             'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
             'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
             'sqft_living15', 'sqft_lot15', 'neighborhood']

with open(csv_file, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_houses)

print(f"Generated {len(all_houses)} houses across 5 neighborhoods")
print(f"Data saved to {csv_file}")

# Sample output - display first 5 houses from each neighborhood
neighborhoods = ["Evergreen Estates", "Cedar Grove", "Greenfield Commons", 
                "Parkside Meadows", "Shoreline Terrace"]

for neighborhood in neighborhoods:
    houses = [h for h in all_houses if h["neighborhood"] == neighborhood]
    print(f"\n{neighborhood} - Sample houses (first 5 of {len(houses)}):")
    for house in houses[:5]:
        print(f"  {house['bedrooms']} bed, {house['bathrooms']} bath, {house['sqft_living']} sqft, Grade: {house['grade']}, Waterfront: {house['waterfront']}")

Generated 775 houses across 5 neighborhoods
Data saved to seattle_housing_neighborhoods.csv

Evergreen Estates - Sample houses (first 5 of 25):
  4 bed, 4.0 bath, 7427 sqft, Grade: 11, Waterfront: 0
  4 bed, 4.5 bath, 6926 sqft, Grade: 11, Waterfront: 0
  6 bed, 4.0 bath, 6344 sqft, Grade: 13, Waterfront: 0
  5 bed, 7.0 bath, 6399 sqft, Grade: 12, Waterfront: 1
  6 bed, 6.5 bath, 4635 sqft, Grade: 12, Waterfront: 0

Cedar Grove - Sample houses (first 5 of 200):
  2 bed, 1.25 bath, 1138 sqft, Grade: 6, Waterfront: 0
  2 bed, 1.5 bath, 964 sqft, Grade: 6, Waterfront: 0
  2 bed, 1.25 bath, 814 sqft, Grade: 5, Waterfront: 0
  2 bed, 1.0 bath, 871 sqft, Grade: 5, Waterfront: 0
  2 bed, 1.5 bath, 987 sqft, Grade: 6, Waterfront: 0

Greenfield Commons - Sample houses (first 5 of 250):
  3 bed, 2.0 bath, 1622 sqft, Grade: 7, Waterfront: 0
  4 bed, 2.25 bath, 1878 sqft, Grade: 7, Waterfront: 0
  4 bed, 2.5 bath, 2072 sqft, Grade: 8, Waterfront: 0
  3 bed, 2.0 bath, 1724 sqft, Grade: 8, Waterfron

In [23]:
import random
import csv
import os

def generate_houses(
    neighborhood,       # Name of the neighborhood
    size,               # Number of houses
    bed_range,          # (min, max) bedrooms
    bath_range,         # (min, max) bathrooms
    sqft_range,         # (min, max) square footage
    grade_range,        # (min, max) grade
    waterfront_pct,     # Percentage of waterfront properties
    view_range,         # (min, max) view rating
    lot_multiplier,     # Multiplier for lot size
    zipcode,            # Zipcode
    coordinates         # (lat, long)
):
    """Generate houses for a neighborhood with specified characteristics."""
    houses = []
    
    # Extract ranges
    min_bed, max_bed = bed_range
    min_bath, max_bath = bath_range
    min_sqft, max_sqft = sqft_range
    min_grade, max_grade = grade_range
    min_view, max_view = view_range
    lat, long = coordinates
    
    # Bathroom options (standard increments)
    bath_options = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 
                   3.25, 3.5, 3.75, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]
    valid_bath_options = [b for b in bath_options if min_bath <= b <= max_bath]
    
    # Floor options
    floor_options = [1.0, 1.5, 2.0, 2.5, 3.0]
    
    for i in range(size):
        # Generate ID
        id_num = str(random.randint(1000000000, 9999999999))
        
        # Generate basic features
        bedrooms = random.randint(min_bed, max_bed)
        bathrooms = random.choice(valid_bath_options)
        sqft_living = random.randint(min_sqft, max_sqft)
        
        # Lot size based on multiplier (with some randomness)
        sqft_lot = int(sqft_living * lot_multiplier * random.uniform(0.9, 1.1))
        
        # Floor options
        floors = random.choice(floor_options)
        
        # Waterfront based on percentage
        waterfront = 1 if random.random() < (waterfront_pct / 100) else 0
        
        # View rating
        view = random.randint(min_view, max_view)
        
        # Condition rating (1-5, mostly 3-5 for populated areas)
        condition = random.randint(3, 5)
        
        # Grade
        grade = random.randint(min_grade, max_grade)
        
        # Above ground and basement
        if floors < 1.5:
            sqft_above = sqft_living
            sqft_basement = 0
        else:
            sqft_above = int(sqft_living * random.uniform(0.6, 0.8))
            sqft_basement = sqft_living - sqft_above
        
        # Year built (most between 1970-2015)
        yr_built = random.randint(1970, 2015)
        
        # Year renovated (0 means no renovation)
        yr_renovated = 0
        if yr_built < 2000 and random.random() < 0.3:
            yr_renovated = random.randint(yr_built + 10, 2023)
        
        # Small variations in lat/long to spread houses out
        house_lat = lat + random.uniform(-0.01, 0.01)
        house_long = long + random.uniform(-0.01, 0.01)
        
        # Living area and lot 15 years ago (slightly smaller)
        sqft_living15 = int(sqft_living * random.uniform(0.85, 0.95))
        sqft_lot15 = int(sqft_lot * random.uniform(0.9, 1.0))
        
        # Add house to result
        house = {
            'id': id_num,
            'bedrooms': bedrooms,
            'bathrooms': bathrooms,
            'sqft_living': sqft_living,
            'sqft_lot': sqft_lot,
            'floors': floors,
            'waterfront': waterfront,
            'view': view,
            'condition': condition,
            'grade': grade,
            'sqft_above': sqft_above,
            'sqft_basement': sqft_basement,
            'yr_built': yr_built,
            'yr_renovated': yr_renovated,
            'zipcode': zipcode,
            'lat': house_lat,
            'long': house_long,
            'sqft_living15': sqft_living15,
            'sqft_lot15': sqft_lot15,
            'neighborhood': neighborhood
        }
        houses.append(house)
    
    return houses

# Create a directory to store CSV files
output_dir = "neighborhood_data"
os.makedirs(output_dir, exist_ok=True)

# Dictionary to store houses by neighborhood
neighborhood_houses = {}

# Field names for CSV
fieldnames = ['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
             'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
             'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
             'sqft_living15', 'sqft_lot15', 'neighborhood']

# 1. Evergreen Estates (Luxury Mansions)
houses = generate_houses(
    neighborhood="Evergreen Estates",
    size=25,
    bed_range=(4, 6),
    bath_range=(3.5, 7.0),
    sqft_range=(4000, 9000),
    grade_range=(10, 13),
    waterfront_pct=40,
    view_range=(2, 4),
    lot_multiplier=3.0,
    zipcode="98039",
    coordinates=(47.6288, -122.2313)
)
neighborhood_houses["Evergreen Estates"] = houses

# 2. Cedar Grove (Government Housing)
houses = generate_houses(
    neighborhood="Cedar Grove",
    size=200,
    bed_range=(2, 3),
    bath_range=(1.0, 1.5),
    sqft_range=(800, 1200),
    grade_range=(5, 6),
    waterfront_pct=0,
    view_range=(0, 1),
    lot_multiplier=1.2,
    zipcode="98178",
    coordinates=(47.4924, -122.2359)
)
neighborhood_houses["Cedar Grove"] = houses

# 3. Greenfield Commons (Middle Class Neighborhood 1)
houses = generate_houses(
    neighborhood="Greenfield Commons",
    size=250,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1600, 2200),
    grade_range=(7, 8),
    waterfront_pct=0,
    view_range=(0, 2),
    lot_multiplier=2.0,
    zipcode="98052",
    coordinates=(47.6769, -122.1069)
)
neighborhood_houses["Greenfield Commons"] = houses

# 4. Parkside Meadows (Middle Class Neighborhood 2)
houses = generate_houses(
    neighborhood="Parkside Meadows",
    size=200,
    bed_range=(3, 4),
    bath_range=(2.0, 2.5),
    sqft_range=(1800, 2400),
    grade_range=(7, 9),
    waterfront_pct=0,
    view_range=(0, 3),
    lot_multiplier=2.5,
    zipcode="98074",
    coordinates=(47.6157, -122.0355)
)
neighborhood_houses["Parkside Meadows"] = houses

# 5. Shoreline Terrace (Beachfront Community)
houses = generate_houses(
    neighborhood="Shoreline Terrace",
    size=100,
    bed_range=(2, 4),
    bath_range=(1.5, 2.5),
    sqft_range=(1500, 2500),
    grade_range=(7, 9),
    waterfront_pct=100,
    view_range=(3, 4),
    lot_multiplier=1.8,
    zipcode="98166",
    coordinates=(47.4435, -122.3569)
)
neighborhood_houses["Shoreline Terrace"] = houses

# Write each neighborhood to its own CSV file
total_houses = 0
for neighborhood, houses in neighborhood_houses.items():
    # Create a filename-friendly version of the neighborhood name
    filename = neighborhood.replace(" ", "_").lower() + ".csv"
    filepath = os.path.join(output_dir, filename)
    
    # Write the CSV
    with open(filepath, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(houses)
    
    total_houses += len(houses)
    print(f"Created {filepath} with {len(houses)} houses")
    
    # Sample output - display first 5 houses from each neighborhood
    print(f"\n{neighborhood} - Sample houses (first 5 of {len(houses)}):")
    for house in houses[:5]:
        print(f"  {house['bedrooms']} bed, {house['bathrooms']} bath, {house['sqft_living']} sqft, Grade: {house['grade']}, Waterfront: {house['waterfront']}")

print(f"\nGenerated {total_houses} houses across 5 neighborhoods")
print(f"Data saved to separate CSV files in the '{output_dir}' directory")

Created neighborhood_data\evergreen_estates.csv with 25 houses

Evergreen Estates - Sample houses (first 5 of 25):
  6 bed, 5.0 bath, 4295 sqft, Grade: 11, Waterfront: 0
  6 bed, 3.5 bath, 7965 sqft, Grade: 12, Waterfront: 0
  4 bed, 5.0 bath, 4063 sqft, Grade: 13, Waterfront: 0
  5 bed, 3.75 bath, 7120 sqft, Grade: 11, Waterfront: 0
  4 bed, 6.5 bath, 5406 sqft, Grade: 10, Waterfront: 0
Created neighborhood_data\cedar_grove.csv with 200 houses

Cedar Grove - Sample houses (first 5 of 200):
  2 bed, 1.25 bath, 809 sqft, Grade: 6, Waterfront: 0
  3 bed, 1.0 bath, 1112 sqft, Grade: 5, Waterfront: 0
  2 bed, 1.5 bath, 855 sqft, Grade: 5, Waterfront: 0
  3 bed, 1.25 bath, 1169 sqft, Grade: 5, Waterfront: 0
  2 bed, 1.25 bath, 929 sqft, Grade: 6, Waterfront: 0
Created neighborhood_data\greenfield_commons.csv with 250 houses

Greenfield Commons - Sample houses (first 5 of 250):
  4 bed, 2.25 bath, 1871 sqft, Grade: 7, Waterfront: 0
  3 bed, 2.5 bath, 1689 sqft, Grade: 7, Waterfront: 0
  4 be