In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import seaborn as sns

from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import itertools


import sklearn as skn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error , mean_absolute_percentage_error

from sklearn.cluster import KMeans

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


import warnings
warnings.filterwarnings('ignore')

In [26]:
rs = 42

df = pd.read_csv('NY-House-Dataset.csv')
df.columns

Index(['BROKERTITLE', 'TYPE', 'PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT',
       'ADDRESS', 'STATE', 'MAIN_ADDRESS', 'ADMINISTRATIVE_AREA_LEVEL_2',
       'LOCALITY', 'SUBLOCALITY', 'STREET_NAME', 'LONG_NAME',
       'FORMATTED_ADDRESS', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [27]:
df.describe()

Unnamed: 0,PRICE,BEDS,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE
count,4801.0,4801.0,4801.0,4801.0,4801.0,4801.0
mean,2356940.0,3.356801,2.373861,2184.207862,40.714227,-73.941601
std,31355250.0,2.602315,1.946962,2377.140894,0.087676,0.101082
min,2494.0,1.0,0.0,230.0,40.499546,-74.253033
25%,499000.0,2.0,1.0,1200.0,40.639375,-73.987143
50%,825000.0,3.0,2.0,2184.207862,40.726749,-73.949189
75%,1495000.0,4.0,3.0,2184.207862,40.771923,-73.870638
max,2147484000.0,50.0,50.0,65535.0,40.912729,-73.70245


In [28]:
sel_cols = ['TYPE', 'BEDS', 'BATH', 'PROPERTYSQFT', 'PRICE']
df = df[sel_cols]
df.head()


Unnamed: 0,TYPE,BEDS,BATH,PROPERTYSQFT,PRICE
0,Condo for sale,2,2.0,1400.0,315000
1,Condo for sale,7,10.0,17545.0,195000000
2,House for sale,4,2.0,2015.0,260000
3,Condo for sale,3,1.0,445.0,69000
4,Townhouse for sale,7,2.373861,14175.0,55000000


In [29]:
df.TYPE.unique()

array(['Condo for sale', 'House for sale', 'Townhouse for sale',
       'Co-op for sale', 'Multi-family home for sale', 'For sale',
       'Contingent', 'Land for sale', 'Foreclosure', 'Pending',
       'Coming Soon', 'Mobile house for sale', 'Condop for sale'],
      dtype=object)

In [30]:
df['TYPE'] = df['TYPE'].str.replace(' for sale' , '')
df.loc[df['TYPE'] == 'Condop' , 'TYPE'] = 'Condo'
df.TYPE.value_counts()

TYPE
Co-op                1450
House                1012
Condo                 896
Multi-family home     727
Townhouse             299
Pending               243
Contingent             88
Land                   49
For sale               20
Foreclosure            14
Coming Soon             2
Mobile house            1
Name: count, dtype: int64

In [31]:
keep_types = ['Co-op' , 'House' , 'Condo' , 'Multi-family home']
df.loc[~(df.TYPE.isin(keep_types)) , 'TYPE'] = 'Other'

df.describe()

Unnamed: 0,BEDS,BATH,PROPERTYSQFT,PRICE
count,4801.0,4801.0,4801.0,4801.0
mean,3.356801,2.373861,2184.207862,2356940.0
std,2.602315,1.946962,2377.140894,31355250.0
min,1.0,0.0,230.0,2494.0
25%,2.0,1.0,1200.0,499000.0
50%,3.0,2.0,2184.207862,825000.0
75%,4.0,3.0,2184.207862,1495000.0
max,50.0,50.0,65535.0,2147484000.0


In [32]:
lower_iqr = np.nanpercentile(df.PRICE , 25)
upper_iqr = np.nanpercentile(df.PRICE , 75)

iqr = upper_iqr - lower_iqr

lower_bound = lower_iqr - (1.5 * iqr)
upper_bound = upper_iqr + (1.5 * iqr)


df = df[(df['PRICE'] >= lower_bound) & (df['PRICE'] <= upper_iqr)]

In [33]:
len(df.TYPE.unique())

5

In [34]:
house_types = sorted(df.TYPE.unique())

fig = make_subplots(rows = 3 , cols=2 , subplot_titles=house_types)

row_idx = 1
col_idx = 1

for i in range(len(house_types)):
    if row_idx > 3:
        row_idx = 1
        col_idx += 1
    fig.add_trace(go.Histogram(x=df[df.TYPE == house_types[i]]['PRICE'] , name=house_types[i]) , row=row_idx , col=col_idx)
    row_idx += 1

fig.update_layout(title_text='Price Distribution by House Type' , height=800 , width=1000)

In [35]:
fig = px.scatter(data_frame=df, x = 'PROPERTYSQFT', y = 'PRICE' , facet_col= 'TYPE' , trendline= 'ols' )
fig.show()

In [36]:
fig = px.box(data_frame=df , x = 'TYPE' , y = 'PRICE' , color = 'TYPE')

fig.show()


In [37]:
house_types = df["TYPE"].unique()

res = []

# Compare all pairs of house types
for a, b in itertools.combinations(house_types, 2):
    prices_a = df[df["TYPE"] == a]["PRICE"]
    prices_b = df[df["TYPE"] == b]["PRICE"]
    t_stat, p_val = ttest_ind(prices_a, prices_b, equal_var=False)  # Welch’s t-test
    temp = {
        "comparison": f"{a} vs {b}",
        "t-stat": t_stat,
        "p-value": p_val
    }
    res.append(temp)

res = pd.DataFrame(res)


rejected, pvals_corrected, _, _ = multipletests(res['p-value'], alpha=0.05, method='bonferroni')

res['p_value_corrected'] = pvals_corrected
res['rejected'] = rejected

res

Unnamed: 0,comparison,t-stat,p-value,p_value_corrected,rejected
0,Condo vs House,-6.178246,8.902275e-10,8.902275e-09,True
1,Condo vs Co-op,15.887604,3.8418179999999997e-51,3.8418179999999997e-50,True
2,Condo vs Other,-0.157049,0.8752409,1.0,False
3,Condo vs Multi-family home,-16.303075,1.201109e-53,1.201109e-52,True
4,House vs Co-op,27.210122,1.7167469999999999e-136,1.716747e-135,True
5,House vs Other,5.218287,2.331359e-07,2.331359e-06,True
6,House vs Multi-family home,-12.081873,1.0201720000000001e-31,1.020172e-30,True
7,Co-op vs Other,-13.829338,2.1298759999999998e-38,2.129876e-37,True
8,Co-op vs Multi-family home,-37.994504,2.196711e-198,2.196711e-197,True
9,Other vs Multi-family home,-14.236903,3.852831e-41,3.852831e-40,True


In [38]:
fig = px.box(data_frame=df , x = 'TYPE' , y = 'PRICE' , color = 'TYPE')

fig.show()

In [39]:
bath_price = df.groupby('BATH')['PRICE'].mean().reset_index()

# Reshape price for clustering
X = bath_price['PRICE'].values.reshape(-1, 1)

# Cluster into 3 groups
kmeans = KMeans(n_clusters=3, random_state=rs).fit(X)
bath_price['BATH_GROUP'] = kmeans.labels_

# Map back
bath_group_map = dict(zip(bath_price['BATH'], bath_price['BATH_GROUP']))
df['BATH_GROUP'] = df['BATH'].map(bath_group_map)



fig = px.box(data_frame=df, x = 'BATH_GROUP', y = 'PRICE' , color= 'BATH_GROUP' )
fig.show()

In [40]:
bath_groups = sorted(df['BATH_GROUP'].unique())

for group in bath_groups:
    print(f"Group {group}:")
    print(f"Unique bath values: {df[df['BATH_GROUP'] == group]['BATH'].unique().astype(str)}")

Group 0:
Unique bath values: ['3.0' '6.0' '8.0' '4.0' '5.0' '9.0' '7.0']
Group 1:
Unique bath values: ['2.0' '2.3738608579684373' '0.0']
Group 2:
Unique bath values: ['1.0']


In [41]:
# Get average price per bath type
bed_price = df.groupby('BEDS')['PRICE'].mean().reset_index()

# Reshape price for clustering
X = bed_price['PRICE'].values.reshape(-1, 1)

# Cluster into 3 groups
kmeans = KMeans(n_clusters=3, random_state=rs).fit(X)
bed_price['BEDS_GROUP'] = kmeans.labels_

# Map back
bed_group_map = dict(zip(bed_price['BEDS'], bed_price['BEDS_GROUP']))
df['BEDS_GROUP'] = df['BEDS'].map(bed_group_map)



fig = px.box(data_frame=df, x = 'BEDS_GROUP', y = 'PRICE' , color= 'BEDS_GROUP' )
fig.show()

In [42]:
bath_groups = sorted(df['BEDS_GROUP'].unique())

for group in bath_groups:
    print(f"Group {group}:")
    print(f"Unique bed values: {df[df['BEDS_GROUP'] == group]['BEDS'].unique().astype(str)}")

Group 0:
Unique bed values: ['4' '5' '6' '10' '11' '7' '9' '12' '8' '15' '14' '16' '13']
Group 1:
Unique bed values: ['2' '3']
Group 2:
Unique bed values: ['1']


In [43]:
categorical_cols = ['BEDS_GROUP' , 'BATH_GROUP' , 'TYPE']

X = df.drop(columns = ['PRICE' ])
y = np.log(df['PRICE'])

for col in categorical_cols:
    X[col] = X[col].astype('category')
    X[col] = X[col].cat.codes

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X , y , test_size = 0.25 , random_state = 42)

x_scaler = MinMaxScaler()
X_train['PROPERTYSQFT'] = x_scaler.fit_transform(X_train['PROPERTYSQFT'].values.reshape(-1, 1))
X_test['PROPERTYSQFT'] = x_scaler.transform(X_test['PROPERTYSQFT'].values.reshape(-1, 1))

y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1)) 
y_test = y_scaler.transform(y_test.values.reshape(-1, 1))


print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (2703, 6)
y_train shape: (2703, 1)
X_test shape: (902, 6)
y_test shape: (902, 1)


In [45]:
def train_and_evaluate_model(model, scaler , X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    

    y_train_pred = model.predict(X_train)
    y_train_pred = y_scaler.inverse_transform(y_train_pred.reshape(-1, 1))

    
    y_test_pred = model.predict(X_test)
    y_test_pred = y_scaler.inverse_transform(y_test_pred.reshape(-1, 1))
    
    
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mape = mean_absolute_percentage_error(y_test, y_test_pred)
    
    ret = {
        'model_name' : type(model).__name__ , 
        'y_train_pred' : y_train_pred,
        'y_test_pred' : y_test_pred,
        'test_rmse' : rmse,
        'test_mape' : mape 
        
    }
    
    
    return ret

In [46]:
models = [
         LinearRegression(),
         DecisionTreeRegressor(criterion= 'squared_error' , max_depth = 5),
         RandomForestRegressor(criterion= 'squared_error' , max_depth = 5),
         KNeighborsRegressor(n_neighbors=5 , weights='distance' , algorithm='auto')
         ]



results = [train_and_evaluate_model(model, y_scaler, X_train, y_train, X_test, y_test) for model in models]


fig = make_subplots(rows=2, cols=2  , subplot_titles=[f"{result['model_name']}" for result in results])

row_idx = 1
col_idx = 1

y_train = np.exp(y_scaler.inverse_transform(y_train.reshape(-1, 1)))


for _ , result in enumerate(results):
    
    if row_idx > 2:
        row_idx = 1
        col_idx += 1
    
    y_train_pred = np.exp(result['y_train_pred'])
    res = y_train_pred - y_train

    fig.add_trace(go.Scatter(x=y_train_pred.ravel(), y=res.ravel(), mode='markers' , name = result['model_name'] , hovertext=y_train.ravel() , hoverinfo='text'),
              row=row_idx, col=col_idx)
    
    row_idx += 1


fig.update_layout(
    height=1200,  
    width=1300,
    xaxis_title="Predicted Price",
    yaxis_title="Residuals",  
    title_text="Residual Plots for each model"
    
)


fig.show()

In [47]:
y_test = np.exp(y_scaler.inverse_transform(y_test.reshape(-1, 1)))

fig = make_subplots(rows=2, cols=2  , subplot_titles=[f"{result['model_name']} RMSE = {result['test_rmse']:.2f} MAPE = {result['test_mape']:.2f}" for result in results])

row_idx = 1
col_idx = 1

for _ , result in enumerate(results):
    
    if row_idx > 2:
        row_idx = 1
        col_idx += 1
    
    y_test_pred = np.exp(result['y_test_pred'])

    fig.add_trace(go.Scatter(x=y_test.ravel(), y=y_test_pred.ravel(), mode='markers' , name = result['model_name']),
              row=row_idx, col=col_idx)
    
    row_idx += 1


fig.update_layout(
    xaxis_title="Actual Price",
    yaxis_title="Predicted Price",
    height=1200,  
    width=1300,  
    title_text="Predicted vs acutal prices on test set"
)


fig.show()

