In [91]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
import matplotlib.pyplot as plt

## Understanding and Cleaning the Data

In [92]:
df = pd.read_csv("/kaggle/input/new-york-housing-market/NY-House-Dataset.csv")
df.head()

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,ADDRESS,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME,LONG_NAME,FORMATTED_ADDRESS,LATITUDE,LONGITUDE
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,315000,2,2.0,1400.0,2 E 55th St Unit 803,"New York, NY 10022","2 E 55th St Unit 803New York, NY 10022",New York County,New York,Manhattan,East 55th Street,Regis Residence,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483
1,Brokered by Serhant,Condo for sale,195000000,7,10.0,17545.0,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County,New York,West 57th Street,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991
2,Brokered by Sowae Corp,House for sale,260000,4,2.0,2015.0,620 Sinclair Ave,"Staten Island, NY 10312","620 Sinclair AveStaten Island, NY 10312",United States,New York,Richmond County,Staten Island,Sinclair Avenue,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109
3,Brokered by COMPASS,Condo for sale,69000,3,1.0,445.0,2 E 55th St Unit 908W33,"Manhattan, NY 10022","2 E 55th St Unit 908W33Manhattan, NY 10022",United States,New York,New York County,New York,East 55th Street,"2 E 55th St, New York, NY 10022, USA",40.761398,-73.974613
4,Brokered by Sotheby's International Realty - E...,Townhouse for sale,55000000,7,2.373861,14175.0,5 E 64th St,"New York, NY 10065","5 E 64th StNew York, NY 10065",United States,New York,New York County,New York,East 64th Street,"5 E 64th St, New York, NY 10065, USA",40.767224,-73.969856


In [93]:
df.describe()

Unnamed: 0,PRICE,BEDS,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE
count,4801.0,4801.0,4801.0,4801.0,4801.0,4801.0
mean,2356940.0,3.356801,2.373861,2184.207862,40.714227,-73.941601
std,31355250.0,2.602315,1.946962,2377.140894,0.087676,0.101082
min,2494.0,1.0,0.0,230.0,40.499546,-74.253033
25%,499000.0,2.0,1.0,1200.0,40.639375,-73.987143
50%,825000.0,3.0,2.0,2184.207862,40.726749,-73.949189
75%,1495000.0,4.0,3.0,2184.207862,40.771923,-73.870638
max,2147484000.0,50.0,50.0,65535.0,40.912729,-73.70245


In [94]:
df.shape

(4801, 17)

In [95]:
df.columns

Index(['BROKERTITLE', 'TYPE', 'PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT',
       'ADDRESS', 'STATE', 'MAIN_ADDRESS', 'ADMINISTRATIVE_AREA_LEVEL_2',
       'LOCALITY', 'SUBLOCALITY', 'STREET_NAME', 'LONG_NAME',
       'FORMATTED_ADDRESS', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [96]:
df.isnull().sum()

BROKERTITLE                    0
TYPE                           0
PRICE                          0
BEDS                           0
BATH                           0
PROPERTYSQFT                   0
ADDRESS                        0
STATE                          0
MAIN_ADDRESS                   0
ADMINISTRATIVE_AREA_LEVEL_2    0
LOCALITY                       0
SUBLOCALITY                    0
STREET_NAME                    0
LONG_NAME                      0
FORMATTED_ADDRESS              0
LATITUDE                       0
LONGITUDE                      0
dtype: int64

#### Droping some categorical features since the analysis is focused on price and market trends

In [97]:
df=df.drop(['BROKERTITLE','ADDRESS','MAIN_ADDRESS','ADMINISTRATIVE_AREA_LEVEL_2',
                'LONG_NAME','FORMATTED_ADDRESS','LATITUDE','LONGITUDE'],axis=1)

In [98]:
df.head()

Unnamed: 0,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,LOCALITY,SUBLOCALITY,STREET_NAME
0,Condo for sale,315000,2,2.0,1400.0,"New York, NY 10022",New York,Manhattan,East 55th Street
1,Condo for sale,195000000,7,10.0,17545.0,"New York, NY 10019",New York,New York County,New York
2,House for sale,260000,4,2.0,2015.0,"Staten Island, NY 10312",New York,Richmond County,Staten Island
3,Condo for sale,69000,3,1.0,445.0,"Manhattan, NY 10022",New York,New York County,New York
4,Townhouse for sale,55000000,7,2.373861,14175.0,"New York, NY 10065",New York,New York County,New York


In [99]:
df.duplicated().sum()

240

In [100]:
df=df.drop_duplicates()

In [101]:
LE = LabelEncoder()
df['TYPE']=LE.fit_transform(df['TYPE'])
df['LOCALITY']=LE.fit_transform(df['LOCALITY'])
df['SUBLOCALITY']=LE.fit_transform(df['SUBLOCALITY'])
df['STREET_NAME']=LE.fit_transform(df['STREET_NAME'])
df['STATE']=LE.fit_transform(df['STATE'])

In [102]:
df.head()

Unnamed: 0,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,LOCALITY,SUBLOCALITY,STREET_NAME
0,2,315000,2,2.0,1400.0,207,4,10,56
1,2,195000000,7,10.0,17545.0,205,4,12,117
2,7,260000,4,2.0,2015.0,301,4,16,153
3,2,69000,3,1.0,445.0,161,4,12,117
4,12,55000000,7,2.373861,14175.0,227,4,12,117


## Buidling different models and finding what is better

In [103]:
X = df.drop(columns="PRICE", axis=1)
y = df["PRICE"]

In [104]:
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

In [114]:
#splitting
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.2)

#### Linear Regression building an evaluation

In [115]:
# Linear Regression
LR = LinearRegression()
LR.fit(X_train, y_train)

In [116]:
#prediction and evaluation metrics
LR_pred = LR.predict(X_test)

mse = mean_squared_error(y_test, LR_pred)
r2 = r2_score(y_test, LR_pred)

print(f"MSE score: {mse}")
print(f"R-squared score: {r2}")

MSE score: 14888483149613.428
R-squared score: -0.22543426608822204


In [118]:
# decission tree preiction and evaluation
dt_model = DecisionTreeRegressor(random_state=42)


dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print(f"Decision Tree MSE: {mse_dt}")
print(f"Decision Tree R-squared: {r2_dt}")

Decision Tree MSE: 5591585784909.588
Decision Tree R-squared: 0.5397703880413145


In [119]:
# random forest
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest MSE: {mse_rf}")
print(f"Random Forest R-squared: {r2_rf}")

Random Forest MSE: 2978713860046.1665
Random Forest R-squared: 0.7548294210839561


In [120]:
# xgboost
import xgboost as xgb

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Fit the model to the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost MSE: {mse_xgb}")
print(f"XGBoost R-squared: {r2_xgb}")

XGBoost MSE: 3846899207280.185
XGBoost R-squared: 0.683371230002624


So it appears that the Random forest regrressor works the best with this data