In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

In [5]:
housing_price=pd.read_csv('NY-House-Dataset.csv')
housing_price.dropna(inplace=True)
target = 'PRICE'
3

3

In [6]:
categorical_cols = ['BROKERTITLE','TYPE','ADDRESS','STATE','MAIN_ADDRESS','ADMINISTRATIVE_AREA_LEVEL_2','LOCALITY','SUBLOCALITY','STREET_NAME','LONG_NAME','FORMATTED_ADDRESS']
numerical_cols = ['PRICE','BEDS','BATH','PROPERTYSQFT','LATITUDE','LONGITUDE']

encoder = OneHotEncoder(drop='first', sparse_output = False)
encoded_data = encoder.fit_transform(housing_price[categorical_cols])

encoded_columns = encoder.get_feature_names_out(categorical_cols)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns, index=housing_price.index)

x = pd.concat([housing_price[numerical_cols], df_encoded], axis=1)
y = housing_price[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [10]:
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

y_pred_linear = linear_model.predict(x_test)

print("Linear Regression Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("Linear Regression R^2 Score:", r2_score(y_test, y_pred_linear))

Linear Regression Mean Squared Error: 22617038008389.867
Linear Regression R^2 Score: -0.14459444151590461


In [11]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

y_pred_rf = rf_model.predict(x_test)

print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Mean Squared Error: 339919328715.0335
Random Forest Regressor R^2 Score: 0.9827974921354989


In [14]:
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42
)

regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Mean Squared Error: 14055630507508.166
Random Forest Regressor R^2 Score: 0.9827974921354989


In [15]:
regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    regressor, x, y, cv=kf, scoring=make_scorer(mean_squared_error)
)

mean_mse = np.mean(scores)
std_mse = np.std(scores)

print(f"Mean MSE from cross-validation: {mean_mse:.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")

Mean MSE from cross-validation: 978753831465017.00
R^2 Score: 0.29


In [17]:
gbm_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(x_train, y_train)

y_pred_gbm = gbm_model.predict(x_test)

print("Gradient Boosting Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_gbm))
print("Gradient Boosting Regressor R^2 Score:", r2_score(y_test, y_pred_gbm))

Gradient Boosting Regressor Mean Squared Error: 90062238277.06476
Gradient Boosting Regressor R^2 Score: 0.9954421645626554


In [19]:
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)
ridge_predictions = ridge.predict(x_test)
print("Ridge MSE:", mean_squared_error(y_test, ridge_predictions))
print("Ridge R^2 Score:", r2_score(y_test, ridge_predictions))

lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)
lasso_predictions = lasso.predict(x_test)
print("Lasso MSE:", mean_squared_error(y_test, lasso_predictions))
print("Lasso R62 Score:", r2_score(y_test, lasso_predictions))

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(x_train, y_train)
elastic_net_predictions = elastic_net.predict(x_test)
print("Elastic Net MSE:", mean_squared_error(y_test, elastic_net_predictions))
print("Elastic Net R^2 Score:", r2_score(y_test, elastic_net_predictions))

Ridge MSE: 9125817298878.18
Ridge R^2 Score: 0.5381641154464651
Lasso MSE: 0.00010241741407229526
Lasso R62 Score: 1.0
Elastic Net MSE: 9171136629287.148
Elastic Net R^2 Score: 0.5358706120416389


  model = cd_fast.enet_coordinate_descent(
