In [None]:
# import libaries
import numpy as np
import pandas as pd

from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

In [62]:
df = pd.read_csv('feature_dataset/NYC_data.csv')
# drop Nan
df = df.dropna(axis = 1)

# drop useless feature and symbols
df.drop(columns=['SentimentScore.1','Neighborhood','Board','Index of housing price appreciation, 1 family building','Index of housing price appreciation, 2-4 family building','Index of housing price appreciation, 5+ family building','Index of housing price appreciation, condominium','Index of housing price appreciation, all property types'], inplace=True)
for col in df.columns:
    df[col] = df[col].map(lambda x: ''.join(filter(str.isdigit, x)) if isinstance(x, str) else x)
X = df.drop(columns = ['SentimentScore'])
y = df['SentimentScore'].rank(ascending=False, method='min')

In [61]:
pd.set_option('display.max_columns', None)  # for debug

# Linear Regression

In [None]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")

Individual RMSEs: [25.40534262040839, 28.719464309850878, 9.625950555577333, 1.886607923147949, 55.01929161565471, 0.7627129326444333, 12.386477589499577, 15.437786324996466, 16.662434726809863, 19.95705756274839, 17.811876967711243, 76.7836466431844, 44.42657509322669, 146.34017117376249, 85.80094020434734, 0.1380681402044388, 3.4270987842775185, 25.74909719409311, 19.377212865938418, 20.974602318745937, 26.896803663085166, 63.46363250360912, 16.676997093817363, 23.974771094944856]
Mean RMSE: 31.571025829261924
Weights (Coefficients): [-0.006407502817827284, -0.000441852220629118, 0.0006417758801631978, -0.0005267275991136151, -0.0046306790800415415, -0.00020850561142950294, 0.0029584201429184546, 0.005489645193283423, -0.0058712230222778055, 0.0016590640278298832, -0.008567705912449397, 0.0016288220925649993, 0.0009839553054340241, 0.0009096939826755653, 0.0013459341030587215, -0.0003100852051924702, -0.0017992124505995634, 0.006003476119283545, 0.003612712892085592, -0.0014828571911

# Lasso

In [86]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import Lasso
model = Lasso(alpha=1e-4)

# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")

Individual RMSEs: [12.165634669666005, 0.6314000750769964, 15.738514406291166, 5.184580085044587, 0.5884948143517761, 4.242825184588156, 18.067770744995798, 7.1535477481077265, 5.829209375471493, 18.909785791824874, 7.6945561014669295, 20.03539232494012, 4.505082041448389, 13.968638476062353, 14.066807393905094, 4.700194838655804, 5.925989712654584, 3.2131211353351574, 15.477142950852624, 7.359879412379513, 34.93402705041017, 5.099828139071036, 9.768161280914143, 10.547243232252185]
Mean RMSE: 10.241992791073612
Weights (Coefficients): [-0.39958734444851524, -0.14364462237600975, 0.024372678147059956, -0.11640346329510745, -0.18119389584742132, -0.2672573060480509, 0.0951794158670457, -0.5508108746267267, -0.2894361680700207, 0.12827772372656301, -0.3963142903210309, -0.06284449849095199, -0.15403217944133146, -0.14725933987172637, -0.09035073521846486, -0.216726382391264, -0.11993186115278119, -0.09561940663911914, -0.40722160528567614, 0.009460707666531655, -0.27218304194999815, -0.1

# Random Forests

In [None]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# list to calculate RSME
errors = []
feature_importances = [] 
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    feature_importances.append(model.feature_importances_[0])
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Feature Importances: {feature_importances}")
print(f"Predictions: {predictions}")

Individual RMSEs: [2.9800000000000004, 0.96, 0.14000000000000057, 1.9800000000000004, 2.79, 1.0300000000000002, 1.4499999999999993, 2.1700000000000017, 0.17999999999999972, 1.0199999999999996, 4.0, 0.6799999999999997, 1.1399999999999997, 0.6999999999999993, 1.1199999999999992, 0.0, 1.1900000000000013, 1.1500000000000004, 3.3000000000000007, 0.4200000000000017, 4.620000000000001, 0.9399999999999995, 1.9300000000000002, 2.66]
Mean RMSE: 1.60625
Feature Importances: [0.003341861515723032, 0.0015016244348558796, 0.0008067076793299138, 0.0018667413592107368, 0.004053191770122354, 0.0006962252004336594, 0.0009090257916979825, 0.0009925249981814117, 0.0006270872991323162, 0.001782456414689827, 0.0008582250608672148, 0.0018756693442621316, 0.0026453510697768763, 0.001972247700242685, 0.001501923069719098, 0.0005971542859680326, 0.000992176873849841, 0.0015574746764950996, 0.0015933509802558005, 0.001324239054928977, 0.0008403706410527468, 0.0020593605896341545, 0.0020103133048071055, 0.0019076

# Decision Tree

In [88]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=42)

# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [1.0, 1.0, 1.0, 5.0, 3.0, 4.0, 5.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0]
Mean RMSE: 1.9166666666666667
Predictions: [21.0, 7.0, 17.0, 18.0, 6.0, 2.0, 12.0, 17.0, 2.0, 22.0, 2.0, 12.0, 5.0, 15.0, 9.0, 20.0, 18.0, 13.0, 15.0, 20.0, 22.0, 10.0, 1.0, 24.0]


# SVM

In [89]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.svm import SVR
model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [9.98407410617398, 4.6446800312732055, 5.951383362824291, 0.9214433651036007, 9.753064589047481, 6.925674855882281, 5.969195041848987, 4.105022372911408, 7.899186547348963, 8.136168343594816, 11.892654799857457, 3.423485117859151, 8.543635813964077, 3.4827847442307824, 3.1383404280148994, 6.896426352645422, 5.011673403487233, 3.0116601412795436, 0.9214242338477892, 8.992543335633565, 12.115929230131185, 1.8633679824749674, 10.962464986946404, 10.929535834844248]
Mean RMSE: 6.478159125884406
Predictions: [12.01592589382602, 12.644680031273206, 12.048616637175709, 12.0785566348964, 12.753064589047481, 12.925674855882281, 12.969195041848987, 11.894977627088592, 12.899186547348963, 11.863831656405184, 12.892654799857457, 12.423485117859151, 12.543635813964077, 10.517215255769218, 13.1383404280149, 12.103573647354578, 11.988326596512767, 11.988339858720456, 12.92142423384779, 12.007456664366435, 11.884070769868815, 12.863367982474967, 12.962464986946404, 12.070464165155752