In [31]:
# import libaries
import numpy as np
import pandas as pd

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:
df = pd.read_csv('feature_dataset/NYC_data.csv')
# drop Nan
df = df.dropna(axis = 1)

# drop useless feature and symbols
df.drop(columns=['SentimentScore.1','Neighborhood','Board','Index of housing price appreciation, 1 family building','Index of housing price appreciation, 2-4 family building','Index of housing price appreciation, 5+ family building','Index of housing price appreciation, condominium','Index of housing price appreciation, all property types'], inplace=True)
for col in df.columns:
    df[col] = df[col].map(lambda x: ''.join(filter(str.isdigit, x)) if isinstance(x, str) else x)
X = df.drop(columns = ['SentimentScore'])
y = df['SentimentScore'].rank(ascending=False, method='min')

In [61]:
pd.set_option('display.max_columns', None)  # for debug

# Linear Regression

In [3]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")

Individual RMSEs: [25.40534262040839, 28.719464309850878, 9.625950555577333, 1.886607923147949, 55.01929161565471, 0.7627129326444333, 12.386477589499577, 15.437786324996466, 16.662434726809863, 19.95705756274839, 17.811876967711243, 76.7836466431844, 44.42657509322669, 146.34017117376249, 85.80094020434734, 0.1380681402044388, 3.4270987842775185, 25.74909719409311, 19.377212865938418, 20.974602318745937, 26.896803663085166, 63.46363250360912, 16.676997093817363, 23.974771094944856]
Mean RMSE: 31.571025829261924
Weights (Coefficients): [-0.006407502817827284, -0.000441852220629118, 0.0006417758801631978, -0.0005267275991136151, -0.0046306790800415415, -0.00020850561142950294, 0.0029584201429184546, 0.005489645193283423, -0.0058712230222778055, 0.0016590640278298832, -0.008567705912449397, 0.0016288220925649993, 0.0009839553054340241, 0.0009096939826755653, 0.0013459341030587215, -0.0003100852051924702, -0.0017992124505995634, 0.006003476119283545, 0.003612712892085592, -0.0014828571911

## Result
1. no need to do grid search
2. 

# Lasso

In [25]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import Lasso
model = Lasso(alpha=2.1544346900318822e-07)

## set the grid search
#param_grid = {'alpha': np.logspace(-10, -4, 10)}  # alphaの値を0.001から10まで対数スケールで探索
#lasso = Lasso()
#
## use MSE as the evaluation score
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
## execute grid search
#grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
## show the result
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", grid_search.best_score_)
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"alpha: {params['alpha']}, RMSE: {mean_test_score}")


# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")


Individual RMSEs: [11.982451969205286, 0.1269567617616758, 15.330870823196463, 4.9806056080218895, 0.8091682898960926, 4.283951676079283, 18.643427940983827, 7.1315144310981395, 5.624318390788858, 18.923556918861323, 7.719751974091963, 19.447310425905556, 4.028691402284011, 13.807453977085046, 13.27131286676773, 4.879168198811385, 5.906741810148574, 2.8808268571684152, 16.321209075321775, 7.225603746520662, 34.844535288585824, 5.227746230602463, 9.39131856313918, 10.260775557506904]
Mean RMSE: 10.127052865993013
Weights (Coefficients): [-0.3966080519687018, -0.16730427521292512, 0.02073695961087896, -0.10736806291323153, -0.19548021866783138, -0.26636287330288133, 0.09831798804486057, -0.5526664612887832, -0.30822156654314603, 0.12543748801194046, -0.3978252267799436, -0.06590183506121514, -0.16460732769617195, -0.15034840743618247, -0.10881523509369588, -0.22310539939590893, -0.11960637388608134, -0.11017049101252849, -0.4184619836889878, 0.007091300208744331, -0.2726192752774813, -0.

## Result
1. The model will converge below the alpha = 1e-4, so the range of grid search will be 1e-10~1e-4.
2. The best parameter is 2.1544346900318822e-07.
3. No doubt that the Lasso is better than the linear regression.

# Random Forests

In [None]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth= 3, min_samples_split= 2, n_estimators= 200,random_state=42)

## set grid search
#param_grid = {
#    'n_estimators': [50, 100, 200],  # the number of trees
#    'max_depth': [None, 3, 5],       # the depth
#    'min_samples_split': [2, 5]     # samples needed to split
#}
#
## initialize random forest
#rf = RandomForestRegressor(random_state=42)
#loo = LeaveOneOut()
#
## use RMSE as the evaluation metric
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
## do grid search
#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
## show the result
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_)  
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}") # reverse the negative score


# list to calculate RSME
errors = []
feature_importances = [] 
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    feature_importances.append(model.feature_importances_[0])
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Feature Importances: {feature_importances}")
print(f"Predictions: {predictions}")


Individual RMSEs: [2.9891111111111037, 0.898315476190473, 0.26828571428571735, 2.445293650793646, 2.664255952380951, 1.163559523809523, 1.3362341269841256, 1.825523809523812, 0.6137678571428555, 1.1499960317460278, 3.888932900432902, 0.6811428571428593, 0.8572261904761911, 0.7040714285714262, 1.3970238095238088, 0.03079365079365104, 1.502426587301585, 0.734666666666671, 3.3225972222222193, 0.32642857142857906, 4.31979166666666, 0.38538095238095593, 2.2212916666666658, 2.5086924603174623]
Mean RMSE: 1.593117078523328
Feature Importances: [0.002386125065212781, 0.001682174897408351, 0.0011466841630689243, 0.0009375636528740214, 0.0014167613858935353, 0.0010803061936910854, 0.0011636146845574308, 0.0017801974875295782, 0.0007541526333800193, 0.0008539176802156372, 0.0006725432960132843, 0.0013734988567772263, 0.0017353765935551114, 0.0013464002885213476, 0.0013791713764291621, 0.000982200053372363, 0.0012871733359278395, 0.0013185914038109864, 0.0011335271017208662, 0.0007547260383856773,

## Result
1. So far, the random forest is the SOTA.

# Decision Tree

In [30]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth = None, min_samples_leaf = 1, min_samples_split =  10, random_state=42)


## set grid research
#param_grid = {
#    'max_depth': [None, 2, 3, 5],        
#    'min_samples_split': [2, 5, 10],   
#    'min_samples_leaf': [1, 2, 4]   
#}
#
## initialize decision tree
#dt = DecisionTreeRegressor(random_state=42)
#loo = LeaveOneOut()
#
## same as above
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
#grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_) 
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}")  


# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [0.6000000000000014, 1.8000000000000007, 3.0, 3.5, 0.6000000000000001, 3.0, 3.0, 0.5999999999999996, 1.7999999999999998, 1.8000000000000007, 3.5, 0.5999999999999996, 0.6000000000000001, 1.8000000000000007, 0.5999999999999996, 3.0, 1.8000000000000007, 0.5999999999999996, 3.5, 0.6000000000000014, 3.0, 1.8000000000000007, 1.7999999999999998, 1.8000000000000007]
Mean RMSE: 1.8625
Predictions: [21.4, 9.8, 15.0, 16.5, 3.6, 3.0, 10.0, 15.4, 3.2, 21.8, 4.5, 9.6, 3.4, 15.8, 9.4, 22.0, 15.2, 15.6, 15.5, 21.6, 21.0, 9.2, 3.8, 21.2]


## Result
1. The result is better than the Lasso.
2. A little bit lower than the Random Forest, ,which makes sense.

# SVM

In [39]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.svm import SVR
model = SVR(C = 10, epsilon =  0.01, kernel = 'rbf')

## scaling the features
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
#
## set grid research
#param_grid = {
#    'C': [0.1, 1, 10, 100],         
#    'epsilon': [0.01, 0.1, 0.2],  
#    'kernel': ['linear', 'rbf']     
#}
#
## initial SVR
#svr = SVR()
#loo = LeaveOneOut()
#
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
#grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X_scaled, y)
#
#
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_) 
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}")  

# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [9.364964943601171, 3.554936067260634, 5.177361825052316, 1.1236253064041541, 9.077964570739613, 7.514658119484697, 6.870760667652064, 3.9636953344949237, 8.297987048378229, 8.180267115171453, 12.244660752633623, 1.3952289740076047, 7.219526706557851, 3.2529605214684363, 3.071433481263851, 5.872459145917034, 4.5025259762789105, 2.4770788009097586, 1.4826758835703444, 8.404001079137725, 12.031417772381994, 1.9994202954357156, 11.823605270440499, 10.038632257298241]
Mean RMSE: 6.205910329814202
Predictions: [12.635035056398829, 11.554936067260634, 12.822638174947684, 14.123625306404154, 12.077964570739613, 13.514658119484697, 13.870760667652064, 12.036304665505076, 13.297987048378229, 11.819732884828547, 13.244660752633623, 7.604771025992395, 11.21952670655785, 10.747039478531564, 13.07143348126385, 13.127540854082966, 12.49747402372109, 12.522921199090241, 13.482675883570344, 12.595998920862275, 11.968582227618006, 12.999420295435716, 13.823605270440499, 12.96136774270

## Result
1. It needs to explore more.