In [31]:
# import libaries
import numpy as np
import pandas as pd

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

In [66]:
df = pd.read_csv('feature_dataset/nyc_features_final.csv')
df = df.drop(columns = ['Neighborhood'])
print(df.shape)

(24, 15)


In [68]:
val = pd.read_csv('feature_dataset/boston_features_final.csv')
val = val.drop(columns = ['Neighborhood'])
print(val.shape)

(10, 15)


In [69]:
X = df.drop(columns = ['SentimentScore'])
y = df['SentimentScore'].rank(ascending=False, method='min')
X_val = val.drop(columns = ['SentimentScore'])
y_val = val['SentimentScore'].rank(ascending=False, method='min')

In [70]:
pd.set_option('display.max_columns', None)  # for debug

# Linear Regression

In [71]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")

Individual RMSEs: [24.54462695005715, 7.570959611653436, 6.37292978462817, 1.7690428059412255, 9.460883862604447, 0.08755271200507764, 6.846757188955351, 0.06881682076094364, 10.180100873914284, 16.270705667198, 17.674763220445186, 3.7595519787172833, 19.17347215274617, 2.121831151570788, 0.9296949795323712, 13.703634971697142, 10.956005853284793, 21.297800281321955, 4.964674076059254, 9.785616063801243, 38.265514346803954, 33.25207402566551, 37.35176094708305, 15.453290183677169]
Mean RMSE: 12.994252521255165
Weights (Coefficients): [-0.44198188254007503, -0.07235192615095021, -0.2617413529233815, -0.1436602830208433, -0.07645911668476771, -0.13731670570554746, -0.09501533901902127, -0.13713167328429662, -0.13656474117952624, 0.18215924905530081, 0.03099268335334702, -0.11352888424236592, -0.6554906800886398, -0.128819290200421, -0.14834605450430755, -0.17538295081576882, -0.03270911632399867, -0.010930550605936059, -0.1730810737577186, -0.1892790284824883, -0.2731014924643862, -0.736

## Result
1. no need to do grid search

# Lasso

In [80]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.linear_model import Lasso
model = Lasso(alpha=46415.888336127726)

## set the grid search
#param_grid = {'alpha': np.logspace(-10, -4, 10)}  # alphaの値を0.001から10まで対数スケールで探索
#lasso = Lasso()
#
## use MSE as the evaluation score
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
## execute grid search
#grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
## show the result
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", grid_search.best_score_)
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"alpha: {params['alpha']}, RMSE: {mean_test_score}")


# list to calculate RSME
errors = []
weights = []  
intercepts = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    weights.append(model.coef_[0])  
    intercepts.append(model.intercept_)
    
    # predict
    y_pred = model.predict(X_test)
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Weights (Coefficients): {weights}")
print(f"Intercepts: {intercepts}")


Individual RMSEs: [9.91304347826087, 4.695652173913043, 7.826086956521738, 4.695652173913043, 9.91304347826087, 6.782608695652174, 5.739130434782609, 1.5652173913043477, 7.826086956521738, 6.782608695652174, 12.0, 3.6521739130434785, 8.869565217391305, 0.5217391304347831, 3.6521739130434785, 2.608695652173912, 5.739130434782609, 0.5217391304347831, 2.608695652173912, 8.869565217391305, 12.0, 1.5652173913043477, 10.956521739130435, 10.956521739130435]
Mean RMSE: 6.260869565217391
Weights (Coefficients): [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
Intercepts: [12.08695652173913, 12.695652173913043, 12.173913043478262, 12.304347826086957, 12.91304347826087, 12.782608695652174, 12.73913043478261, 12.565217391304348, 12.826086956521738, 12.217391304347826, 13.0, 12.652173913043478, 12.869565217391305, 12.478260869565217, 12.347826086956522, 12.391304347826088, 12.26086956521739, 12.521739130

## Result
1. The model will converge lager the alpha = 1e-2, so the range of grid search will be 1e-2~1e10.
2. The best parameter is 46415.888336127726.
3. No doubt that the Lasso is better than the linear regression.

# Random Forests

In [82]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth= None, min_samples_split= 2, n_estimators= 200,random_state=42)

## set grid search
#param_grid = {
#    'n_estimators': [50, 100, 200],  # the number of trees
#    'max_depth': [None, 3, 5],       # the depth
#    'min_samples_split': [2, 5]     # samples needed to split
#}
#
## initialize random forest
#rf = RandomForestRegressor(random_state=42)
#loo = LeaveOneOut()
#
## use RMSE as the evaluation metric
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
## do grid search
#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
## show the result
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_)  
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}") # reverse the negative score


# list to calculate RSME
errors = []
feature_importances = [] 
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    feature_importances.append(model.feature_importances_[0])
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Feature Importances: {feature_importances}")
print(f"Predictions: {predictions}")


Individual RMSEs: [8.295, 2.914999999999999, 6.24, 1.75, 11.1, 6.220000000000001, 5.359999999999999, 1.4000000000000004, 6.885, 5.875, 15.350000000000001, 0.33000000000000007, 6.140000000000001, 0.6999999999999993, 4.529999999999999, 0.0649999999999995, 6.539999999999999, 1.0099999999999998, 1.2449999999999992, 9.09, 15.96, 1.6099999999999994, 11.44, 10.97]
Mean RMSE: 5.8758333333333335
Feature Importances: [0.043001538461551254, 0.02176238875398498, 0.02216508700613175, 0.037033583689724474, 0.025532747249473374, 0.023601510261014206, 0.02836451842525643, 0.036967475650471385, 0.03540790367347639, 0.034392555950860604, 0.03855854179812625, 0.022535189587409844, 0.020117241748763292, 0.02580841060053149, 0.03744777102937323, 0.031969410787008296, 0.029476786133001625, 0.028367031369100144, 0.02865555514062782, 0.035434652310560276, 0.047329986478379556, 0.03200632001861963, 0.04111344566262217, 0.03637326415185113]
Predictions: [13.705, 10.915, 13.76, 15.25, 14.1, 12.22, 12.36, 12.4, 1

## Result
1. It does not perform well compared to the Decision Tree.
2. May be it is more robust.

# Decision Tree

In [86]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth = 3, min_samples_leaf = 1, min_samples_split =  2, random_state=42)


## set grid research
#param_grid = {
#    'max_depth': [None, 2, 3, 5],        
#    'min_samples_split': [2, 5, 10],   
#    'min_samples_leaf': [1, 2, 4]   
#}
#
## initialize decision tree
#dt = DecisionTreeRegressor(random_state=42)
#loo = LeaveOneOut()
#
## same as above
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
#grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X, y)
#
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_) 
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}")  


# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [6.75, 1.0, 4.75, 4.0, 4.5, 8.833333333333334, 1.0, 3.0, 3.5, 2.5, 19.5, 1.0, 1.0, 4.5, 1.0, 0.3333333333333339, 8.571428571428571, 1.833333333333334, 4.166666666666666, 0.0, 15.333333333333334, 0.5, 7.625, 2.5]
Mean RMSE: 4.487351190476191
Predictions: [15.25, 7.0, 15.25, 21.0, 7.5, 14.833333333333334, 8.0, 14.0, 1.5, 21.5, 20.5, 8.0, 3.0, 8.5, 15.0, 15.333333333333334, 9.428571428571429, 13.833333333333334, 14.166666666666666, 21.0, 8.666666666666666, 13.5, 9.625, 20.5]


## Result
1. The result is better than the Lasso.
2. So far, it is SOTA.

# SVM

In [89]:
# Leave-One-Out Cross-Validation 
loo = LeaveOneOut()

# define model
from sklearn.svm import SVR
model = SVR(C = 10, epsilon =  0.01, kernel = 'rbf')

## scaling the features
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
#
## set grid research
#param_grid = {
#    'C': [0.1, 1, 10, 100],         
#    'epsilon': [0.01, 0.1, 0.2],  
#    'kernel': ['linear', 'rbf']     
#}
#
## initial SVR
#svr = SVR()
#loo = LeaveOneOut()
#
#def neg_rmse(y_true, y_pred):
#    return -np.sqrt(mean_squared_error(y_true, y_pred))
#
#scorer = make_scorer(neg_rmse)
#
#grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=loo, scoring=scorer)
#grid_search.fit(X_scaled, y)
#
#
#print("Best Parameters:", grid_search.best_params_)
#print("Best RMSE:", -grid_search.best_score_) 
#print("All Results:")
#for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score']):
#    print(f"Params: {params}, RMSE: {-mean_test_score}")  

# list to calculate RSME
errors = []
predictions = []  

# loop to do the leave one out
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # learn the model
    model.fit(X_train, y_train)
    
    # predict
    y_pred = model.predict(X_test)
    predictions.append(y_pred[0])
    
    # add the error to the list
    error = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE
    errors.append(error)

# caculate the average
mean_rmse = np.mean(errors)

print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")
print(f"Predictions: {predictions}")

Individual RMSEs: [9.790452536622347, 3.578269289945869, 7.875028934546162, 4.881792577733821, 7.859007314868007, 6.902519496379485, 5.884742605947743, 0.4834468874172835, 8.663952169667462, 6.65317406151849, 11.539696443024587, 1.5148186271405972, 9.867587234196863, 1.3632830966999716, 3.9385905417928395, 2.0282362829489493, 4.974767076338855, 0.5051572446608183, 3.70729364392154, 8.68268099192101, 13.648089367651515, 2.3317017736759738, 10.024673733513, 11.688233273620144]
Mean RMSE: 6.182799800239722
Predictions: [12.209547463377653, 11.578269289945869, 12.124971065453838, 12.11820742226618, 10.859007314868007, 12.902519496379485, 12.884742605947743, 11.483446887417283, 13.663952169667462, 12.34682593848151, 12.539696443024587, 10.514818627140597, 13.867587234196863, 11.636716903300028, 12.06140945820716, 12.97176371705105, 13.025232923661145, 11.494842755339182, 13.70729364392154, 12.31731900807899, 10.351910632348485, 16.331701773675974, 12.024673733513, 11.311766726379856]


## Result
1. It needs to explore more.