In [11]:
# import libaries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('feature_dataset/nyc_features_final.csv')
df = df.drop(columns = ['Neighborhood'])
print(df.shape)

(24, 15)


In [3]:
val = pd.read_csv('feature_dataset/boston_features_final.csv')
val = val.drop(columns = ['Neighborhood'])
print(val.shape)

(10, 15)


In [4]:
X = df.drop(columns = ['SentimentScore'])
y = df['SentimentScore'].rank(ascending=False, method='min')
X_val = val.drop(columns = ['SentimentScore'])
y_val = val['SentimentScore'].rank(ascending=False, method='min')

# Define the model
- According to the Universal Approximation Theorem, we use one hidden layer NN here.
- Though deeper NN performances better, but we just use the one hidden layer for the interprebality.

In [36]:
class OneLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size, func='relu'):
        super(OneLayerNN, self).__init__()
        self.func = func
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, 1)
    
    def forward(self,x):
        z = self.hidden(x)
        if(self.func == 'relu'):
            a = torch.relu(z) 
        elif(self.func == 'sigmoid'):
            a = nn.Sigmoid(z)
        else:
            raise ValueError("Activation function should be relu or sigmoid.")
        output = self.output(a)
        return output

# Define the train function

In [97]:
# define model and loss functions
device = 'mps' if torch.backends.mps.is_available() else 'cpu'


def train(X_train,y_train,X_test,y_test,hidden_size = 128, epochs=1000, verbose=False):
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32)
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_test = torch.tensor(y_test.values, dtype=torch.float32)
    
    model = OneLayerNN(X_train.shape[1],hidden_size)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr = 0.01)
    criterion = nn.MSELoss()
    train_loss = []
    test_loss = []
    X_test = X_test[0].unsqueeze(0)
    y_test = y_test[0].unsqueeze(0).unsqueeze(0)
    for epoch in range(epochs):
        if((epoch+1) % (epochs/10) == 0):
            if(verbose):
                print(f'{epoch+1}/{epochs} of the training is done')
        model.train()
        losses = 0
        for i in range(len(X_train)):
            sample, label = X_train[i].to(device), y_train[i].to(device)
            sample = sample.unsqueeze(0)
            label = label.unsqueeze(0).unsqueeze(0) 
            optimizer.zero_grad()
            output = model(sample)
            loss = criterion(output, label)
            losses += loss.item()
            loss.backward()
            optimizer.step()
        with torch.no_grad():
            X_test, y_test = X_test.to(device), y_test.to(device)
            output = model(X_test)
            loss = criterion(output, y_test)
        train_loss.append(losses)
        test_loss.append(loss.item())
    
    return train_loss, test_loss

            

In [98]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
count = 1
errors = []
for train_index, test_index in loo.split(X):
    # split the data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    epoch, error = train(X_train,y_train,X_test,y_test)
    errors.append(error)
    print(f'{count}/{len(X)} is done')
    count += 1
    
mean_rmse = np.mean(errors)
print(f"Individual RMSEs: {errors}")
print(f"Mean RMSE: {mean_rmse}")

1/24 is done
2/24 is done
3/24 is done
4/24 is done
5/24 is done
6/24 is done
7/24 is done
8/24 is done
9/24 is done
10/24 is done
11/24 is done
12/24 is done
13/24 is done
14/24 is done
15/24 is done
16/24 is done
17/24 is done
18/24 is done
19/24 is done
20/24 is done
21/24 is done
22/24 is done
23/24 is done
24/24 is done
Individual RMSEs: [[7166300.0, 5961847.5, 313405.125, 293011.75, 430983.59375, 467508.875, 312209.75, 286644.375, 256414.953125, 187518.234375, 207590.046875, 174444.515625, 171253.265625, 146609.859375, 129987.296875, 114050.6796875, 93495.5625, 56373.40625, 25004.451171875, 12133.1484375, 12245.51171875, 21254.580078125, 20317.328125, 6296.7021484375, 202.87989807128906, 7362.2744140625, 37551.81640625, 9939.0302734375, 83.57379913330078, 140467.25, 350924.0625, 1538040.25, 2303114.25, 1260018.875, 5345759.5, 38310.703125, 364290.28125, 7196.28955078125, 85675.0625, 31003.947265625, 41020.28125, 36908.76171875, 41422.62109375, 50560.87890625, 59143.171875, 66257.