# Notebook to make estimations with Neural Networks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import copy
from scipy import stats
from gensim.models import KeyedVectors

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext as tt
from torch.utils.data import TensorDataset, DataLoader

from scripts import preprocess_text

In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer models and stop-word lists).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Importing the necessary data

In [None]:
# Load the pre-trained Word2Vec vectors stored in the word2vec binary format.
embeddings = KeyedVectors.load_word2vec_format(
    '../../data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# English stop words from NLTK, held in a set.
stop_words = {*nltk.corpus.stopwords.words('english')}

In [None]:
# Load the London Airbnb listing descriptions (the 'ward' column is used for grouping below).
descriptions_ = pd.read_csv(
    "../../data/airbnb_listings_description/london_listings_description_ward.csv",
)

In [None]:
# Load the per-ward deprivation table, keep the ward code plus four score
# columns, and give the score columns shorter names.
imd_per_ward = (
    pd.read_csv("../../data/imd_per_ward.csv")[
        [
            'WD17CD',
            'Index of Multiple Deprivation (IMD) Score',
            'Education, Skills and Training Score',
            'Employment Score (rate)',
            'Income Score (rate)',
        ]
    ]
    .rename(
        columns={
            "Index of Multiple Deprivation (IMD) Score": "IMD",
            "Education, Skills and Training Score": "IMD_Edu",
            'Employment Score (rate)': 'IMD_Emp',
            'Income Score (rate)': 'IMD_Inc',
        }
    )
)

## Removing wards with fewer than 5 listings (Ethics)

In [None]:
# One row per ward; every other column becomes the list of that ward's values.
descriptions_per_ward = (
    descriptions_
    .groupby('ward', as_index=False)
    .agg(lambda column: list(column))
)

In [None]:
# Wards that have fewer than 5 descriptions; their listings are removed below.
wards = [
    ward
    for ward, ward_descriptions in zip(
        descriptions_per_ward['ward'],
        descriptions_per_ward['full_description'],
    )
    if len(ward_descriptions) < 5
]

In [None]:
# Row labels of every listing located in one of the wards collected in `wards`.
# A vectorised `isin` test replaces the per-row Python loop that checked
# membership in a list (O(rows x wards)).
rows = descriptions_.index[descriptions_['ward'].isin(wards)].tolist()
# reset_index() without drop=True keeps the old row labels in an 'index' column,
# exactly as before.
descriptions = descriptions_.drop(rows).reset_index()

## Data pre-processing

### Helper functions

In [None]:
# Function to get a subset of the pre-trained Word2Vec library (Not to overload CPU/GPU)
def get_embeddings_subset(w2v, word_list):
  """Build a small KeyedVectors holding only the vectors needed for `word_list`.

  Words missing from `w2v` are represented by the single token 'UNK' (so `w2v`
  must contain 'UNK' whenever at least one word is missing). Row 0 of the
  result is an all-zero vector stored under the key '<0>'; the remaining rows
  follow the first-occurrence order of the (de-duplicated) words.

  Args:
    w2v: source KeyedVectors; membership is tested against `w2v.vocab`.
    word_list: list of tokens. It is NOT modified.

  Returns:
    A new KeyedVectors with '<0>' followed by the distinct known words/'UNK'.
  """
  # Test membership against the model that was passed in (the previous code read
  # the notebook-level `embeddings` here), and build a new list instead of
  # overwriting entries of the caller's list.
  known_or_unk = [word if word in w2v.vocab else 'UNK' for word in word_list]
  # dict.fromkeys de-duplicates while keeping insertion order, so the row order
  # of the subset no longer depends on string hashing.
  unique_words = list(dict.fromkeys(known_or_unk))

  vectors = [w2v[token] for token in unique_words]

  # Use the dimensionality of the source model rather than a hard-coded 300.
  dim = w2v.vector_size
  embeddings_sub = KeyedVectors(dim)
  embeddings_sub.add('<0>', np.zeros(dim))
  if unique_words:
    embeddings_sub.add(unique_words, vectors)

  return embeddings_sub

# Function to get the index of each word in the pre-trained library
def get_embeddings_idx(data, w2v):
  """Translate token sequences into sequences of vocabulary indices.

  Args:
    data: iterable of token sequences.
    w2v: object exposing a `vocab` mapping whose values carry an `index`.

  Returns:
    A list with one list of indices per input sequence. Tokens absent from
    `w2v.vocab` are looked up under the key 'UNK' instead.
  """
  def index_of(token):
    key = token if token in w2v.vocab else 'UNK'
    return w2v.vocab[key].index

  return [[index_of(token) for token in sequence] for sequence in data]

# Function to batchify data per description length
def batchify_per_len(x, y, max_length):
  """Group samples of equal length into batches of at most `max_length` items.

  Every batch only contains sequences of one length, so a batch can be turned
  into a rectangular tensor without padding.

  Args:
    x: sequence of sequences (the samples).
    y: sequence of targets, aligned with `x`.
    max_length: maximum number of samples per batch (must be >= 1).

  Returns:
    A tuple `(batches_x, batches_y)` of parallel lists of batches. Within one
    length, samples keep their input order; the last batch of a length may be
    smaller than `max_length`.

  Raises:
    ValueError: if `max_length` is smaller than 1.
  """
  if max_length < 1:
    raise ValueError("max_length must be a positive integer")

  lengths = [len(sample) for sample in x]

  # Single pass: bucket every (sample, target) pair under its sequence length
  # instead of rescanning the whole dataset once per distinct length.
  buckets = {}
  for length, sample, target in zip(lengths, x, y):
    bucket_x, bucket_y = buckets.setdefault(length, ([], []))
    bucket_x.append(sample)
    bucket_y.append(target)

  batches_x = []
  batches_y = []
  # Iterating set(lengths) visits the lengths in the same order as before.
  for length in set(lengths):
    bucket_x, bucket_y = buckets.get(length, ([], []))
    for start in range(0, len(bucket_x), max_length):
      batches_x.append(bucket_x[start:start + max_length])
      batches_y.append(bucket_y[start:start + max_length])

  return (batches_x, batches_y)

### Pre-processing

In [None]:
# Attach the ward-level scores to every listing, then drop the identifier
# columns and the scores that are not the overall IMD.
descriptions_imd = (
    descriptions
    .merge(imd_per_ward, left_on='ward', right_on="WD17CD")
    .drop(columns=['id', 'ward', 'WD17CD', 'IMD_Edu', 'IMD_Emp', 'IMD_Inc'])
)

In [None]:
# Raw inputs (description texts) and regression targets (IMD scores) as plain lists.
X_original, y_original = (
    descriptions_imd[column].tolist() for column in ('full_description', 'IMD')
)

In [None]:
# Pre-process every description and split it into tokens.
X_tokens = [nltk.word_tokenize(preprocess_text(text)) for text in X_original]

In [None]:
# Collect the distinct tokens of the whole corpus ...
token_list = list({token for tokens in X_tokens for token in tokens})

# ... and keep only the pre-trained vectors needed for them.
embeddings_subset = get_embeddings_subset(embeddings, token_list)

In [None]:
# Replace every token by its row index in the reduced embedding table.
X_embeddings = get_embeddings_idx(data=X_tokens, w2v=embeddings_subset)

In [None]:
# NumPy containers so the data can be shuffled by fancy indexing;
# dtype=object allows the index sequences to have different lengths.
X = np.asarray(X_embeddings, dtype=object)
y = np.asarray(y_original)

## Doing estimations

### Helper functions

In [None]:
# Function to shuffle and split/prepare the data
def get_data(X, y):
    """Shuffle `X` and `y` together and cut them into train/validation/test parts.

    The split points are 64% and 80% of the number of samples (rounded), which
    gives a 64% / 16% / 20% partition.

    Args:
        X: NumPy array of samples.
        y: NumPy array of targets, aligned with `X`.

    Returns:
        `(trainxs, trainys, validxs, validys, testxs, testys)`.
    """
    # One random order, applied to both arrays so pairs stay aligned.
    order = np.random.permutation(len(X))

    n_samples = X.shape[0]
    split_points = [round(0.64 * n_samples), round(0.8 * n_samples)]

    trainxs, validxs, testxs = np.split(X[order], split_points)
    trainys, validys, testys = np.split(y[order], split_points)

    return trainxs, trainys, validxs, validys, testxs, testys

### Model

In [None]:
class Net(nn.Module):
    """Feed-forward regressor on top of mean-pooled word embeddings.

    The embedding table is initialised from the notebook-level
    `embeddings_subset`; five linear layers (300 -> 150 -> 50 -> 25 -> 5 -> 1)
    follow, with ReLU after all but the last one.
    """

    def __init__(self, freeze_embeddings):
        """Create the network.

        Args:
            freeze_embeddings: passed as `freeze` to `nn.Embedding.from_pretrained`;
                True keeps the pre-trained vectors fixed, False fine-tunes them.
        """
        super().__init__()

        pretrained = torch.FloatTensor(embeddings_subset.vectors)
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=freeze_embeddings)

        self.fc1 = nn.Linear(300, 150)
        self.fc2 = nn.Linear(150, 50)
        self.fc3 = nn.Linear(50, 25)
        self.fc4 = nn.Linear(25, 5)
        self.fc5 = nn.Linear(5, 1)

    def forward(self, x):
        """Return one prediction per row of the 2-D index tensor `x`."""
        # Move the sequence axis last so it can be pooled over.
        sequence_last = self.embedding(x).permute(0, 2, 1)
        # A pooling window as wide as the sequence averages all its word vectors.
        sequence_length = sequence_last.size(2)
        hidden = F.avg_pool1d(sequence_last, sequence_length).squeeze(2)
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = F.relu(layer(hidden))
        return self.fc5(hidden)

### Training/Testing functions

In [None]:
# Function to get the model's loss on one specific dataset
def get_scores(model, xs, ys):
  """Return the root-mean-squared error of `model` over a batched dataset.

  The squared errors of all batches are summed and divided by the total number
  of samples before taking the square root, so every sample has the same
  weight. (Averaging one RMSE per batch would give a batch holding a single
  sample the same weight as a full batch.)

  Args:
    model: the network to evaluate; it is switched to eval mode.
    xs: list of input batches (each a rectangular list of index sequences).
    ys: list of target batches, aligned with `xs`.

  Returns:
    The RMSE as a Python float.

  Raises:
    ValueError: if `xs` contains no batch.
  """
  if len(xs) == 0:
    raise ValueError("get_scores needs at least one batch")

  squared_error_sum = nn.MSELoss(reduction='sum')
  model.eval()
  with torch.no_grad():
    total_squared_error = 0
    total_samples = 0
    for batch_x, batch_y in zip(xs, ys):
      x = torch.tensor(batch_x).long().to(device)
      y = torch.FloatTensor(batch_y).to(device)
      output = model(x).squeeze(1)
      total_squared_error += squared_error_sum(output, y)
      total_samples += y.numel()
    rmse = torch.sqrt(total_squared_error / total_samples)
  return rmse.to('cpu').item()

# Function to test the model with the defined metrics (RMSE-MAE-Spearman Corr)
def test_model(model, xs, ys):
  """Evaluate `model` on a test set and report RMSE, MAE and Spearman correlation.

  Predictions are produced batch by batch (batches of up to 128 samples of
  equal length), then all three metrics are computed once over the full set of
  predictions. RMSE and MAE are therefore true per-sample statistics rather
  than unweighted means of per-batch values, which over-weighted small batches.

  Args:
    model: the trained network; it is switched to eval mode.
    xs: array/list of index sequences.
    ys: array/list of targets, aligned with `xs`.

  Returns:
    `(rmse, mae, spearman_correlation)`; the first two are Python floats.

  Raises:
    ValueError: if the test set is empty.
  """
  (xs_batches, ys_batches) = batchify_per_len(xs, ys, 128)
  if not xs_batches:
    raise ValueError("test_model needs a non-empty test set")

  criterion_mse = nn.MSELoss()
  criterion_mae = nn.L1Loss()

  outputs = []
  targets = []

  # Same evaluation setting as get_scores.
  model.eval()
  with torch.no_grad():
    for batch_x, batch_y in zip(xs_batches, ys_batches):
      x = torch.tensor(batch_x).long().to(device)
      y = torch.FloatTensor(batch_y).to(device)
      outputs.append(model(x).squeeze(1))
      targets.append(y)
    outputs = torch.cat(outputs)
    targets = torch.cat(targets)
    loss_rmse = torch.sqrt(criterion_mse(outputs, targets))
    loss_mae = criterion_mae(outputs, targets)
  loss_corr = stats.spearmanr(targets.cpu().numpy(), outputs.cpu().numpy())[0]
  print("Test Set --> RMSE Loss : {} / MAE Loss : {} / Spearman Correlation : {}".format(loss_rmse, loss_mae, loss_corr))
  return loss_rmse.to('cpu').item(), loss_mae.to('cpu').item(), loss_corr

# Function to train a model
def train_model(model, train_x, train_y, valid_x, valid_y, batch_size, learning_rate, max_epochs=1000, num_iterations=15, verbose=True):
  """Train `model` with Adam on an MSE loss, using early stopping on validation loss.

  Each epoch reshuffles the training data, groups it into same-length batches
  and performs one optimisation step per batch. After every epoch the losses
  returned by `get_scores` on the training and validation batches are recorded.
  Training stops once the validation loss has not improved for `num_iterations`
  consecutive epochs, or after `max_epochs` epochs. The loss curves are plotted
  and a summary line is printed.

  Args:
    model: the network to train (updated in place).
    train_x, train_y: NumPy arrays with the training samples / targets.
    valid_x, valid_y: validation samples / targets.
    batch_size: maximum number of samples per batch.
    learning_rate: learning rate given to Adam.
    max_epochs: upper bound on the number of epochs.
    num_iterations: early-stopping patience, in epochs.
    verbose: print the losses after every epoch when True.

  Returns:
    A deep copy of the model taken at the epoch with the lowest validation
    loss. If no epoch ever improved on the initial bound (e.g. no epoch was
    run, or the loss was never a comparable number), `model` itself is returned
    instead of None.
  """
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  criterion = nn.MSELoss()

  # The validation data is never reshuffled, so it only needs batching once.
  (valid_batches_x, valid_batches_y) = batchify_per_len(valid_x, valid_y, batch_size)

  training_losses = []
  validation_losses = []

  # Infinity instead of a magic number: any finite first loss counts as an improvement.
  min_loss = float('inf')
  min_loss_epoch = None
  best_model = None
  convergence_counter = 0

  epoch = 0
  while (convergence_counter < num_iterations and epoch < max_epochs):
    # Shuffle the training dataset
    shuffler = list(np.random.permutation(len(train_x)))
    train_x_shuffled = train_x[shuffler]
    train_y_shuffled = train_y[shuffler]

    # Batchify the data per length of sentence
    (train_batches_x, train_batches_y) = batchify_per_len(train_x_shuffled, train_y_shuffled, batch_size)

    # Iterate through the batches and train
    model.train()
    for batch_x, batch_y in zip(train_batches_x, train_batches_y):
      x = torch.tensor(batch_x).long().to(device)
      y = torch.FloatTensor(batch_y).to(device)
      # Clear gradients before the backward pass so stale gradients can never leak into a step.
      optimizer.zero_grad()
      output = model(x).squeeze(1)
      loss = criterion(output, y)
      loss.backward()
      optimizer.step()

    # Evaluating the model
    model.eval()
    train_loss = get_scores(model, train_batches_x, train_batches_y)
    valid_loss = get_scores(model, valid_batches_x, valid_batches_y)
    # Storing and printing evaluation values
    training_losses.append(train_loss)
    validation_losses.append(valid_loss)
    if (verbose):
      print("Epoch {} | Training Loss : {} | Validation Loss : {}".format(epoch, train_loss, valid_loss))

    # Checking current model's performance, and manage early stopping
    if (valid_loss < min_loss):
      min_loss = valid_loss
      best_model = copy.deepcopy(model)
      min_loss_epoch = epoch
      convergence_counter = 0
    else:
      convergence_counter += 1
    epoch += 1

  # Plot the training/validation curves
  plt.plot(training_losses, label="Training")
  plt.plot(validation_losses, label="Validation")
  plt.legend()
  plt.xlabel('Epochs')
  # get_scores reports a root-mean-squared error, so label the axis accordingly.
  plt.ylabel('Root-Mean-Squared Error Loss')
  plt.show()
  print("Trained for {} epochs | Best Validation Loss : {} (Epoch : {})".format(epoch, min_loss, min_loss_epoch))

  # Never hand None to the caller: fall back to the model as trained so far.
  if best_model is None:
    best_model = model
  return best_model

### Training and testing the models

In [None]:
# Repeat the split / train / test cycle 10 times and keep the test metrics of every run.
RMSEs, MAEs, Corrs = [], [], []
for i in range(10):
    splits = get_data(copy.deepcopy(X), copy.deepcopy(y))
    trainxs, trainys, validxs, validys, testxs, testys = splits
    # Change the freeze_embedding attribute to change between non-fine-tuned and fine-tuned
    model = Net(freeze_embeddings=False).to(device)
    trained_model = train_model(
        model, trainxs, trainys, validxs, validys, 128, 0.01, verbose=False
    )
    rmse, mae, corr = test_model(trained_model, testxs, testys)
    for collected, value in ((RMSEs, rmse), (MAEs, mae), (Corrs, corr)):
        collected.append(value)

## Output the results to .csv

In [None]:
# One row per run, one column per metric.
results = pd.DataFrame({
    'RMSE': RMSEs,
    'MAE': MAEs,
    'Spearman Correlation': Corrs,
})

In [None]:
# Write the per-run metrics to disk without the row index.
results.to_csv(
    "../../data/temp_results/london_w2v_meanPooling_FineTune.csv",
    index=False,
)