In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import json
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [2]:
with open('./data_full.json') as f:
    full_data = json.load(f)
train, val, test, labels = [], [], [], []

In [3]:
# choose 20 random labels
train_index, val_index, test_index = 500, 100, 150
for i in range(20):
    labels.append(full_data['train'][train_index][1])
    train += full_data['train'][train_index : train_index+100]
    train_index += 100
    val += full_data['val'][val_index : val_index+20]
    val_index += 20
    test += full_data['test'][test_index : test_index+30]
    test_index += 30

In [4]:
# check the chosen labels
print(labels)
label_dict = {}
for i in range(20):
    label_dict[labels[i]] = i

['insurance_change', 'find_phone', 'travel_alert', 'pto_request', 'improve_credit_score', 'fun_fact', 'change_language', 'payday', 'replacement_card_duration', 'time', 'application_status', 'flight_status', 'flip_coin', 'change_user_name', 'where_are_you_from', 'shopping_list_update', 'what_can_i_ask_you', 'maybe', 'oil_change_how', 'restaurant_reservation']


In [5]:
# create dataset from the chosen labels
train = pd.DataFrame(train, columns = ['text', 'label'])
val = pd.DataFrame(train, columns = ['text', 'label'])
test = pd.DataFrame(train, columns = ['text', 'label'])

In [6]:
# quantify the labels
train['label'] = train['label'].apply(lambda x: label_dict[x])
val['label'] = val['label'].apply(lambda x: label_dict[x])
test['label'] = test['label'].apply(lambda x: label_dict[x])

In [7]:
# store lengths for the text data
train['length'] = train['text'].apply(lambda x: len(x.split()))
val['length'] = val['text'].apply(lambda x: len(x.split()))
test['length'] = test['text'].apply(lambda x: len(x.split()))

In [8]:
# tokenize the text data
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [9]:
# create vocabulary from words and convert to indices
counts = Counter()
for index, row in train.iterrows():
    counts.update(tokenize(row['text']))
    
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
        
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [10]:
def encode_sentence(text, vocab2index, N=10):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [11]:
# store encoded vectors and their lengths for the sentences
train['encoded'] = train['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
val['encoded'] = val['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
test['encoded'] = test['text'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))

In [12]:
class IntentsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [13]:
# create the dataset in required form
X_train, Y_train = train['encoded'], train['label']
X_val, Y_val = val['encoded'], val['label']
X_test, Y_test = test['encoded'], test['label']

train_ds = IntentsDataset(X_train, Y_train)
valid_ds = IntentsDataset(X_val, Y_val)
test_ds = IntentsDataset(X_test, Y_test)

In [14]:
# training and evaluation
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [15]:
# intialization
batch_size = 50
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [16]:
# define the lstm model
class LSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 20)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out

In [17]:
model = LSTM(vocab_size, 50, 50)

In [18]:
# train the model
train_model(model, epochs=30, lr=0.01)

train loss 0.443, val loss 0.136, val accuracy 0.969
train loss 0.051, val loss 0.008, val accuracy 0.999
train loss 0.015, val loss 0.002, val accuracy 1.000
train loss 0.013, val loss 0.006, val accuracy 0.999
train loss 0.014, val loss 0.001, val accuracy 1.000
train loss 0.011, val loss 0.001, val accuracy 1.000


In [19]:
torch.save(model.state_dict(), './trained_model')

In [20]:
# testing on test dataset
test_loss, test_acc, test_rmse = validation_metrics(model, test_dl)
print("test loss %.3f, test accuracy %.3f" % (test_loss, test_acc))

test loss 0.000, test accuracy 1.000


### Discussion

Input feature representation: For the purpose of saving time I decided to go with a simple bag of words model by counting the number of word in the vocabulary and assigning them an index.

Model Architecture: I chose to use an LSTM because it can best encode memory into a sentence and retain older information and relationships between parts of sentences that a simple linear classifier cannot.

Training parameters: Again, for shortage of time, I did a simple tuning of the learning rate for a good loss decrease but could not exoeriment much with embedding sizes or multiple lstm layers.

Evaluation: Cross Entropy Loss works well for classification in balanced dataset, so I decided to go with that. I also check accuracy as is custom.

Future work: Improve feature representations using learned embeddings like word2vec or glove. Tune other hyperparameters such as embedding size, hidden size. 