### Load Data

In [1]:
from fastNLP.io import SSTLoader

# initialize the SSTLoader
loader = SSTLoader()
# download the dataset to the default cache directory and returns the directory
data_dir = loader.download()
# load the dataset from the directory to a DataBundle   
data_bundle = loader.load(data_dir)  

In [2]:
print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 8544 instances.



### Preprocessing

In [3]:
from fastNLP.io import SSTPipe

pipe = SSTPipe(subtree=False, 
               train_subtree=True, 
               lower=False, 
               granularity=5, tokenizer='spacy')

# (1) tokenization;
# (2) create vocabulary and indices from words;
data_bundle = pipe.process(data_bundle)  

print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 318582 instances.
In total 2 vocabs:
	words has 20204 entries.
	target has 5 entries.



In [4]:
print(data_bundle.get_dataset('train')[:5])

+------------------------+--------+------------------------+---------+
| raw_words              | target | words                  | seq_len |
+------------------------+--------+------------------------+---------+
| The Rock is destine... | 1      | [21, 1215, 11, 5536... | 39      |
| The Rock               | 0      | [21, 1215]             | 2       |
| The                    | 0      | [21]                   | 1       |
| Rock                   | 0      | [1215]                 | 1       |
| is destined to be t... | 3      | [11, 5536, 8, 26, 2... | 37      |
+------------------------+--------+------------------------+---------+


In [5]:
vocab = data_bundle.get_vocab('words')
print(vocab)

Vocabulary(['The', 'Rock', 'is', 'destined', 'to']...)


In [6]:
index = vocab.to_index('Rock')
print("The index of the word 'Rock' is {}".format(index))
print("index:{} corresponds to the word {}".format(index, vocab.to_word(index)))

The index of the word 'Rock' is 1215
index:1215 corresponds to the word Rock


### Word Embedding

In [7]:
from fastNLP.embeddings import StaticEmbedding

glove_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-42b-300d', requires_grad=True)

Found 14858 out of 20204 words in the pre-training embedding.


### Load Training/Testing/Validation Set

In [8]:
train_data = data_bundle.get_dataset('train')
test_data = data_bundle.get_dataset('test')
val_data = data_bundle.get_dataset('dev')

print("#entries in training set:{}\n#entries in testing set:{}\n#entries in validation set:{}\n"
      .format(len(train_data), len(test_data), len(val_data)))

#entries in training set:318582
#entries in testing set:2210
#entries in validation set:1101



In [9]:
# NOTE: field whose is_input is true will become batch_x for iterations in DataSetIter 
#       while field whose is_target is true will become batch_y for iterations in DataSetIter

train_data.print_field_meta()

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.prettytable.PrettyTable at 0x7f5becb3b890>

### Create Model

In [23]:
from fastNLP.modules import LSTM, MLP
from torch import nn

# dimention for word embedding
EMBED_DIM = 300

class LSTMText(nn.Module):
    def __init__(self, embed, num_classes, hidden_dim=64, num_layers=2, dropout=0.5):
        super().__init__()
        
        self.embedding = embed
        self.lstm = LSTM(self.embedding.embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)
        self.mlp = MLP([hidden_dim*2,num_classes], dropout=dropout)

    def forward(self, words):
        embedded = self.embedding(words)
        _,(hidden,_) = self.lstm(embedded)
        pred = self.mlp(torch.cat((hidden[-1],hidden[-2]),dim=1))
        return {"pred":pred}
    
model_lstm = LSTMText(embed=glove_embed, num_classes=5, 
                      hidden_dim=128, num_layers=2, dropout=0.5)

print(model_lstm)

LSTMText(
  (embedding): StaticEmbedding(
    (dropout_layer): Dropout(p=0, inplace=False)
    (embedding): Embedding(19326, 300, padding_idx=0)
  )
  (lstm): LSTM(
    (lstm): LSTM(300, 128, num_layers=2, batch_first=True, bidirectional=True)
  )
  (mlp): MLP(
    (hiddens): ModuleList()
    (output): Linear(in_features=256, out_features=5, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


### Evaluation Metric

In [24]:
from fastNLP import AccuracyMetric
from fastNLP import Const

# `pred` corresponds to one key from the returned dict by the `forward` method of the model
# `target` corresponds to the field name representing the lable of the DataSet
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

### Loss Function & Optimizer

In [25]:
from fastNLP import CrossEntropyLoss

loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

In [26]:
import torch.optim as optim

optimizer = optim.Adam(model_lstm.parameters(), lr=1e-3, betas=(0.9, 0.999), 
                       eps=1e-08, weight_decay=0, amsgrad=False)


### Train the Model

In [28]:
from fastNLP import Trainer
import torch

N_EPOCHS = 5 
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=val_data, 
                  loss=loss, metrics=metrics, optimizer=optimizer,
                  n_epochs=N_EPOCHS, batch_size=BATCH_SIZE, device=device, 
                  save_path='./saved_models/sst5-lstm.pt')
trainer.train()

input fields after batch(if batch size is 2):
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 39]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2021-04-18-10-43-06-089928


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=49780.0), HTML(value='')), layout=Layout(…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.33 seconds!


  "type " + obj.__name__ + ". It won't be checked "


Evaluation on dev at Epoch 1/5. Step:9956/49780: 
AccuracyMetric: acc=0.425068



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.33 seconds!
Evaluation on dev at Epoch 2/5. Step:19912/49780: 
AccuracyMetric: acc=0.400545



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.23 seconds!
Evaluation on dev at Epoch 3/5. Step:29868/49780: 
AccuracyMetric: acc=0.416894



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.17 seconds!
Evaluation on dev at Epoch 4/5. Step:39824/49780: 
AccuracyMetric: acc=0.404178



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.13 seconds!
Evaluation on dev at Epoch 5/5. Step:49780/49780: 
AccuracyMetric: acc=0.419619

Reloaded the best model.

In Epoch:1/Step:9956, got best dev performance:
AccuracyMetric: acc=0.425068


{'best_eval': {'AccuracyMetric': {'acc': 0.425068}},
 'best_epoch': 1,
 'best_step': 9956,
 'seconds': 738.6}

### Test the Model

In [29]:
from fastNLP import Tester

tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())
tester.test()

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=139.0), HTML(value='')), layout=Layout(di…

Evaluate data in 0.46 seconds!
[tester] 
AccuracyMetric: acc=0.462443


{'AccuracyMetric': {'acc': 0.462443}}