### Load Data

In [1]:
from fastNLP.io import SSTLoader

# Build the SST loader, download the corpus into the default cache directory
# (download() returns that directory), then read it into a DataBundle.
loader = SSTLoader()
data_dir = loader.download()
data_bundle = loader.load(data_dir)

In [2]:
# Show a summary of the loaded DataBundle (its splits and their instance counts).
print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 8544 instances.



### Preprocessing

In [3]:
from fastNLP.io import SSTPipe

# Preprocessing options for SST, one per line so they are easy to tweak.
# NOTE(review): in the recorded run, granularity=2 coincides with a 2-entry
# target vocabulary, and train_subtree=True is presumably why the training
# split grows from 8544 to 98794 instances — confirm against the SSTPipe docs.
pipe = SSTPipe(
    subtree=False,
    train_subtree=True,
    lower=False,
    granularity=2,
    tokenizer='spacy',
)

# process() takes care of:
#   (1) tokenization;
#   (2) building the vocabularies and mapping words to indices.
data_bundle = pipe.process(data_bundle)

print(data_bundle)

In total 3 datasets:
	dev has 872 instances.
	test has 1821 instances.
	train has 98794 instances.
In total 2 vocabs:
	words has 19446 entries.
	target has 2 entries.



In [4]:
# Peek at the first five processed training instances.
train_preview = data_bundle.get_dataset('train')[:5]
print(train_preview)

+------------------------+--------+------------------------+---------+
| raw_words              | target | words                  | seq_len |
+------------------------+--------+------------------------+---------+
| The Rock is destine... | 0      | [25, 1448, 11, 4755... | 39      |
| is destined to be t... | 0      | [11, 4755, 9, 24, 3... | 37      |
| is destined to be t... | 0      | [11, 4755, 9, 24, 3... | 36      |
| destined to be the ... | 0      | [4755, 9, 24, 3, 47... | 35      |
| new                    | 0      | [115]                  | 1       |
+------------------------+--------+------------------------+---------+


In [5]:
# Fetch the vocabulary stored under the name 'words' in the bundle and show it.
vocab = data_bundle.get_vocab('words')
print(vocab)

Vocabulary(['The', 'Rock', 'is', 'destined', 'to']...)


In [6]:
# Round-trip a sample word through the vocabulary: word -> index -> word.
index = vocab.to_index('new')
print("The index of the word 'new' is {}".format(index))

word_back = vocab.to_word(index)
print("index:{} corresponds to the word {}".format(index, word_back))

The index of the word 'new' is 115
index:115 corresponds to the word new


### Word Embedding

In [7]:
from fastNLP.embeddings import StaticEmbedding

# Embedding layer over `vocab`, initialised from the 'en-glove-42b-300d'
# vectors; requires_grad=True leaves the weights trainable.
glove_embed = StaticEmbedding(
    vocab,
    model_dir_or_name='en-glove-42b-300d',
    requires_grad=True,
)

Found 14417 out of 19446 words in the pre-training embedding.


### Load Training/Testing/Validation Set

In [8]:
# Pull the three splits out of the bundle; the 'dev' split serves as validation data.
train_data, test_data, val_data = (
    data_bundle.get_dataset(split_name) for split_name in ('train', 'test', 'dev')
)

split_sizes = (len(train_data), len(test_data), len(val_data))
print("#entries in training set:{}\n#entries in testing set:{}\n#entries in validation set:{}\n"
      .format(*split_sizes))

#entries in training set:98794
#entries in testing set:1821
#entries in validation set:872



In [9]:
# NOTE: fields with is_input=True become batch_x when iterating with DataSetIter,
#       while fields with is_target=True become batch_y.
#
# The trailing semicolon stops the notebook from also echoing the repr of the
# table object returned by print_field_meta() after the table itself is printed.
train_data.print_field_meta();

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.prettytable.PrettyTable at 0x7f45ab17a650>

### Create Model

In [10]:
import torch  # needed by LSTMText.forward (torch.cat); import it here so this cell is self-contained
from fastNLP.modules import LSTM, MLP
from torch import nn

# Dimension of the word embedding. LSTMText itself does not read this constant;
# it takes the size from `embed.embedding_dim`.
EMBED_DIM = 300


class LSTMText(nn.Module):
    """Bidirectional LSTM text classifier on top of a supplied embedding layer.

    Args:
        embed: embedding module; must expose ``embedding_dim`` and be callable
            on the ``words`` batch.
        num_classes: size of the output layer.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout rate handed to the MLP head.
    """

    def __init__(self, embed, num_classes, hidden_dim=64, num_layers=2, dropout=0.5):
        super().__init__()

        self.embedding = embed
        self.lstm = LSTM(self.embedding.embedding_dim, hidden_dim,
                         num_layers=num_layers, bidirectional=True)
        # The head consumes two concatenated hidden states, hence hidden_dim * 2.
        self.mlp = MLP([hidden_dim * 2, num_classes], dropout=dropout)

    def forward(self, words):
        """Classify a batch of index sequences.

        Args:
            words: batch of word indices (presumably a (batch, seq_len) integer
                tensor — confirm against the data iterator).

        Returns:
            dict with a single key ``"pred"`` holding the MLP output.
        """
        # NOTE(review): the dataset also marks `seq_len` as an input field, but
        # it is not accepted here, so the LSTM runs over padded positions too;
        # consider forwarding it if fastNLP's LSTM supports that — confirm.
        embedded = self.embedding(words)
        _, (hidden, _) = self.lstm(embedded)
        # Concatenate the last two entries of the final hidden state; with a
        # bidirectional LSTM these are presumably the two directions of the top
        # layer (torch.nn.LSTM h_n layout) — TODO confirm for fastNLP's wrapper.
        pred = self.mlp(torch.cat((hidden[-1], hidden[-2]), dim=1))
        return {"pred": pred}
    
# Two-class classifier built on the GloVe embedding created earlier.
model_lstm = LSTMText(
    embed=glove_embed,
    num_classes=2,
    hidden_dim=64,
    num_layers=2,
    dropout=0.5,
)

print(model_lstm)

LSTMText(
  (embedding): StaticEmbedding(
    (dropout_layer): Dropout(p=0, inplace=False)
    (embedding): Embedding(18718, 300, padding_idx=0)
  )
  (lstm): LSTM(
    (lstm): LSTM(300, 64, num_layers=2, batch_first=True, bidirectional=True)
  )
  (mlp): MLP(
    (hiddens): ModuleList()
    (output): Linear(in_features=128, out_features=2, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


### Evaluation Metric

In [11]:
from fastNLP import AccuracyMetric
from fastNLP import Const

# How AccuracyMetric is wired up:
#   `pred`   - a key of the dict returned by the model's `forward` method
#   `target` - the name of the DataSet field that holds the label
metrics = AccuracyMetric(
    pred=Const.OUTPUT,
    target=Const.TARGET,
)

### Loss Function & Optimizer

In [12]:
from fastNLP import CrossEntropyLoss

# Cross-entropy loss, wired to the same prediction/target names as the metric.
loss = CrossEntropyLoss(
    pred=Const.OUTPUT,
    target=Const.TARGET,
)

In [13]:
import torch.optim as optim

# Adam over every parameter of the model, with its hyper-parameters spelled out.
optimizer = optim.Adam(
    model_lstm.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)


### Train the Model

In [14]:
from fastNLP import Trainer
import torch

N_EPOCHS = 10
BATCH_SIZE = 32

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# NOTE(review): save_path looks like a file name; confirm whether Trainer
# treats it as a file or as a directory to write checkpoints into.
trainer = Trainer(
    model=model_lstm,
    train_data=train_data,
    dev_data=val_data,
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
    n_epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
    device=device,
    save_path='./saved_models/sst2-lstm.pt',
)
trainer.train()

input fields after batch(if batch size is 2):
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 39]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2021-04-19-17-43-04-647486


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30880.0), HTML(value='')), layout=Layout(…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
Evaluation on dev at Epoch 1/10. Step:3088/30880: 
AccuracyMetric: acc=0.834862



  "type " + obj.__name__ + ". It won't be checked "


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
Evaluation on dev at Epoch 2/10. Step:6176/30880: 
AccuracyMetric: acc=0.847477



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.12 seconds!
Evaluation on dev at Epoch 3/10. Step:9264/30880: 
AccuracyMetric: acc=0.84289



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
Evaluation on dev at Epoch 4/10. Step:12352/30880: 
AccuracyMetric: acc=0.83945



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.1 seconds!
Evaluation on dev at Epoch 5/10. Step:15440/30880: 
AccuracyMetric: acc=0.844037



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.1 seconds!
Evaluation on dev at Epoch 6/10. Step:18528/30880: 
AccuracyMetric: acc=0.834862



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.11 seconds!
Evaluation on dev at Epoch 7/10. Step:21616/30880: 
AccuracyMetric: acc=0.823394



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.1 seconds!
Evaluation on dev at Epoch 8/10. Step:24704/30880: 
AccuracyMetric: acc=0.817661



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.1 seconds!
Evaluation on dev at Epoch 9/10. Step:27792/30880: 
AccuracyMetric: acc=0.818807



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.1 seconds!
Evaluation on dev at Epoch 10/10. Step:30880/30880: 
AccuracyMetric: acc=0.803899

Reloaded the best model.

In Epoch:2/Step:6176, got best dev performance:
AccuracyMetric: acc=0.847477


{'best_eval': {'AccuracyMetric': {'acc': 0.847477}},
 'best_epoch': 2,
 'best_step': 6176,
 'seconds': 392.51}

### Test the Model

In [15]:
from fastNLP import Tester

# Score the trained model on the held-out test split with a fresh accuracy metric.
tester = Tester(
    test_data,
    model_lstm,
    metrics=AccuracyMetric(),
)
tester.test()

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=114.0), HTML(value='')), layout=Layout(di…

Evaluate data in 0.39 seconds!
[tester] 
AccuracyMetric: acc=0.861065


{'AccuracyMetric': {'acc': 0.861065}}