### Load Data

In [1]:
from fastNLP.io import SSTLoader

# Build the SST loader, download the corpus into fastNLP's default cache
# directory (download() returns that directory), then read every split found
# there into a single DataBundle.
loader = SSTLoader()
data_dir = loader.download()
data_bundle = loader.load(data_dir)

In [2]:
# Summarise the freshly loaded DataBundle: one line per split with its instance count.
print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 8544 instances.



### Preprocessing

In [3]:
from fastNLP.io import SSTPipe

# Configure the SST preprocessing pipe: 5-way labels, original casing kept,
# spaCy tokenisation. NOTE(review): train_subtree=True appears to be what grows
# the train split from 8544 to 318582 instances in the recorded outputs -- confirm
# against the SSTPipe documentation.
pipe = SSTPipe(
    subtree=False,
    train_subtree=True,
    lower=False,
    granularity=5,
    tokenizer='spacy',
)

# Processing performs:
#   (1) tokenization;
#   (2) vocabulary construction and conversion of words to indices.
data_bundle = pipe.process(data_bundle)

print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 318582 instances.
In total 2 vocabs:
	words has 20204 entries.
	target has 5 entries.



In [4]:
# Preview the first five processed training instances (raw text, label, word indices, length).
print(data_bundle.get_dataset('train')[:5])

+------------------------+--------+------------------------+---------+
| raw_words              | target | words                  | seq_len |
+------------------------+--------+------------------------+---------+
| The Rock is destine... | 1      | [21, 1215, 11, 5536... | 39      |
| The Rock               | 0      | [21, 1215]             | 2       |
| The                    | 0      | [21]                   | 1       |
| Rock                   | 0      | [1215]                 | 1       |
| is destined to be t... | 3      | [11, 5536, 8, 26, 2... | 37      |
+------------------------+--------+------------------------+---------+


In [5]:
# Fetch the vocabulary that the bundle stores under the name 'words'
# (the bundle also holds a 'target' vocabulary for the labels).
vocab = data_bundle.get_vocab('words')
print(vocab)

Vocabulary(['The', 'Rock', 'is', 'destined', 'to']...)


In [6]:
# Round-trip one token through the vocabulary: word -> index -> word.
index = vocab.to_index('Rock')
print("The index of the word 'Rock' is {}".format(index))
print("index:{} corresponds to the word {}".format(index, vocab.to_word(index)))

The index of the word 'Rock' is 1215
index:1215 corresponds to the word Rock


### Word Embedding

In [7]:
from fastNLP.embeddings import StaticEmbedding

# Build an embedding layer for `vocab` from the pre-trained vectors identified by
# 'en-glove-42b-300d'. requires_grad=True keeps the vectors trainable, so they
# are updated together with the rest of the model.
glove_embed = StaticEmbedding(
    vocab,
    model_dir_or_name='en-glove-42b-300d',
    requires_grad=True,
)

Found 14858 out of 20204 words in the pre-training embedding.


### Load Training/Testing/Validation Set

In [8]:
# Pull the three splits out of the bundle; the bundle names the validation split 'dev'.
train_data, test_data, val_data = map(
    data_bundle.get_dataset, ('train', 'test', 'dev')
)

# Report how many instances each split holds.
print(
    "#entries in training set:{}\n#entries in testing set:{}\n#entries in validation set:{}\n".format(
        *map(len, (train_data, test_data, val_data))
    )
)

#entries in training set:318582
#entries in testing set:2210
#entries in validation set:1101



In [9]:
# NOTE: fields whose is_input is True become batch_x when iterating with DataSetIter,
#       while fields whose is_target is True become batch_y.
#
# print_field_meta() prints the table itself and also returns the table object.
# The trailing semicolon stops the notebook from echoing that return value as a
# second, address-dependent output (`<prettytable.prettytable.PrettyTable at 0x...>`).
train_data.print_field_meta();

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.prettytable.PrettyTable at 0x7f8065262590>

### Create Model

In [11]:
import torch
import torch.nn as nn

from fastNLP.core.utils import seq_len_to_mask
from fastNLP.modules import encoder

from fastNLP.embeddings.utils import get_embeddings

class CNNTextPretrained(torch.nn.Module):
    """CNN sentence classifier built on top of a (pre-trained) word embedding.

    Data flow: embedding -> parallel 1-D convolutions, each max-pooled over the
    sequence (``encoder.ConvMaxpool``) -> dropout -> linear layer producing
    one logit per class.
    """

    def __init__(self, embed,
                 num_classes,
                 kernel_nums=(30, 40, 50),
                 kernel_sizes=(1, 3, 5),
                 dropout=0.5):
        """
        :param embed: embedding specification handed to ``get_embeddings``
            (in this notebook a ``StaticEmbedding`` instance).
        :param num_classes: number of output classes.
        :param kernel_nums: output channels of each convolution; the pooled
            feature vector has ``sum(kernel_nums)`` entries.
        :param kernel_sizes: kernel width of each convolution, paired
            position-wise with ``kernel_nums``.
        :param dropout: dropout probability applied to the pooled features.
        """
        super().__init__()

        self.embedding = get_embeddings(embed)
        self.conv_pool = encoder.ConvMaxpool(
            in_channels=self.embedding.embedding_dim,
            out_channels=kernel_nums,
            kernel_sizes=kernel_sizes)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def forward(self, words, seq_len=None):
        """Compute class logits for a batch of index sequences.

        :param words: word indices, shape ``[N, L]``.
        :param seq_len: optional true lengths, shape ``[N]``; when given,
            padded positions are masked out before max-pooling.
        :return: ``{'pred': logits}`` with logits of shape ``[N, num_classes]``.
        """
        x = self.embedding(words)  # [N,L] -> [N,L,C]
        if seq_len is not None:
            # Size the mask to the padded width of `words` rather than to
            # seq_len.max(): the two can differ (e.g. when a batch is split
            # across devices by nn.DataParallel), and a shorter mask would not
            # line up with the convolution output.
            mask = seq_len_to_mask(seq_len, max_len=words.size(1))
            x = self.conv_pool(x, mask)
        else:
            x = self.conv_pool(x)  # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)  # [N,C] -> [N, N_class]
        return {'pred': x}
    
# Instantiate the classifier on the GloVe embedding: three convolution branches
# (widths 1, 3 and 5) with 100 filters each, feeding a 5-way output layer.
model_cnn = CNNTextPretrained(
    embed=glove_embed,
    num_classes=5,
    kernel_nums=(100, 100, 100),
    kernel_sizes=(1, 3, 5),
    dropout=0.5,
)

print(model_cnn)

CNNTextPretrained(
  (embedding): StaticEmbedding(
    (dropout_layer): Dropout(p=0, inplace=False)
    (embedding): Embedding(19326, 300, padding_idx=0)
  )
  (conv_pool): ConvMaxpool(
    (convs): ModuleList(
      (0): Conv1d(300, 100, kernel_size=(1,), stride=(1,), bias=False)
      (1): Conv1d(300, 100, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=5, bias=True)
)


### Evaluation Metric

In [12]:
from fastNLP import AccuracyMetric
from fastNLP import Const

# `pred` names the key, in the dict returned by the model's `forward` method, that holds the predictions.
# `target` names the DataSet field that holds the gold label.
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

### Loss Function & Optimizer

In [13]:
from fastNLP import CrossEntropyLoss

# Cross-entropy loss; `pred` and `target` select the model-output key and the label field to compare.
loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

In [14]:
import torch.optim as optim

# Adam over every model parameter. All hyper-parameters are spelled out so they
# are easy to tune from this cell.
optimizer = optim.Adam(
    params=model_cnn.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)


### Train the Model

In [15]:
from fastNLP import Trainer
import torch

# Training hyper-parameters.
N_EPOCHS = 5
BATCH_SIZE = 32

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The recorded log shows an evaluation on dev_data after each epoch and the
# best-scoring model being reloaded once training finishes.
trainer = Trainer(
    model=model_cnn,
    train_data=train_data,
    dev_data=val_data,
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
    n_epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
    device=device,
    save_path='./saved_models/sst5-cnn-non-static.pt',
)
trainer.train()

input fields after batch(if batch size is 2):
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 39]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2021-04-19-17-22-15-320738


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=49780.0), HTML(value='')), layout=Layout(…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.08 seconds!
Evaluation on dev at Epoch 1/5. Step:9956/49780: 
AccuracyMetric: acc=0.445958



  "type " + obj.__name__ + ". It won't be checked "


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.08 seconds!
Evaluation on dev at Epoch 2/5. Step:19912/49780: 
AccuracyMetric: acc=0.440509



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.07 seconds!
Evaluation on dev at Epoch 3/5. Step:29868/49780: 
AccuracyMetric: acc=0.436876



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.07 seconds!
Evaluation on dev at Epoch 4/5. Step:39824/49780: 
AccuracyMetric: acc=0.423252



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=35.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.08 seconds!
Evaluation on dev at Epoch 5/5. Step:49780/49780: 
AccuracyMetric: acc=0.430518

Reloaded the best model.

In Epoch:1/Step:9956, got best dev performance:
AccuracyMetric: acc=0.445958


{'best_eval': {'AccuracyMetric': {'acc': 0.445958}},
 'best_epoch': 1,
 'best_step': 9956,
 'seconds': 305.76}

### Test the Model

In [16]:
from fastNLP import Tester

# Score the trained model on the held-out test split with a fresh accuracy metric.
tester = Tester(data=test_data, model=model_cnn, metrics=AccuracyMetric())
tester.test()

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=139.0), HTML(value='')), layout=Layout(di…

Evaluate data in 0.27 seconds!
[tester] 
AccuracyMetric: acc=0.438462


{'AccuracyMetric': {'acc': 0.438462}}