### Load Data

In [1]:
from fastNLP.io import SSTLoader

# Build the loader, let it download the dataset into its default cache
# directory (download() returns that directory), then read the directory
# into a DataBundle.
loader = SSTLoader()
data_dir = loader.download()
data_bundle = loader.load(data_dir)

In [2]:
# Summarise the splits held by the freshly loaded bundle (before preprocessing).
print(data_bundle)

In total 3 datasets:
	dev has 1101 instances.
	test has 2210 instances.
	train has 8544 instances.



### Preprocessing

In [3]:
from fastNLP.io import SSTPipe

# Preprocessing options, one per line so each setting is easy to spot and tweak.
pipe = SSTPipe(
    subtree=False,
    train_subtree=True,
    lower=False,
    granularity=2,
    tokenizer='spacy',
)

# process() performs:
#   (1) tokenization;
#   (2) vocabulary construction and word -> index conversion.
# NOTE(review): data_bundle is rebound to the processed result, so re-running
# this cell on its own feeds already-processed data back into the pipe --
# re-run the load cell first.
data_bundle = pipe.process(data_bundle)

print(data_bundle)

In total 3 datasets:
	dev has 872 instances.
	test has 1821 instances.
	train has 98794 instances.
In total 2 vocabs:
	words has 19446 entries.
	target has 2 entries.



In [4]:
# Peek at the first five instances of the processed training split.
print(data_bundle.get_dataset('train')[:5])

+------------------------+--------+------------------------+---------+
| raw_words              | target | words                  | seq_len |
+------------------------+--------+------------------------+---------+
| The Rock is destine... | 0      | [25, 1448, 11, 4755... | 39      |
| is destined to be t... | 0      | [11, 4755, 9, 24, 3... | 37      |
| is destined to be t... | 0      | [11, 4755, 9, 24, 3... | 36      |
| destined to be the ... | 0      | [4755, 9, 24, 3, 47... | 35      |
| new                    | 0      | [115]                  | 1       |
+------------------------+--------+------------------------+---------+


In [5]:
# Fetch the vocabulary stored in the bundle under the name 'words'.
vocab = data_bundle.get_vocab('words')
print(vocab)

Vocabulary(['The', 'Rock', 'is', 'destined', 'to']...)


In [6]:
# Round-trip a sample word through the vocabulary: word -> index -> word.
index = vocab.to_index('new')
print(
    "The index of the word 'new' is {}".format(index),
    "index:{} corresponds to the word {}".format(index, vocab.to_word(index)),
    sep="\n",
)

The index of the word 'new' is 115
index:115 corresponds to the word new


### Word Embedding

In [7]:
from fastNLP.embeddings import StaticEmbedding

# Static embedding for `vocab`, initialised from the pre-trained vectors named
# by `model_dir_or_name`; requires_grad=True requests gradients for its weights.
# NOTE(review): no later cell visible in this notebook references glove_embed
# (the CNNText model is built from a (vocab_size, dim) tuple instead) --
# confirm whether it was meant to be passed to the model.
glove_embed = StaticEmbedding(
    vocab,
    model_dir_or_name='en-glove-42b-300d',
    requires_grad=True,
)

Found 14417 out of 19446 words in the pre-training embedding.


### Load Training/Testing/Validation Set

In [8]:
# Pull the three splits out of the bundle in one pass; the bundle calls the
# validation split 'dev'.
train_data, test_data, val_data = (
    data_bundle.get_dataset(split_name) for split_name in ('train', 'test', 'dev')
)

# Report the size of each split.
print(
    "#entries in training set:{}\n#entries in testing set:{}\n#entries in validation set:{}\n".format(
        len(train_data), len(test_data), len(val_data)
    )
)

#entries in training set:98794
#entries in testing set:1821
#entries in validation set:872



In [9]:
# NOTE: fields whose is_input is True become batch_x when iterating with DataSetIter,
#       while fields whose is_target is True become batch_y.
#
# print_field_meta() prints the table itself and also returns it. The trailing
# semicolon stops Jupyter from additionally echoing the returned object's repr
# (`<prettytable.prettytable.PrettyTable at 0x...>`) underneath the table.
train_data.print_field_meta();

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.prettytable.PrettyTable at 0x7f7c7d4b98d0>

### Create Model

In [10]:
from fastNLP.models import CNNText

# dimension of each word-embedding vector
EMBED_DIM = 300

# The embedding argument is a (vocab_size, embed_dim) tuple.
# NOTE(review): this means the model does not use the `glove_embed` object
# created in the Word Embedding cell -- presumably the embedding is freshly
# initialised here; confirm that is the intent.
# Three kernel widths (1, 3, 5) with 100 filters each; num_classes=2 matches
# the two-entry target vocabulary.
model_cnn = CNNText(
    (len(vocab), EMBED_DIM),
    num_classes=2,
    kernel_nums=(100, 100, 100),
    kernel_sizes=(1, 3, 5),
    dropout=0.5,
)

print(model_cnn)

CNNText(
  (embed): Embedding(
    (embed): Embedding(19446, 300)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (conv_pool): ConvMaxpool(
    (convs): ModuleList(
      (0): Conv1d(300, 100, kernel_size=(1,), stride=(1,), bias=False)
      (1): Conv1d(300, 100, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


### Evaluation Metric

In [11]:
from fastNLP import AccuracyMetric, Const

# `pred`   names a key of the dict returned by the model's `forward` method
# `target` names the DataSet field that holds the label
metrics = AccuracyMetric(
    pred=Const.OUTPUT,
    target=Const.TARGET,
)

### Loss Function & Optimizer

In [12]:
from fastNLP import CrossEntropyLoss

# Cross-entropy loss wired to the same prediction/target keys as the accuracy metric.
loss = CrossEntropyLoss(
    pred=Const.OUTPUT,
    target=Const.TARGET,
)

In [13]:
import torch.optim as optim

# Adam over every parameter of the CNN, with each hyper-parameter spelled out
# explicitly so it can be tuned in place.
optimizer = optim.Adam(
    model_cnn.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
)


### Train the Model

In [14]:
import torch
from fastNLP import Trainer

# Training schedule.
BATCH_SIZE = 32
N_EPOCHS = 5

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The validation split is passed as dev_data; the loss, metric and optimizer
# come from the cells above.
# NOTE(review): fastNLP documents `save_path` as a folder that is created when
# missing, so this presumably produces a directory named 'sst2-cnn-rand.pt'
# rather than a single checkpoint file -- confirm, and consider dropping the
# '.pt' suffix.
trainer = Trainer(
    model=model_cnn,
    train_data=train_data,
    dev_data=val_data,
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
    n_epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
    device=device,
    save_path='./saved_models/sst2-cnn-rand.pt',
)
# Last expression of the cell, so the summary returned by train() is displayed.
trainer.train()

input fields after batch(if batch size is 2):
	words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 39]) 
	seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
target fields after batch(if batch size is 2):
	target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

training epochs started 2021-04-19-17-39-18-242916


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=15440.0), HTML(value='')), layout=Layout(…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.06 seconds!
Evaluation on dev at Epoch 1/5. Step:3088/15440: 
AccuracyMetric: acc=0.838303



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.06 seconds!
Evaluation on dev at Epoch 2/5. Step:6176/15440: 
AccuracyMetric: acc=0.831422



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.06 seconds!
Evaluation on dev at Epoch 3/5. Step:9264/15440: 
AccuracyMetric: acc=0.824541



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.06 seconds!
Evaluation on dev at Epoch 4/5. Step:12352/15440: 
AccuracyMetric: acc=0.813073



HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=28.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.09 seconds!
Evaluation on dev at Epoch 5/5. Step:15440/15440: 
AccuracyMetric: acc=0.817661

Reloaded the best model.

In Epoch:1/Step:3088, got best dev performance:
AccuracyMetric: acc=0.838303


{'best_eval': {'AccuracyMetric': {'acc': 0.838303}},
 'best_epoch': 1,
 'best_step': 3088,
 'seconds': 100.99}

### Test the Model

In [15]:
from fastNLP import Tester

# Score the model on the test split. A fresh AccuracyMetric() with default
# argument mapping is used here rather than the `metrics` object given to the
# Trainer.
tester = Tester(
    test_data,
    model_cnn,
    metrics=AccuracyMetric(),
)
# Last expression of the cell, so the metric dict returned by test() is displayed.
tester.test()

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=114.0), HTML(value='')), layout=Layout(di…

Evaluate data in 0.33 seconds!
[tester] 
AccuracyMetric: acc=0.83251


{'AccuracyMetric': {'acc': 0.83251}}