## debugging the py files

## 1. preprocessing.py

In [1]:

import json

from sklearn.model_selection import train_test_split

In [2]:
from transformers import DistilBertTokenizer

In [3]:
import pandas as pd
'''
input:
    - data_file: json data file path
    - speech_type: FED or ECB, or both (needs to be implemented)
    - task_type: classif or reg, process the data of the two tasks separatively
    - val_size: validation set size ratio

1. turn each speech into a list of words (for now it's just for summarization, so no need for max_len)
2. get rid of strange tokens (needs to be implemented)

returns:
    - X_train: a list of word_list for training
    - X_val: a list of word_list for validation
    - y_train: label (classification label or regression price) for training
    - y_val: label (classification label or regression price) for validation

'''
def read_data(data_file, speech_type=['ECB', 'FED'], task_type='classif', val_size=0.2):
    with open(data_file, 'r') as fp:
        data = json.load(fp)
    
    #data = json.loads(data)
    
    spch_list = []
    label_list = []
    for data_dict in data:
        label_list.append(data_dict['target_' + task_type])
        
        s = data_dict['speech'][-1]
        if s[speech_type[0]]:
            l = s[speech_type[0]][0].strip()
        else:
            l = s[speech_type[1]][0].strip()
        #words = l.split(' ')
        spch_list.append(l)
    
    print('[Info] Get {} instances from {}'.format(len(spch_list), data_file))

    X_train, X_test, y_train, y_test = train_test_split(spch_list, label_list, test_size=val_size, random_state=42)
    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=val_size, random_state=42)

    train_ds = pd.DataFrame(list(zip(X_train, y_train)),
               columns =['text', 'label'])
    
    dev_ds = pd.DataFrame(list(zip(X_dev, y_dev)),
               columns =['text', 'label'])
    
    test_ds = pd.DataFrame(list(zip(X_test, y_test)),
               columns =['text', 'label'])

    return train_ds, dev_ds, test_ds

In [12]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# get training set and validation set
X_train, X__val, y_train, y_val = read_data(
    'summary_text.json')

# Build vocabulary
#if opt.vocab:
#    predefined_data = torch.load(opt.vocab)
#    assert 'dict' in predefined_data

#    print('[Info] Pre-defined vocabulary found.')
#    src_word2idx = predefined_data['dict']['src']
#    tgt_word2idx = predefined_data['dict']['tgt']
#else:



# word to index
print('[Info] Convert training word instances into sequences of word index.')

X_train_insts = [tokenizer(i, return_tensors='pt') for i in X_train]

print('[Info] Convert validation word instances into sequences of word index.')
X_val_insts = [tokenizer(i, return_tensors='pt') for i in X__val]

data = {
    #'settings': opt,
    'train': {
        'X': X_train_insts,
        'y': y_train},
    'valid': {
        'X': X_val_insts,
        'y': y_val}}

print('[Info] Dumping the processed data to pickle file', 'data_processed')
#torch.save(data, opt.save_data)
print('[Info] Finish.')

[Info] Get 1254 instances from summary_text.json
[Info] Convert training word instances into sequences of word index.


Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


[Info] Convert validation word instances into sequences of word index.
[Info] Dumping the processed data to pickle file data_processed
[Info] Finish.


In [13]:
len(X_train_insts)

1003

In [14]:
X_train_insts[0]['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [48]:
X_train_insts[0]

[100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 3930,
 4254,
 1997,
 8332,
 100,
 100,
 100,
 2006,
 1996,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 1998,
 100,
 100,
 100,
 100,
 12194,
 3343,
 5656,
 1999,
 2204,
 1998,
 2919,
 100,
 8220,
 2013,
 1996,
 3522,
 100,
 100,
 100,
 100,
 1004,
 100,
 1004,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 1997,
 100,
 100,
 100,
 1998,
 100,
 100,
 2019,
 3623,
 1999,
 14338,
 1998,
 1037,
 9963,
 1997,
 4722,
 5157,
 2875,
 27143,
 2550,
 100,
 20242,
 5816,
 3976,
 100,
 100,
 100]

In [47]:
print([sum([j ==  100 for j in i])/len(i) for i in X_train_insts])

[0.6041666666666666, 1.0, 0.3076923076923077, 0.5855855855855856, 0.3055555555555556, 1.0, 1.0, 0.2867132867132867, 0.7411764705882353, 0.6304347826086957, 1.0, 1.0, 0.24761904761904763, 0.0, 0.6666666666666666, 0.9, 0.22916666666666666, 1.0, 0.2422680412371134, 0.14545454545454545, 0.3364485981308411, 0.0, 0.4845360824742268, 0.16304347826086957, 0.1282051282051282, 0.22105263157894736, 0.168141592920354, 1.0, 0.3253968253968254, 0.2077922077922078, 1.0, 0.24060150375939848, 1.0, 0.8076923076923077, 0.1711229946524064, 0.2840909090909091, 0.1956521739130435, 0.27586206896551724, 0.18681318681318682, 0.0, 0.21604938271604937, 0.24242424242424243, 0.18548387096774194, 0.0, 0.2677165354330709, 0.14285714285714285, 0.23076923076923078, 0.26277372262773724, 0.2328767123287671, 0.27485380116959063, 1.0, 0.18627450980392157, 1.0, 0.16666666666666666, 1.0, 0.22608695652173913, 0.23076923076923078, 0.28, 0.2782608695652174, 0.5081081081081081, 0.0, 0.2288135593220339, 1.0, 0.26605504587155965,

In [33]:
s = tokenizer.convert_ids_to_tokens([100])
s

['[UNK]']

In [61]:
ids = tokenizer('this is a test', return_tensors='pt')
ids

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [59]:
ids = tokenizer('This is a test.')
ids

{'input_ids': [101, 2023, 2003, 1037, 3231, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [44]:
import torch

In [45]:
torch.save(data, 'data_processed')

## 2. Dataset.py

In [1]:
import numpy as np
import torch
import torch.utils.data

In [None]:
class SpeechDataset(torch.utils.data.Dataset):
    def __init__(
        self, src_word2idx, tgt_word2idx,
        src_insts=None, tgt_insts=None):

        assert src_insts
        assert not tgt_insts or (len(src_insts) == len(tgt_insts))

        src_idx2word = {idx:word for word, idx in src_word2idx.items()} #字典 key:词汇编号 value:词汇
        self._src_word2idx = src_word2idx
        self._src_idx2word = src_idx2word
        self._src_insts = src_insts

        tgt_idx2word = {idx:word for word, idx in tgt_word2idx.items()}
        self._tgt_word2idx = tgt_word2idx
        self._tgt_idx2word = tgt_idx2word
        self._tgt_insts = tgt_insts

    @property
    def n_insts(self):
        ''' Property for dataset size '''
        return len(self._src_insts) #数据集大小

    @property
    def src_vocab_size(self):
        ''' Property for vocab size '''
        return len(self._src_word2idx) #原文词汇集大小

    @property
    def src_word2idx(self):
        ''' Property for word dictionary '''
        return self._src_word2idx

    @property
    def tgt_word2idx(self):
        ''' Property for word dictionary '''
        return self._tgt_word2idx

    @property
    def src_idx2word(self):
        ''' Property for index dictionary '''
        return self._src_idx2word

    @property
    def tgt_idx2word(self):
        ''' Property for index dictionary '''
        return self._tgt_idx2word

    def __len__(self):
        return self.n_insts

    def __getitem__(self, idx): #按照index取语句 译文语句可能不存在
        if self._tgt_insts:
            return self._src_insts[idx], self._tgt_insts[idx]
        return self._src_insts[idx]

## 3. Model.py

In [4]:
from typing import Tuple
import copy
import gc
import math
import tqdm
import torch
import transformers

In [5]:
class Model(torch.nn.Module):
    def __init__(self, model_name: str, hidden: int, dropout: float) -> None:
        super().__init__()
        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.prediction_model = torch.nn.Sequential(
                torch.nn.Dropout(dropout),
                torch.nn.Linear(self.transformer.config.hidden_size, hidden),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden, 1),
            )
        self.loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, label: torch.Tensor, *args: torch.Tensor, **kwargs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # Feed all arguments to the transformer
        transformer_output = self.transformer(*args, **kwargs)
        # Get a fixed-size representation with mean pooling
        intermediate: torch.Tensor = transformer_output.last_hidden_state.mean(1)
        # Apply our MLP
        output: torch.Tensor = self.prediction_model(intermediate)[:, 0]
        # We feed the output without the sigmoid to the BCEWithLogitsLoss for numerical stability, and only use the prediction for debugging/metric computation
        loss: torch.Tensor = self.loss(output, label.to(dtype=torch.float32))
        prediction: torch.Tensor = torch.nn.functional.sigmoid(output)
        return loss, prediction, transformer_output

In [39]:
class Trainer:
    def __init__(self, tokenizer, train_ds, dev_ds, test_ds, config):
        self.config = config
        self.tokenizer = tokenizer
        self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
        self.prepare_data(train=train_ds, dev=dev_ds, test=test_ds)
        self.prepare_model()

    def prepare_data(self, **kwargs):
        def tokenization(batched_text):
            return self.tokenizer(batched_text['text'], padding='max_length', truncation=True, max_length=self.tokenizer.model_max_length)

        self.dataset = {}
        self.iterator = {}
        for split in ["train", "dev", "test"]:
            data = kwargs[split]
            tokenized = data.map(tokenization,
                                 batched=True,
                                 batch_size=len(data),
                                 remove_columns=['text'])
            tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
            self.dataset[split] = tokenized
            self.iterator[split] = lambda split=split, trainer=self: torch.utils.data.DataLoader(dataset=trainer.dataset[split],
                                                                                                 batch_size=trainer.config["batch_size"],
                                                                                                 shuffle=True)

    def prepare_model(self) -> None:
        self.model = Model(self.config["model_name"], self.config["hidden"], self.config["dropout"])
        self.model.to(self.device)
        self.optimizer = transformers.AdamW(self.model.parameters(),
                                            lr=self.config["learning_rate"],
                                            weight_decay=self.config.get("weight_decay", 0))
        max_iterations: int = self.config["max_epoch"]*self.config.get("batch_per_epoch", len(self.dataset["train"]))
        # Decrease learning rate linearly
        self.scheduler = transformers.get_linear_schedule_with_warmup(self.optimizer, self.config["warmup_step"], max_iterations)
        self.scaler = torch.cuda.amp.GradScaler()  # Used to scale gradients to avoid rounding to 0 when using float16

    def eval(self, split: str) -> float:
        with torch.no_grad():
            # Accumulate gradient to better estimate it even with small batches
            accuracy_accumulator = 0
            num_samples = 0
            self.model.eval()
            for batch in self.iterator[split]():
                batch = {key: value.to(self.device) for key, value in batch.items()}
                with torch.cuda.amp.autocast():  # Use mixed-precision float16
                    loss, prediction, transformer_output = self.model(**batch)
                accuracy_accumulator += ((prediction > 0.5) == batch["label"]).sum().item()
                num_samples += batch["label"].shape[0]
            accuracy = accuracy_accumulator / num_samples
            return accuracy
      
    def train_step(self) -> None:
        """ Apply the gradients to the parameters. """
        self.scaler.unscale_(self.optimizer)  # Gradients are scaled in order to avoid rounding to 0 when casting to float16
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config["max_grad_norm"])  # Clip to gradients to a maximum value (to be robust to outliers)
        self.scaler.step(self.optimizer)  # Add the gradient to the parameters
        self.scheduler.step()  # Update the learning rate
        self.scaler.update()  # Update by how much the gradients should be scaled to avoid rounding to 0
        self.optimizer.zero_grad()  # Set the gradients to 0 to prepare for the next iteration

    def run(self) -> None:
        loop = tqdm.trange(self.config["max_epoch"], desc="Training")
        best_dev = -math.inf
        best_dev_epoch = -math.inf
        best_state_dict = None  # Used to save the best model
        self.train_acc = []
        self.dev_acc = []
        for epoch in loop:
            # Accumulate gradient to better estimate it even with small batches
            accuracy_accumulator = 0
            num_samples = 0
            self.model.train()
            for batch_id, batch in enumerate(self.iterator["train"]()):
                if batch_id >= self.config.get("batch_per_epoch", math.inf):
                    break
                batch = {key: value.to(self.device) for key, value in batch.items()}  
                with torch.cuda.amp.autocast():  # Use mixed-precision float16
                    loss, prediction, transformer_output = self.model(**batch)
                accuracy_accumulator += ((prediction > 0.5) == batch["label"]).sum().item()
                num_samples += batch["label"].shape[0]
                self.scaler.scale(loss).backward()  # Accumulate the (scaled) gradients
                if (1+batch_id) % self.config.get("accumulation", 1) == 0:
                    self.train_step()
            self.train_step()
            accuracy = accuracy_accumulator / num_samples
            dev = self.eval("dev")
            loop.set_postfix(epoch=f"{epoch+1:3}", DEV=f"{dev:.4f}", TRAIN=f"{accuracy:.4f}")
            self.train_acc.append(accuracy)
            self.dev_acc.append(dev)
            if dev > best_dev:  # If dev score improved
                best_dev = dev
                best_dev_epoch = epoch
                best_state_dict = copy.deepcopy(self.model.state_dict())  # Save the weights of the model
            elif epoch - best_dev_epoch > self.config.get("patience", 0):  # If dev score worsened for several steps (or 1 if patience is 0)
                break # Early stopping
        if best_state_dict is not None:  # If we improved over random initialization
            self.model.load_state_dict(best_state_dict)  # Load the model with the best dev score
        for split in ["train", "dev", "test"]:
            print(f"Accuracy on {split} split: {self.eval(split)}")


In [22]:
train_ds_pre_df, dev_ds_pre_df, test_ds_pre_df = read_data(
    'sums.json')

[Info] Get 1254 instances from sums.json


In [24]:
sum(train_ds_pre_df['text'].isna())

0

In [25]:
train_ds_pre_df.to_csv("data_file_pre/train.csv", index=False)
dev_ds_pre_df.to_csv("data_file_pre/dev.csv", index=False)
test_ds_pre_df.to_csv("data_file_pre/test.csv", index=False)

In [26]:
from datasets import load_dataset

data_files = {"train": "train.csv", "dev": "dev.csv", "test": "test.csv"}
dataset_pre = load_dataset('data_file_pre', data_files=data_files)

Using custom data configuration data_file_pre-f04bcf91d7bc917b


Downloading and preparing dataset csv/data_file_pre to /Users/meteor/.cache/huggingface/datasets/csv/data_file_pre-f04bcf91d7bc917b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


100%|██████████| 3/3 [00:00<00:00, 3192.01it/s]
100%|██████████| 3/3 [00:00<00:00, 480.37it/s]


Dataset csv downloaded and prepared to /Users/meteor/.cache/huggingface/datasets/csv/data_file_pre-f04bcf91d7bc917b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 555.05it/s]


In [27]:
train_ds = dataset_pre['train'].shuffle(seed=42)#.select(range(100))
dev_ds = dataset_pre['dev'].shuffle(seed=42)#.select(range(100))
test_ds = dataset_pre['test'].shuffle(seed=42)#.select(range(100))

In [40]:
import gc

trainer_config = {
    "batch_size": 32,
    "accumulation": 16,
    "learning_rate": 2e-5,
    "weight_decay": 0.001,
    "batch_per_epoch": 80,
    "max_epoch": 50,
    "max_grad_norm": 1.0,
    "hidden": 256,  # Size of the hidden layer of the MLP.
    "dropout": 0,  # dropout of the transformers' output, before the MLP.
    "patience": 5,  # If the dev score worsen 3 epoch in a row, stop the training.
    "warmup_step": 20,

}

model = "distilbert-base-uncased"
trainer_config["model_name"] = model
tokenizer = transformers.AutoTokenizer.from_pretrained(model)
trainer = Trainer(tokenizer, train_ds, dev_ds, test_ds, trainer_config)
trainer.run()


100%|██████████| 1/1 [00:00<00:00,  1.93ba/s]
100%|██████████| 1/1 [00:00<00:00,  7.07ba/s]
100%|██████████| 1/1 [00:00<00:00,  5.61ba/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Training:   6%|▌         | 3/50 [2:56:19<46:02:32, 3526.65s/it, DEV=0.5124, TRAIN=0.5686, epoch=3]


KeyboardInterrupt: 

In [16]:
import copy
import numpy as np
import json
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

class PredictDataset(Dataset):

    def __init__(self, data_file, test_size, max_len, tokenizer):
        self.max_len = max_len # use to padding all sequence to a fixed length
        self.tokenizer = tokenizer
        with open(data_file, 'r') as fp:
            data = json.load(fp)
        
        self.data = []
        for data_dict in data:
            s = data_dict['speech'][-1]
            if s['ECB']:
                l = s['ECB'][0].strip()
            else:
                l = s['FED'][0].strip()
            #words = l.split(' ')
            text_token = self.tokenizer(l, padding='max_length', truncation=True, max_length=self.max_len)
            #print(text_token)

            text = torch.tensor(text_token['input_ids'], dtype=float)
            text_att = torch.tensor(text_token['attention_mask'], dtype=int)
            stock = torch.tensor(data_dict['stock'], dtype=float)
            tgt_c = torch.tensor(data_dict['target_classif'], dtype=int)
            tgt_r = torch.tensor(data_dict['target_reg'], dtype=float)

            
            self.data.append({'text_token':text, 'text_att':text_att, 'stock':stock, 'target_classif':tgt_c, 'target_reg':tgt_r})
    

        self.train_data, self.test_data = train_test_split(self.data, test_size=test_size,random_state=0)
        print("="*50)
        print("Data Preprocess Done!")
        print("Dataset size:{}, train:{}, val:{}".
              format(len(self.data),len(self.train_data),len(self.test_data)))
        print("="*50)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def train_set(self):
        '''call this method to switch to train mode'''
        self.data = self.train_data
        return copy.deepcopy(self)

    def test_set(self):
        '''call this method to switch to test mode'''
        self.data = self.test_data
        return copy.deepcopy(self)

In [17]:
from transformers import DistilBertTokenizer
import transformers

model = "distilbert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model)

dataset = PredictDataset('sums.json', 0.2, 512, tokenizer)

Data Preprocess Done!
Dataset size:1254, train:1003, val:251


In [77]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Generator
class Encoder(nn.Module):
    """
    Encoder.
    """

    def __init__(self, model_name, dropout=0.5, fine_tune=True):
        super(Encoder, self).__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        #self.output_size = output_size

        if fine_tune:
            self.fine_tune()

    def forward(self, speech_text):
        """
        Forward propagation.

        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images
        """
        out = self.transformer(speech_text)  # (batch_size, transformer_output_size)
        out = out.last_hidden_state.mean(1)
        #out = out.permute(0, 2, 3, 1)  # (batch_size, encoded_image_size, encoded_image_size, 1)
        return out

    def fine_tune(self, fine_tune=True):
        """
        Allow or prevent the computation of gradients for convolutional blocks 2 through 4 of the encoder.

        :param fine_tune: Allow?
        """
        for p in self.transformer.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.transformer.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

In [76]:
class Decoder_Classif(nn.Module):
    def __init__(self, stock_data_size, input_size, encoded_size, hidden_size, dropout=0.5):
        """
        :param encoder_dim: feature size of encoded images
        :param decoder_dim: size of decoder's RNN
        """
        super().__init__()
        self.input_size = input_size
        self.encoded_size = encoded_size 
        self.stock_data_size = stock_data_size
        self.hidden_size = hidden_size
        self.dropout = dropout

        self.encoder = torch.nn.Linear(input_size, encoded_size)

        self.network = torch.nn.Sequential(
                torch.nn.Dropout(self.dropout),
                torch.nn.Linear(encoded_size + stock_data_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, 1),
            )
        self.sigmoid = torch.nn.Sigmoid()
        #self.init_weights()

    def forward(self, input, stock_price):
        """
        Forward propagation.

        :param input: encoded speech text, the output of the Encoder
        :param stock_price: the stock price        
        :return: classification prob
        """

        encoded_output = self.encoder(input)
        merged_data = torch.cat([encoded_output, stock_price],dim=1)
        out = self.network(merged_data)
        p = self.sigmoid(out)

        return p

In [65]:
import torch

class AverageMeter(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def clip_gradient(optimizer, grad_clip):
    """
    Clips gradients computed during backpropagation to avoid explosion of gradients.

    :param optimizer: optimizer with the gradients to be clipped
    :param grad_clip: clip value
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)

def adjust_learning_rate(optimizer, shrink_factor):
    """
    Shrinks learning rate by a specified factor.

    :param optimizer: optimizer whose learning rate must be shrunk.
    :param shrink_factor: factor in interval (0, 1) to multiply learning rate with.
    """
    print("\nDECAYING learning rate.")
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))

def save_checkpoint(save_dir, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer,
                    loss, is_best):
    """
    Saves model checkpoint.
    :param save_dir: store checkpoints here
    :param epoch: epoch number
    :param epochs_since_improvement: number of epochs since last improvement in BLEU-4 score
    :param encoder: encoder model
    :param decoder: decoder model
    :param encoder_optimizer: optimizer to update encoder's weights, if fine-tuning
    :param decoder_optimizer: optimizer to update decoder's weights
    :param loss: validation loss score for this epoch
    :param is_best: is this checkpoint the best so far?
    """
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'testLoss': loss,
             'encoder': encoder,
             'decoder': decoder,
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    filename = save_dir +'checkpoint_' + 'epoch_' + str(epoch) + '_loss:{:.2f}'.format(loss) + '.pth'
    torch.save(state, filename)
    # If this checkpoint is the best so far, store a copy so it doesn't get overwritten by a worse checkpoint
    if is_best:
        filename = save_dir +'checkpoint_' + 'best.pth'
        torch.save(state, filename)

In [109]:
import time
import torch.utils.data as Data
import torch.nn as nn



grad_clip = 5.  # clip gradients at an absolute value of
print_freq = 1  # print training/validation stats every __ batches

def train_classif(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Performs one epoch's training.

    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """

    encoder.train()
    decoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    start = time.time()

    for i, data in enumerate(train_loader):
        text = data['text_token'].type(torch.IntTensor)
        stock_price = data['stock'].type(torch.FloatTensor)
        label = data['target_classif'].type(torch.FloatTensor)

        # Forward prop.
        decoder_input = encoder(text)
        label_pred = decoder(decoder_input, stock_price).reshape(-1)

        # Calculate loss
        loss = criterion(label_pred, label)

        # Back prop.
        decoder_optimizer.zero_grad()

        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # Keep track of metric
        losses.update(loss.item())
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}] [{1}/{2}]\n'
                  'Batch Time {batch_time.val:.3f}s (Average:{batch_time.avg:.3f}s)\n'
                  'Data Load Time {data_time.val:.3f}s (Average:{data_time.avg:.3f}s)\n'
                  'Loss {loss.val:.4f} (Average:{loss.avg:.4f})\n'.format(epoch, i, len(train_loader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses))


def validate(val_loader, encoder, decoder, criterion):

    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    start = time.time()

    # explicitly disable gradient calculation to avoid CUDA memory error
    with torch.no_grad():
        accuracy_accumulator = 0
        num_samples = 0
        for i, data in enumerate(val_loader):
            text = data['text_token'].type(torch.IntTensor)
            stock_price = data['stock'].type(torch.FloatTensor)
            label = data['target_classif'].type(torch.FloatTensor)

            # Forward prop.
            decoder_input = encoder(text)
            label_pred = decoder(decoder_input, stock_price).reshape(-1)

            # Calculate loss
            loss = criterion(label_pred, label)

            # Keep track of metrics
            losses.update(loss.item())
            batch_time.update(time.time() - start)

            start = time.time()
            accuracy_accumulator += ((label_pred > 0.5) == label).sum().item()
            num_samples += label.shape[0]

            print(accuracy_accumulator)
            print(num_samples)

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\n'
                        'Batch Time {batch_time.val:.3f}s (Average:{batch_time.avg:.3f}s)\n'
                        'Loss {loss.val:.4f} (Average:{loss.avg:.4f})\n'.format(i, len(val_loader), batch_time=batch_time,loss=losses))
        
        accuracy = accuracy_accumulator / num_samples

    return losses.avg, label, label_pred, accuracy


In [99]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
torch.backends.cudnn.benchmark = True

torch.manual_seed(0)
torch.cuda.manual_seed(0)


model = "distilbert-base-uncased"

data_path = 'sums.json'
stock_data_size = 20
input_size = 768
encoded_size = 20 
hidden_size = 64
dropout = 0.5

start_epoch = 1
epochs = 20  # number of epochs to train for (if early stopping is not triggered)
epochs_since_improvement = 0  # keeps track of number of epochs since there's been an improvement
batch_size = 32
workers = 1  # for data-loading; right now, only 1 works with h5py
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
grad_clip = 5.  # clip gradients at an absolute value of
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention', as in the paper
best_loss = 1.  # best loss score right now
print_freq = 8  # print training/validation stats every __ batches
fine_tune_encoder = False # fine-tune encoder?
checkpoint = None  # path to checkpoint, None if none
save_path = './checkpoint/' # checkpoint save path

#max_len = 15 # the longest sequence

In [103]:


if checkpoint is None:
    decoder = Decoder_Classif(stock_data_size=stock_data_size, input_size=input_size, encoded_size=encoded_size, hidden_size=hidden_size)
    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                        lr=decoder_lr)
    encoder = Encoder(model)
    encoder.fine_tune(fine_tune_encoder)
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                        lr=encoder_lr) if fine_tune_encoder else None

else:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint['epoch'] + 1
    epochs_since_improvement = checkpoint['epochs_since_improvement']
    best_loss = checkpoint['testLoss']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    if fine_tune_encoder is True and encoder_optimizer is None:
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=encoder_lr)

decoder = decoder.to(device)
encoder = encoder.to(device)

criterion = nn.BCEWithLogitsLoss().to(device)

tokenizer = transformers.AutoTokenizer.from_pretrained(model)
dataset = PredictDataset(data_file=data_path, test_size=0.2, max_len=512, tokenizer=tokenizer)
train_loader = Data.DataLoader(dataset.train_set(), batch_size=batch_size, shuffle=False)
val_loader = Data.DataLoader(dataset.test_set(), batch_size=batch_size, shuffle=False)



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Data Preprocess Done!
Dataset size:1254, train:1003, val:251


In [104]:
for epoch in range(start_epoch, start_epoch + epochs):

    # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
    if epochs_since_improvement == 20:
        break
    if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if fine_tune_encoder:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # One epoch's training
    train_classif(train_loader=train_loader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            encoder_optimizer=encoder_optimizer,
            decoder_optimizer=decoder_optimizer,
            epoch=epoch)

    # One epoch's validation, return the average loss of each batch in this epoch
    loss, label, label_pred, acc = validate(val_loader=val_loader,
                                encoder=encoder, decoder=decoder, criterion=criterion)
    
    print('Validation: Epoch [{0}/{1}]\n'
                        'Loss {loss:.4f}\n'
                        'Accuracy {acc:.4f}\n'.format(epoch, epochs, loss=loss, acc=acc))


    # Check if there was an improvement
    is_best = loss < best_loss
    best_loss = min(loss, best_loss)
    if not is_best:
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
    else:
        epochs_since_improvement = 0

    # Save checkpoint
    save_checkpoint(save_path, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
                    decoder_optimizer, loss, is_best)

Epoch: [1] [0/32]
Batch Time 31.924s (Average:31.924s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7337 (Average:0.7337)

Epoch: [1] [1/32]
Batch Time 25.804s (Average:28.864s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7639 (Average:0.7488)

Epoch: [1] [2/32]
Batch Time 27.389s (Average:28.372s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7566 (Average:0.7514)

Epoch: [1] [3/32]
Batch Time 26.142s (Average:27.815s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7297 (Average:0.7460)

Epoch: [1] [4/32]
Batch Time 27.230s (Average:27.698s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7648 (Average:0.7497)

Epoch: [1] [5/32]
Batch Time 26.931s (Average:27.570s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.6998 (Average:0.7414)

Epoch: [1] [6/32]
Batch Time 26.574s (Average:27.428s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7645 (Average:0.7447)

Epoch: [1] [7/32]
Batch Time 25.488s (Average:27.185s)
Data Load Time 0.000s (Average:0.000s)
Loss 0.7536 (Average:0.7458)

Epoch: [

KeyboardInterrupt: 