In [7]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import time

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [2]:
torch.manual_seed(3)
np.random.seed(3)
torch.cuda.manual_seed_all(3)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
gme_df = pd.read_csv('tokenized_and_tagged_gme_coarse_grained.csv', sep='\t', keep_default_na=False)
gme_df

Unnamed: 0,ix,token,is_modal,is_prej,modal_type,sentence_number,span
0,,,,,,,
1,#,Sent_number =,0,,,,
2,#,sentence_text =,"The American Department of State , in its annu...",,,,
3,#,modal_count =,0,,,,
4,#,source_document =,02.54-18922,,,,
5,138,The,O,_,_,0,_
6,139,American,O,_,_,0,_
7,140,Department,O,_,_,0,_
8,141,of,O,_,_,0,_
9,142,State,O,_,_,0,_


In [4]:
def split_to_train_and_test_sets(df, train_size: float):
    sent_numbers = df['sentence_number'].unique()
    train_sents, test_sents = sent_numbers[1:(int(len(sent_numbers)*train_size))], sent_numbers[(int(len(sent_numbers)*train_size)):]
    train_set, test_set = df[df['sentence_number'].isin(train_sents)], df[df['sentence_number'].isin(test_sents)]
    
    return train_set, test_set

In [5]:
train_df, test_df = split_to_train_and_test_sets(gme_df, 0.8)

In [29]:
class SentenceGetter(object):
    def __init__(self, dataframe, max_sent=None):
        self.df = dataframe
        self.tags = self.df['is_modal'].unique().tolist()
        self.tags.insert(0,'PAD')
        
        self.index = 0
        self.max_sent = max_sent
        self.tokens = dataframe['token']
        self.modal_tags = dataframe['is_modal']
        
    
          
    def get_tokens_and_tags_by_sentences(self):
        sent = []
        counter = 0
        
        for token,tag in zip(self.tokens, self.modal_tags):
            sent.append((token, tag))
            if token.strip() in ['.', '?', '!']:
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

    def get_tag2idx(self):
        return {tag:idx for idx, tag in enumerate(self.tags)}
            
    def get_idx2tag(self):
        return {idx:tag for idx, tag in enumerate(self.tags)}
    
    def get_2Dlist_of_sentences(self):
        return [[token for token, tag in sent] for sent in self.get_tokens_and_tags_by_sentences()]
    
    def get_2Dlist_of_tags(self):
        return [[tag for token, tag in sent] for sent in self.get_tokens_and_tags_by_sentences()]
        
        
# train_getter = SentenceGetter(train_df)
# test_getter = SentenceGetter(test_df)

# train_sentences = train_getter.get_2Dlist_of_sentences()
# train_tags = train_getter.get_2Dlist_of_tags()

# test_sentences = test_getter.get_2Dlist_of_sentences()
# test_tags = test_getter.get_2Dlist_of_tags()


In [28]:
len(train_sentences), len(test_sentence)

(8839, 2210)

In [45]:
class BertTrainer(object):
        
    MAX_LEN = 150
    bs = 32
    
    def __init__(self, train_df, test_df, pre_trained='bert-base-cased'):
        self.train_df = train_df
        self.test_df = test_df
        self.train_getter = SentenceGetter(self.train_df)
        self.test_getter = SentenceGetter(self.test_df)
#         self.train_sentence = self.train_getter.get_2Dlist_of_sentences()
#         self.train_tags = self.get_2Dlist_of_tags()
        
        self.device, self.n_gpu = self.set_cuda()
        self.tokenizer = BertTokenizer.from_pretrained(pre_trained, do_lower_case=False)
        

        
        
    def set_cuda(self):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
        return device, n_gpu
    
    def tokenize(self, sentences, orig_labels):    
        tokenized_texts = []
        labels = []
        sents, tags_li = [], []
        for sent, sent_labels in zip(sentences, orig_labels):
            bert_tokens = []
            bert_labels = []
            for orig_token, orig_label in zip(sent, sent_labels):
                b_tokens = self.tokenizer.tokenize(orig_token)
                bert_tokens.extend(b_tokens)
                for b_token in b_tokens:
                    bert_labels.append(orig_label)
            tokenized_texts.append(bert_tokens)
            labels.append(bert_labels)
            assert len(bert_tokens) == len(bert_labels)
        return tokenized_texts, labels


    def pad_sentences_and_labels(self, tokenized_texts, labels):
        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                  maxlen = MAX_LEN, dtype = "int", truncating = "post", padding = "post")
        tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], 
                             maxlen = MAX_LEN, value = tag2idx['PAD'], padding = "post",
                            dtype = "int", truncating = "post")
        attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
        return input_ids, tags, attention_masks

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [43]:
bert = BertTrainer(train_df, test_df, pre_trained='bert-base-cased')

train_sentences, train_tags = bert.train_getter.get_2Dlist_of_sentences(), bert.train_getter.get_2Dlist_of_tags()
tag2idx = {**bert.train_getter.get_tag2idx(), **bert.test_getter.get_tag2idx()}
idx2tag = {**bert.train_getter.get_idx2tag(), **bert.test_getter.get_idx2tag()}


train_tokenized_texts, train_tokenized_labels = bert.tokenize(train_sentence, train_tags)
input_ids, tags, attention_masks = bert.pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)

AttributeError: 'BertTrainer' object has no attribute 'MAX_LEN'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
if device == 'cuda':
    print("Name of gpu: " + torch.cuda.get_device_name(0))

In [None]:
tag2idx = {**train_getter.get_tag2idx(), **test_getter.get_tag2idx()}
idx2tag = {**train_getter.get_idx2tag(), **test_getter.get_idx2tag()}

In [None]:
idx2tag

In [None]:
# tokenizer = BertTokenizer.from_pretrained('../resources/wwm_cased_L-24_H-1024_A-16/')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)


In [None]:
MAX_LEN = 150
bs = 32

In [None]:
def tokenize(sentences, orig_labels):    
    tokenized_texts = []
    labels = []
    sents, tags_li = [], []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)
        assert len(bert_tokens) == len(bert_labels)
    return tokenized_texts, labels


def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen = MAX_LEN, dtype = "int", truncating = "post", padding = "post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], 
                         maxlen = MAX_LEN, value = tag2idx['PAD'], padding = "post",
                        dtype = "int", truncating = "post")
    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks


train_tokenized_texts, train_tokenized_labels = tokenize(train_sentence, train_tags)
input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)

In [None]:
tr_inputs = torch.tensor(input_ids, dtype=torch.long)
tr_tags = torch.tensor(tags, dtype=torch.long)
tr_masks = torch.tensor(attention_masks, dtype=torch.long)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)

In [None]:
# model = BertForTokenClassification.from_pretrained('../resources/wwm_cased_L-24_H-1024_A-16/', num_labels=len(tag2idx))
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag2idx))
model.cuda()
FULL_FINETUNING = True

In [None]:
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [None]:
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [None]:
epochs = 15
max_grad_norm = 1.0

In [None]:
for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))