In [120]:
import os
import glob
import pandas as pd
import pickle
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [1]:
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel, BertSelfAttention
import pytorch_pretrained_bert.modeling as modeling
import torch
import torch.nn as nn
import numpy as np
import copy

In [2]:
def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def identity(x):
    return x


In [3]:
class BertForMultitask(BertPreTrainedModel):

    def __init__(self, config, cls_num_labels=2, tok_num_labels=2, tok2id=None):
        super(BertForMultitask, self).__init__(config)
        self.bert = BertModel(config)

        self.cls_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.cls_classifier = nn.Linear(config.hidden_size, cls_num_labels)
        
        self.tok_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.tok_classifier = nn.Linear(config.hidden_size, tok_num_labels)
        
        self.apply(self.init_bert_weights)


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, 
        labels=None, rel_ids=None, pos_ids=None, categories=None, pre_len=None):
        global ARGS
        sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)

        cls_logits = self.cls_classifier(pooled_output)
        cls_logits = self.cls_dropout(cls_logits)

        # NOTE -- dropout is after proj, which is non-standard
        tok_logits = self.tok_classifier(sequence_output)
        tok_logits = self.tok_dropout(tok_logits)

        return cls_logits, tok_logits

In [5]:
class ConcatCombine(nn.Module):
    def __init__(self, hidden_size, feature_size, out_size, layers,
            dropout_prob, small=False, pre_enrich=False, activation=False,
            include_categories=False, category_emb=False,
            add_category_emb=False):
        super(ConcatCombine, self).__init__()

        self.include_categories = include_categories
        self.add_category_emb = add_category_emb
        if include_categories:
            if category_emb and not add_category_emb:
                feature_size *= 2
            elif not category_emb:
                feature_size += 43

        if layers == 1:
            self.out = nn.Sequential(
                nn.Linear(hidden_size + feature_size, out_size),
                nn.Dropout(dropout_prob))
        elif layers == 2:
            waist_size = min(hidden_size, feature_size) if small else max(hidden_size, feature_size)
            if activation:
                self.out = nn.Sequential(
                    nn.Linear(hidden_size + feature_size, waist_size),
                    nn.Dropout(dropout_prob),
                    nn.ReLU(),
                    nn.Linear(waist_size, out_size),
                    nn.Dropout(dropout_prob))
            else:
                self.out = nn.Sequential(
                    nn.Linear(hidden_size + feature_size, waist_size),
                    nn.Dropout(dropout_prob),
                    nn.Linear(waist_size, out_size),
                    nn.Dropout(dropout_prob))
        if pre_enrich:
            if activation:
                self.enricher = nn.Sequential(
                    nn.Linear(feature_size, feature_size),
                    nn.ReLU())
            else:
                self.enricher = nn.Linear(feature_size, feature_size)
        else:
            self.enricher = None
        # manually set cuda because module doesn't see these combiners for bottom 
        if CUDA:
            self.out = self.out.cuda()
            if self.enricher: 
                self.enricher = self.enricher.cuda()
                
    def forward(self, hidden, features, categories=None):
        if self.include_categories:
            categories = categories.unsqueeze(1)
            categories = categories.repeat(1, features.shape[1], 1)
            if self.add_category_emb:
                features = features + categories
            else:
                features = torch.cat((features, categories), -1)

        if self.enricher is not None:
            features = self.enricher(features)

        return self.out(torch.cat((hidden, features), dim=-1))


In [6]:
class AddCombine(nn.Module):
    def __init__(self, hidden_dim, feat_dim, layers, dropout_prob, small=False,
            out_dim=-1, pre_enrich=False, include_categories=False,
            category_emb=False, add_category_emb=False):
        super(AddCombine, self).__init__()

        self.include_categories = include_categories
        if include_categories:
            feat_dim += 43

        if layers == 1:
            self.expand = nn.Sequential(
                nn.Linear(feat_dim, hidden_dim),
                nn.Dropout(dropout_prob))
        else:
            waist_size = min(feat_dim, hidden_dim) if small else max(feat_dim, hidden_dim)
            self.expand = nn.Sequential(
                nn.Linear(feat_dim, waist_size),
                nn.Dropout(dropout_prob),
                nn.Linear(waist_size, hidden_dim),
                nn.Dropout(dropout_prob))
        
        if out_dim > 0:
            self.out = nn.Linear(hidden_dim, out_dim)
        else:
            self.out = None

        if pre_enrich:
            self.enricher = nn.Linear(feature_size, feature_size)        
        else:
            self.enricher = None

        # manually set cuda because module doesn't see these combiners for bottom         
        if CUDA:
            self.expand = self.expand.cuda()
            if out_dim > 0:
                self.out = self.out.cuda()
            if self.enricher is not None:
                self.enricher = self.enricher.cuda()

    def forward(self, hidden, feat, categories=None):
        if self.include_categories:
            categories = categories.unsqueeze(1)
            categories = categories.repeat(1, features.shape[1], 1)
            if self.add_category_emb:
                features = features + categories
            else:
                features = torch.cat((features, categories), -1)

        if self.enricher is not None:
            feat = self.enricher(feat)
    
        combined = self.expand(feat) + hidden
    
        if self.out is not None:
            return self.out(combined)

        return combined

In [7]:
class BertForMultitaskWithFeaturesOnTop(BertPreTrainedModel):
    """ stick the features on top of the model """
    def __init__(self, config, cls_num_labels=2, tok_num_labels=2, tok2id=None):
        super(BertForMultitaskWithFeaturesOnTop, self).__init__(config)
        global ARGS
        
        self.bert = BertModel(config)
        
        self.featurizer = features.Featurizer(
            tok2id, lexicon_feature_bits=ARGS.lexicon_feature_bits) 
        # TODO -- don't hardcode this...
        nfeats = 90 if ARGS.lexicon_feature_bits == 1 else 118

        if ARGS.extra_features_method == 'concat':
            self.tok_classifier = ConcatCombine(
                config.hidden_size, nfeats, tok_num_labels, 
                ARGS.combiner_layers, config.hidden_dropout_prob,
                ARGS.small_waist, pre_enrich=ARGS.pre_enrich,
                activation=ARGS.activation_hidden,
                include_categories=ARGS.concat_categories,
                category_emb=ARGS.category_emb,
                add_category_emb=ARGS.add_category_emb)
        else:
            self.tok_classifier = AddCombine(
                config.hidden_size, nfeats, ARGS.combiner_layers,
                config.hidden_dropout_prob, ARGS.small_waist,
                out_dim=tok_num_labels, pre_enrich=ARGS.pre_enrich,
                include_categories=ARGS.concat_categories,
                category_emb=ARGS.category_emb,
                add_category_emb=ARGS.add_category_emb)

        self.cls_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.cls_classifier = nn.Linear(config.hidden_size, cls_num_labels)

        self.category_emb = ARGS.category_emb
        if ARGS.category_emb:
            self.category_embeddings = nn.Embedding(43, nfeats)

        self.apply(self.init_bert_weights)


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, 
        labels=None, rel_ids=None, pos_ids=None, categories=None, pre_len=None):
        global ARGS
        global CUDA

        features = self.featurizer.featurize_batch(
            input_ids.detach().cpu().numpy(), 
            rel_ids.detach().cpu().numpy(), 
            pos_ids.detach().cpu().numpy(), 
            padded_len=input_ids.shape[1])
        features = torch.tensor(features, dtype=torch.float)
        if CUDA:
            features = features.cuda()

        sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)

        pooled_output = self.cls_dropout(pooled_output)
        cls_logits = self.cls_classifier(pooled_output)

        if ARGS.category_emb:
            categories = self.category_embeddings(
                categories.max(-1)[1].type(
                    'torch.cuda.LongTensor' if CUDA else 'torch.LongTensor'))

        tok_logits = self.tok_classifier(sequence_output, features, categories)

        return cls_logits, tok_logits


In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BertForSequenceClassification, BertConfig

In [None]:
# Get tokenizer and fine-tuned LM (my-cc, uncased) + trained-vanilla
# model

In [14]:
PRETRAINED_MODELS_DIR = '../BERT/trained_models'
DATA_NAME = 'vanilla'
BASE_MOD = 'uncased_LM'
CASING = 'uncased'
model_path = os.path.join(PRETRAINED_MODELS_DIR,DATA_NAME,BASE_MOD,
                         CASING)
print(model_path)

../BERT/trained_models/vanilla/uncased_LM/uncased


In [44]:
DATA_DIR = '../data_creation/scripts/save/vanilla'

In [53]:
eval_dev_set = pd.read_csv(os.path.join(DATA_DIR,'dev.tsv'),
                          sep='\t',header=None)
eval_dev_set.columns = ['text','label','outlet']

In [54]:
eval_dev_set

Unnamed: 0,text,label,outlet
0,The fall-off in ice volume is so fast it is go...,0,1
1,"More warming leads to more fires, which releas...",0,0
2,"Just to drive home to people what that means, ...",0,1
3,"Gradual loss of these largely pristine, intact...",0,1
4,The worst-case scenario would be the creation ...,0,1
...,...,...,...
135,The theory that carbon dioxide is a pollutant ...,2,0
136,Global temperatures today are the same as they...,2,0
137,Carbon dioxide emissions provide the environme...,2,0
138,Young people like Thunberg are simply being us...,2,0


In [313]:
config = BertConfig.from_pretrained(model_path, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          config=config)

archive_file: ../BERT/LM_finetuned/uncased_LM_cc_output/pytorch_model.bin


In [86]:
with open (os.path.join(model_path,'vocab.txt'),'r') as f:
    vocab = f.readlines()

In [91]:
vocab = [l.strip() for l in vocab]

In [314]:
to_predict = eval_dev_set.text.values

In [315]:
out = defaultdict(list)

In [316]:
for dev_ix in range(0,2):
    sent = to_predict[dev_ix]
    print(sent)
    label = eval_dev_set.label.values[dev_ix]
    encoded_sent = tokenizer.encode(sent,add_special_tokens=True)
    out['input_ids'].append(encoded_sent)
    out['sentences'].append(sent)
    out['labels'].append(label)
    
out['input_ids'] = pad_sequences(
        out['input_ids'], 
        maxlen=128, 
        dtype="long", 
        value=0, 
        truncating="post", 
        padding="post")

# get attn masks
for sent in out['input_ids']:
    tok_type_ids = [0 for tok_id in sent]
    mask = [int(tok_id > 0) for tok_id in sent]
    out['attention_mask'].append(mask)
    out['token_type_ids'].append(tok_type_ids)
print(len(out['labels']))
print(sum(out['labels']))

The fall-off in ice volume is so fast it is going to bring us to zero very quickly.
More warming leads to more fires, which release more carbon, which causes more warming, and so on.
2
0


In [317]:
objs = [{'input_ids':torch.LongTensor([list(out['input_ids'][ix])]),
        'token_type_ids':torch.LongTensor([out['token_type_ids'][ix]]),
        'attention_mask':torch.LongTensor([out['attention_mask'][ix]])} 
        for ix in range(len(out['input_ids']))]

In [158]:
# feed processed 'out' to model for prediction

In [318]:
modeled_logits = [model(**objs[ix])[0] for ix in range(len(objs))]

In [319]:
modeled_results = [torch.softmax(x, dim=1).tolist()[0] 
                  for x in modeled_logits]

In [320]:
predicted_labels = [get_pred_label(x) for x in modeled_results]

In [321]:
predicted_labels

[0, 0]

In [258]:
import pickle
from tqdm import tqdm
import json
import torch
import pandas as pd
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from tensorboardX import SummaryWriter
import numpy as np
import time
import datetime
import random
from collections import defaultdict
import argparse
import os
import scipy
import sklearn
import math

CUDA = (torch.cuda.device_count() > 0)


parser = argparse.ArgumentParser()

# # Required parameters
# parser.add_argument(
#     "--max_seq_len",
#     default=512,
#     type=int,
#     help="max seq len"
# )
# parser.add_argument(
#     "--context_size",
#     default=0,
#     type=int,
#     help="num messages to include in context"
# )
# parser.add_argument(
#     "--working_dir",
#     default='working_dir',
#     type=str,
#     help="num messages to include in context"
# )
# parser.add_argument(
#     "--epochs",
#     default=70,
#     type=int,
#     help="fine tuning epochs"
# )
# parser.add_argument(
#     "--batch_size",
#     default=10,
#     type=int,
#     help="fine tuning epochs"
# )
# parser.add_argument(
#     "--learning_rate",
#     default=2e-5,
#     type=int,
#     help="fine tuning epochs"
# )
# parser.add_argument(
#     "--seed",
#     default=420,
#     type=int,
#     help="fine tuning epochs"
# )
# parser.add_argument(
#     "--length_discard",
#     action='store_true',
#     help="discard examples that are too long"
# )
# parser.add_argument(
#     "--include_metadata",
#     action='store_true',
#     help="discard examples that are too long"
# )
# parser.add_argument(
#     "--downsample",
#     default=0.2,
#     type=float,
#     help="p = prop examples to throw out"
# )
# ARGS = parser.parse_args()


# random.seed(ARGS.seed)
# np.random.seed(ARGS.seed)
# torch.manual_seed(ARGS.seed)
# torch.cuda.manual_seed_all(ARGS.seed)

In [322]:
CLASSES = ['for','against','neutral']
NUM_LABELS = 3

def get_pred_label(res_,to_str=False):
    if to_str:
        return CLASSES[res_.index(max(res_))]
    else:
        return res_.index(max(res_))

In [323]:
PRETRAINED_MODELS_DIR = '../BERT/trained_models'

DATA_NAME = 'mturk_windowed_1_downsampled'
BASE_MOD = 'uncased_LM'
CASING = 'uncased'
DATA_DIR = os.path.join('../data_creation/scripts/save',DATA_NAME)
print(DATA_DIR)

#model_path = os.path.join(PRETRAINED_MODELS_DIR,DATA_NAME,BASE_MOD,
#                         CASING)
model_path = '../BERT/LM_finetuned/uncased_LM_cc_output'
print(model_path)

../data_creation/scripts/save/mturk_windowed_1_downsampled
../BERT/LM_finetuned/uncased_LM_cc_output


In [500]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [324]:
SEED = 1234

In [470]:
def get_out_data(dat_path,max_seq_length=500):
    #eval_set = 'train' # can also be 'test'
    data = pd.read_csv(dat_path,
                              sep='\t',header=None)
    data.columns = ['text','label']#,'outlet']
    
    out = defaultdict(list)
    
    print('Number of examples to predict:',len(data))
    to_predict = data.text.values
    true = data.label.values
    
    for dat_ix in range(len(data)):
        sent = to_predict[dat_ix]
        #print(sent)
        label = true[dat_ix]
        encoded_sent = tokenizer.encode(sent,add_special_tokens=True)
        out['input_ids'].append(encoded_sent)
        out['sentences'].append(sent)
        out['label'].append(label)

    out['input_ids'] = pad_sequences(
            out['input_ids'], 
            maxlen=max_seq_length, 
            dtype="long", 
            value=0, 
            truncating="post", 
            padding="post")


    print('Adding attention masks...')
    # get attn masks
    for sent in out['input_ids']:
        tok_type_ids = [0 for tok_id in sent]
        mask = [int(tok_id > 0) for tok_id in sent]
        out['attention_mask'].append(mask)
        out['token_type_ids'].append(tok_type_ids)
    #print(len(out['labels']))
    #print(sum(out['labels']))
    
    print('Preparing input examples for prediction...')
    #out['input_ids'] = [torch.LongTensor(x) for x in out['input_ids']]
#     objs = [{'input_ids':torch.LongTensor([list(out['input_ids'][ix])]),
#         'token_type_ids':torch.LongTensor([out['token_type_ids'][ix]]),
#         'attention_mask':torch.LongTensor([out['attention_mask'][ix]]),
#             'label':torch.LongTensor([out['label'][ix]])} 
#         for ix in range(len(out['input_ids']))]

#     objs = [{'input_ids':[list(out['input_ids'][ix])],
#         'token_type_ids':[out['token_type_ids'][ix]],
#         'attention_mask':[out['attention_mask'][ix]],
#             'label':[out['label'][ix]]} 
#         for ix in range(len(out['input_ids']))]
    
    return out

In [326]:
WORKING_DIR = '.'
writer = SummaryWriter(WORKING_DIR + '/events')

In [264]:
# Load model
config = BertConfig.from_pretrained(model_path, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          config=config)
with open(os.path.join(model_path,'vocab.txt'),'r') as f:
    vocab = f.readlines()
vocab = [l.strip() for l in vocab]

archive_file: ../BERT/LM_finetuned/uncased_LM_cc_output/pytorch_model.bin


In [471]:
# if os.path.exists(WORKING_DIR + "/data.cache.pkl"):
#     data = pickle.load(open(WORKING_DIR + "/data.cache.pkl", 'rb'))
# else:
data = get_out_data(os.path.join(DATA_DIR, 'train.tsv'))
pickle.dump(data, open(WORKING_DIR + "/data.cache.pkl", 'wb'))

Number of examples to predict: 823
Adding attention masks...
Preparing input examples for prediction...


In [472]:
len(data)

5

In [477]:
train_inputs, test_inputs, train_labels, test_labels, train_masks, test_masks, = train_test_split(
    data['input_ids'], data['label'], data['attention_mask'],
    random_state=SEED, test_size=0.1)

In [493]:
def build_dataloader(*args, sampler='random'):
    #print(args[:2])
    data = (torch.tensor(x) for x in args)
    #print(data[0])
    data = TensorDataset(*data)

    #sampler = RandomSampler(data) if sampler == 'random' else SequentialSampler(data)
    dataloader = DataLoader(data, batch_size=1)

    return dataloader

In [504]:
train_dataloader = build_dataloader(
    train_inputs[:100], train_labels[:100], train_masks[:100])
test_dataloader = build_dataloader(
    test_inputs, test_labels, test_masks,
    sampler='order')

In [505]:
LEARN_RATE=2e-5
optimizer = AdamW(model.parameters(), lr=LEARN_RATE, eps=1e-8)

In [506]:
NUM_EPOCHS=1
total_steps = len(train_dataloader) * NUM_EPOCHS

In [507]:
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [508]:
if CUDA:
    model = model.cuda()

In [509]:
CUDA

False

In [510]:
len(train_dataloader)

100

In [513]:
for epoch_i in range(0, NUM_EPOCHS):
    
#     # ========================================
#     #               Training
#     # ========================================
#     print("")
#     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, NUM_EPOCHS))
#     print('Training...')

#     losses = []
#     t0 = time.time()
#     model.train()
#     for step, batch in enumerate(train_dataloader):
#         #print(step,batch)

#         if step % 40 == 0 and not step == 0:
#             elapsed = format_time(time.time() - t0)
#             print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.2f}'.format(
#                 step, len(train_dataloader), elapsed, float(np.mean(losses))))

#         if CUDA:
#             batch = (x.cuda() for x in batch)            
#         input_ids, labels, masks = batch
#         model.zero_grad()        

#         outputs = model(
#             input_ids,
#             attention_mask=masks, 
#             labels=labels)
        
#         print(len(outputs))
        
#         #loss, _, _ = outputs
#         loss, _ = outputs
#         losses.append(loss.item())

#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()

#     avg_loss = np.mean(losses)
#     writer.add_scalar('train/loss', np.mean(avg_loss), epoch_i)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

#     t0 = time.time()
#     model.eval()
#     losses = []
#     all_preds = []
#     all_labels = []
#     log = open(WORKING_DIR + '/epoch%d.log' % epoch_i, 'w')
#     for step, batch in enumerate(test_dataloader):

#         if CUDA:
#             batch = (x.cuda() for x in batch)            
#         input_ids, labels, masks = batch

#         with torch.no_grad():        
#             outputs = model(
#                 input_ids,
#                 attention_mask=masks, 
#                 labels=labels)
#         #loss, logits, attns = outputs
#         loss, logits = outputs

#         losses.append(loss.item())

#         labels = labels.cpu().numpy()
#         input_ids = input_ids.cpu().numpy()
#         preds = scipy.special.softmax(logits.cpu().numpy(), axis=1)
#         input_toks = [
#             tokenizer.convert_ids_to_tokens(s) for s in input_ids
#         ]

#         for seq, label, pred in zip(input_toks, labels, preds):
#             sep_char = '+' if np.argmax(pred) == label else '-'
#             log.write(sep_char * 40 + '\n')
#             log.write(' '.join(seq) + '\n')
#             log.write('label: ' + str(label) + '\n')
#             log.write('pred: ' + str(np.argmax(pred)) + '\n')
#             log.write('dist: ' + str(pred) + '\n')
#             log.write('\n\n')

#             all_preds += [pred]
#             all_labels += [label]
#     log.close()
#     all_preds = np.array(all_preds)
#     all_labels = np.array(all_labels)

    avg_loss = np.mean(losses)
    f1 = sklearn.metrics.f1_score(all_labels, np.argmax(all_preds, axis=1),average='macro')
    acc = sklearn.metrics.accuracy_score(all_labels, np.argmax(all_preds, axis=1))
    #auc = sklearn.metrics.roc_auc_score(all_labels, all_preds[:, 1])

    writer.add_scalar('eval/acc', acc, epoch_i)
    #writer.add_scalar('eval/auc', auc, epoch_i)
    writer.add_scalar('eval/f1', f1, epoch_i)
    writer.add_scalar('eval/loss', f1, epoch_i)

    print("  Loss: {0:.2f}".format(avg_loss))
    print("  Accuracy: {0:.2f}".format(acc))
    print("  F1: {0:.2f}".format(f1))
    #print("  AUC: {0:.2f}".format(auc))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Done!")


  Average training loss: 0.60
  Training epcoh took: 0:05:23

Running Validation...
  Loss: 0.60
  Accuracy: 0.42
  F1: 0.26
  Validation took: 0:05:23

Done!
