In [34]:
# Set up GPU

import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [35]:
#Install transformers
!pip install transformers



In [36]:
#download and set-up IMDB data

import tensorflow_datasets as tfds
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews', 
          split = (tfds.Split.TRAIN, tfds.Split.TEST),
          as_supervised=True,
          with_info=True)
print('info', ds_info)

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load dataset info from /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0
INFO:absl:Reusing dataset imdb_reviews (/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split (Split('train'), Split('test')), from /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


info tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning

In [37]:
#Let's look at some reviews and their labels
#Train and test datasets are split 50:50 and the examples are in the form of (label, text)

for review, label in tfds.as_numpy(ds_train.take(3)):
    print('review:', review.decode()[0:150], label)


review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be the 0
review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable  0
review: Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem 0


In [38]:
#Apply BERT tokenizer on all the examples - this can be done using encode_plus function.
import time
t0 = time.time()

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label

# def convert_example_to_feature(decoded_reviews):
#   bert_input = tokenizer.encode_plus(
#                         decoded_reviews,                      
#                         add_special_tokens = True, # add [CLS], [SEP]
#                         max_length = 192, # max length of the text that can go to BERT
#                         pad_to_max_length = True, # add [PAD] tokens
#                         return_attention_mask = True, # add attention mask to not focus on pad tokens
#               )
#   return bert_input

def convert_example_to_feature(review):
  # combine step for tokenization, WordPiece vector mapping and will add also special tokens and truncate reviews longer than our max length
  return tokenizer.encode_plus(review, 
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = 128, # max length of the text that can go to BERT
                pad_to_max_length = True, # add [PAD] tokens
                return_attention_mask = True,# add attention mask to not focus on pad tokens
                truncation = True 
              )
  
def encode_examples(ds, limit=-1):
  # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []
  if (limit > 0):
      ds = ds.take(limit)
    
  for review, label in tfds.as_numpy(ds):
    bert_input = convert_example_to_feature(review.decode())
  
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


format_time(time.time() - t0)

'0:00:00'

In [39]:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

t0 = time.time()
batch_size = 32

# train dataset
ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(batch_size)
# test dataset
ds_test_encoded = encode_examples(ds_test).batch(batch_size)
format_time(time.time() - t0)

'0:05:27'

In [40]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])



All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
import warnings
warnings.filterwarnings('ignore')

bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded)




























In [42]:
from google.colab import drive
drive.mount('/content/drive')

!ls "/content/drive/My Drive/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 EBSCO	   FoundryShowcase.png	 'MIDS Resources'  'Shared Items'   W201   W266
 Finance  'Getting started.pdf'   Presentations     W200	    W207


In [43]:
#load the financial phrasebank
import itertools
import random
s2 = []
l2 = []


with open('/content/drive/My Drive/W266/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt', 'r', encoding='cp1252') as f:  
  lines = f.readlines()
  random.shuffle(lines)
  for line in lines:
    line_split = line.split('@')
    if line_split[-1][:-1] != 'neutral':
      s2.append(line_split[:-1])
      l2.append(line_split[-1][:-1])    

sentences = list(itertools.chain(*s2))
labels = []

for label in l2:
  if label == 'positive':
    labels.append(1)
  elif label == 'negative':
    labels.append(0)



print('len of senteces:',len(sentences))
print('len of labels:',len(labels))

len of senteces: 873
len of labels: 873


In [44]:
#Truncate sentences and encode

max_len = 150

# For every sentence...
for sent in sentences:

    #let's truncate sequneces to maxlen 
    if len(sent) > max_len:
      sent = sent[:max_len]

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)


def encode_sentences(sentences,label, limit=-1):
  # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = list(label)
  if (limit > 0):
      ds = ds.take(limit)
    
  for sent in sentences:
    bert_input = convert_example_to_feature(sent)
  
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
  #  label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)



#Financial Phrasebank
train_sentences_length = int(0.8*len(sentences))
val_sentences_length = int(0.1*len(sentences))
test_sentences_length = len(sentences) - train_sentences_length-val_sentences_length

#  tf.contrib.training.stratified_sample([data], label, target_probs)


x_train = sentences[:train_sentences_length]
y_train = labels[:train_sentences_length]

x_val = sentences[train_sentences_length:train_sentences_length+val_sentences_length]
y_val = labels[train_sentences_length:train_sentences_length+val_sentences_length]

x_test = sentences[-test_sentences_length:]
y_test = labels[-test_sentences_length:]

x_total = sentences
y_total = labels

print("Length of train, val and test: ", len(x_train),len(x_val),len(x_test))
print("Length of train, val and test: ", train_sentences_length,val_sentences_length,test_sentences_length )

print("Length of total dataset = ", train_sentences_length+val_sentences_length+test_sentences_length)


#Train dataset
fin_train_encoded = encode_sentences(x_train,y_train).shuffle(100).batch(batch_size)

#Val dataset
fin_val_encoded = encode_sentences(x_val, y_val).batch(batch_size)

# test dataset
fin_test_encoded = encode_sentences(x_test,y_test).batch(batch_size)

#Toal dataset 
fin_total_encoded = encode_sentences(x_total,y_total).batch(batch_size)


# x_train[0]

Max sentence length:  150
Length of train, val and test:  698 87 88
Length of train, val and test:  698 87 88
Length of total dataset =  873


In [45]:
import numpy as np
from sklearn.utils import class_weight
class_weights_test = class_weight.compute_class_weight('balanced', np.unique(y_test), y_test)
class_weights_train = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)


print("Train:",class_weights_train)
print("Test:",class_weights_test)


Train: [1.41869919 0.77212389]
Test: [1.62962963 0.72131148]


In [46]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

class_weight_dict = {0: class_weights[0],
                1: class_weights[1]
               }

bert_history2 = model.fit(fin_train_encoded, class_weight=class_weight_dict , epochs=3, validation_data=fin_val_encoded)

Epoch 1/3
















Epoch 2/3
Epoch 3/3


In [47]:
print("Train Accuracy(%): {:.2f}".format(model.evaluate(fin_train_encoded, batch_size=None, verbose=1)[1]*100))
print("Val Accuracy(%): {:.2f}".format(model.evaluate(fin_val_encoded, batch_size=None, verbose=1)[1]*100))
print("Test Accuracy(%): {:.2f}".format(model.evaluate(fin_test_encoded, batch_size=None, verbose=1)[1]*100))
print("TOTAL Accuracy(%): {:.2f}".format(model.evaluate(fin_total_encoded, batch_size=None, verbose=1)[1]*100))


Train Accuracy(%): 99.14
Val Accuracy(%): 96.55
Test Accuracy(%): 95.45
TOTAL Accuracy(%): 98.51
