<a href="https://colab.research.google.com/github/Rseiji/ChessCommentaryGeneration/blob/master/BERT_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch

!pip install transformers
!pip install wget

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

from google.colab import drive

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |▍                               | 10kB 19.7MB/s eta 0:00:01[K     |▊                               | 20kB 6.5MB/s eta 0:00:01[K     |█▏                              | 30kB 6.9MB/s eta 0:00:01[K     |█▌                              | 40kB 7.9MB/s eta 0:00:01[K     |█▉                              | 51kB 6.8MB/s eta 0:00:01[K     |██▎                             | 61kB 7.3MB/s eta 0:00:01[K     |██▋                             | 71kB 8.3MB/s eta 0:00:01[K     |███                             | 81kB 8.8MB/s eta 0:00:01[K     |███▍                            | 92kB 8.2MB/s eta 0:00:01[K     |███▊                            | 102kB 8.9MB/s eta 0:00:01[K     |████                            | 112kB 8.9MB/s eta 0:00:01[K     |████▌                           | 122kB 8.9M

In [None]:
## GPU google colab

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the expected GPU device name aborts the cell.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Choose the PyTorch device that later cells refer to as `device`.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## Mounting Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


# Code

In [None]:
# Reading input data
# NOTE(review): `file_path` must be assigned before this cell runs. No assignment
# is visible in this part of the notebook (only the commented-out placeholder
# below), so on a fresh kernel `pd.read_csv(file_path)` raises NameError.
# file_path = ..... insert the file path here .....
df = pd.read_csv(file_path)
# Texts come from the 'sentences' column and targets from the 'label' column.
sentences = df['sentences'].tolist()
labels = df['label'].tolist()

# Load the uncased BERT-base tokenizer with lower-casing enabled. The helper
# functions in later cells read it through the global name `tokenizer`.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



In [None]:
def get_phrases_maximun_len(sentences, default_max_len=None):
  """Return the sequence length every sentence will be padded/truncated to.

  A truthy `default_max_len` is returned unchanged and nothing is tokenized.
  Otherwise each sentence is encoded with the global `tokenizer` (special
  tokens included); the largest number of token IDs found is printed and
  returned. The length is therefore measured in token IDs, not in words.
  An empty `sentences` yields 0.
  """
  if not default_max_len:
    # Encoded length of each sentence, `[CLS]` and `[SEP]` included.
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in sentences
    )
    longest = max(token_counts, default=0)
    print('Max sentence length: ', longest)
    return longest

  return default_max_len

In [None]:
def tokenize_sentences(sentences, max_length):
  """Encode sentences into fixed-length BERT input tensors.

  Each sentence is passed to the global `tokenizer`'s `encode_plus`, which:
    (1) tokenizes the sentence,
    (2) prepends the `[CLS]` token and appends the `[SEP]` token,
    (3) maps tokens to their IDs,
    (4) pads or truncates the result to `max_length`,
    (5) builds the attention mask that separates padding from real tokens.

  Args:
    sentences: iterable of strings to encode.
    max_length: number of token IDs every encoded sentence is padded or
      truncated to.

  Returns:
    Tuple `(input_ids, attention_masks)`: the per-sentence tensors
    concatenated along dimension 0, in the order of `sentences`.
  """
  input_ids = []
  attention_masks = []

  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=max_length,         # Use the caller's length, not a fixed constant.
        padding='max_length',          # Pad short sentences up to `max_length`.
        truncation=True,               # Cut long sentences down to `max_length`.
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # Keep the encoded sentence and its attention mask side by side.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  return (input_ids, attention_masks)

In [None]:
def make_dataset(sentences, labels, max_phrase_len=None):
  """Turn raw sentences and labels into the tensors a dataset is built from.

  Args:
    sentences: strings to encode.
    labels: label values, converted to a tensor with `torch.tensor`.
    max_phrase_len: optional fixed sequence length. When omitted (or falsy),
      `get_phrases_maximun_len` derives it from `sentences`.

  Returns:
    Tuple `(input_ids, attention_masks, labels)`, in the same order as the
    parameters of `cross_validation`.
  """
  max_phrase_len = get_phrases_maximun_len(sentences, max_phrase_len)
  input_ids, attention_masks = tokenize_sentences(sentences, max_phrase_len)
  labels = torch.tensor(labels)
  # Hand the tensors back to the caller; without this the work above is lost.
  return input_ids, attention_masks, labels

In [None]:
def cross_validation(input_ids, attention_masks, labels):
  """Randomly split the encoded data into training and validation subsets.

  The three tensors are wrapped in a single `TensorDataset`. 90% of the
  samples (rounded down) go to training and the remainder to validation;
  both sizes are printed.

  Returns:
    Tuple `(train_dataset, val_dataset)` as produced by `random_split`.
  """
  full_dataset = TensorDataset(input_ids, attention_masks, labels)

  # 90-10 split: validation takes whatever the rounded-down share leaves over.
  n_total = len(full_dataset)
  n_train = int(0.9 * n_total)
  n_val = n_total - n_train

  # Assign samples to the two subsets at random.
  subsets = random_split(full_dataset, [n_train, n_val])

  print('{:>5,} training samples'.format(n_train))
  print('{:>5,} validation samples'.format(n_val))

  train_dataset, val_dataset = subsets
  return train_dataset, val_dataset

In [None]:
def data_loader(batch_size, train_dataset, val_dataset):
  """Build the training and validation `DataLoader`s.

  Args:
    batch_size: number of samples per batch, used by both loaders. For
      fine-tuning BERT on a specific task, the authors recommend a batch
      size of 16 or 32.
    train_dataset: dataset whose samples are drawn in random order.
    val_dataset: dataset whose samples are read sequentially.

  Returns:
    Tuple `(train_dataloader, validation_dataloader)`.
  """
  # We'll take training samples in random order.
  train_dataloader = DataLoader(
      train_dataset,                         # The training samples.
      sampler=RandomSampler(train_dataset),  # Select batches randomly.
      batch_size=batch_size,                 # Trains with the caller's batch size.
  )

  # For validation the order doesn't matter, so we'll just read them sequentially.
  validation_dataloader = DataLoader(
      val_dataset,                             # The validation samples.
      sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
      batch_size=batch_size,                   # Evaluate with the caller's batch size.
  )
  return train_dataloader, validation_dataloader

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the `device` chosen in the setup cell (GPU when available,
# CPU otherwise). Unlike `model.cuda()`, this does not fail on a CPU-only runtime.
model.to(device)

In [None]:
# Idea:
# 1- Dataset: filter comments
# 2- Engine: generate labels
# 3- BERT: setup
# 4- BERT: define interface (last layer)
# 5- BERT: define