# Handle data

## Import data

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# The three input files (data.json, label.csv and categories_string.csv) must be
# uploaded to the working directory before this cell is run.
df = pd.read_json(path_or_buf="data.json")
label = pd.read_csv(filepath_or_buffer="label.csv")
category = pd.read_csv(filepath_or_buffer="categories_string.csv")
# Preview the first ten rows of the descriptions table.
print(df.head(10))

    Id                                        description gender
0    0   She is also a Ronald D. Asmus Policy Entrepre...      F
1    1   He is a member of the AICPA and WICPA. Brent ...      M
2    2   Dr. Aster has held teaching and research posi...      M
4    3   He runs a boutique design studio attending cl...      M
5    4   He focuses on cloud security, identity and ac...      M
7    5   He is author of several books, including the ...      M
8    6   As an associate Web producer for WFIU, Liz ma...      F
9    7   He holds a Journalism Master’s degree from Ro...      M
10   8   Her teachings get straight to the heart of Ta...      F
12   9   For more quips and tips, refer to her blog, “...      F


## Separate data into train/test and validation set

In [None]:
# Draw two disjoint subsets of 80 000 samples each from the descriptions and
# their categories: one used for training, one kept aside for testing.
trainingXSet, testingXSet, trainingYSet, testingYSet = train_test_split(
    df.description,
    label.Category,
    train_size=80000,
    test_size=80000,
    random_state=2018,
)

# BERT

# BERT Functions

### Data pre-processing function

In [None]:
# tokenize
def create_inputs_ids(x_set,tokenizer,MAX_LEN = 128):
  """Convert raw texts into a fixed-width matrix of token ids.

  Each text is tokenized, cut so that it fits in ``MAX_LEN`` together with its
  two special tokens, wrapped as ``[CLS] ... [SEP]``, mapped to ids and
  right-padded with 0.

  Truncation happens on the tokens, before the special tokens are added, so a
  long text still ends with ``[SEP]`` and the tokenizer never receives an
  over-long sequence to convert.

  Args:
    x_set: iterable of strings.
    tokenizer: object exposing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    MAX_LEN: width of every returned row.

  Returns:
    ``numpy.ndarray`` of shape ``(len(x_set), MAX_LEN)`` and dtype ``int64``.
  """
  # Room left for real tokens once [CLS] and [SEP] are accounted for
  max_tokens = max(MAX_LEN - 2, 0)
  sequences = []
  for text in x_set:
    tokens = tokenizer.tokenize(text)[:max_tokens]
    ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
    # Final slice only matters for MAX_LEN < 2
    sequences.append(ids[:MAX_LEN])

  # Right-pad with zeros; zero ids are what the attention mask treats as padding
  input_ids = np.zeros((len(sequences), MAX_LEN), dtype=np.int64)
  for row, ids in enumerate(sequences):
    input_ids[row, :len(ids)] = ids
  return input_ids

In [None]:
def create_attention_masks(input_ids):
  """Build one attention mask per sequence of token ids.

  A position is marked ``1.0`` when its id is strictly positive and ``0.0``
  otherwise, so ids equal to 0 are masked out.

  Returns a list holding one list of floats per input sequence.
  """
  return [[float(token_id > 0) for token_id in sequence] for sequence in input_ids]

In [None]:
# create dataloader
def create_dataloader(inputs,labels,masks,batch_size = 32,shuffle = True):
  """Wrap token ids, attention masks and labels into a ``DataLoader``.

  Args:
    inputs: token ids, one row per example.
    labels: one label per example; a pandas Series, numpy array or list.
    masks: attention masks, same layout as ``inputs``.
    batch_size: number of examples per batch.
    shuffle: when True (default, the historical behaviour) batches are drawn
      in random order; pass False to keep the input order, e.g. for evaluation.

  Returns:
    A ``DataLoader`` whose batches are ``(inputs, masks, labels)`` tensors.
  """
  tsr_inputs = torch.tensor(inputs)
  # np.asarray handles a pandas Series as well as plain lists / arrays
  tsr_labels = torch.tensor(np.asarray(labels))
  tsr_masks = torch.tensor(masks)
  data = TensorDataset(tsr_inputs, tsr_masks, tsr_labels)
  sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
  return DataLoader(data, sampler=sampler, batch_size=batch_size)

### BERT model creation

In [None]:
def model_creation(nb_labels):
  """Load the pretrained ``bert-base-uncased`` classifier with ``nb_labels`` outputs.

  The model is moved to the GPU when one is available and stays on the CPU
  otherwise, instead of failing in ``.cuda()`` on a CPU-only runtime.

  Args:
    nb_labels: number of target classes, forwarded as ``num_labels``.

  Returns:
    The ``BertForSequenceClassification`` instance.
  """
  # BertForSequenceClassification: BERT with a single classification layer at the end
  model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=nb_labels)
  model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
  return model

### BERT Training

In [None]:
def model_training(model,training_dataloader,validation_dataloader,epochs = 10):
  """Fine-tune ``model`` and report loss / validation accuracy after every epoch.

  Uses the notebook-level names ``device``, ``torch``, ``np``, ``BertAdam``,
  ``trange``, ``plt`` and ``files``.

  Args:
    model: called with ``labels=...`` it must return the loss, and without
      labels the logits.
    training_dataloader: yields ``(input_ids, attention_mask, labels)`` batches
      used for the optimisation steps; must support ``len()``.
    validation_dataloader: yields batches of the same layout, used only for
      the accuracy computed at the end of each epoch.
    epochs: number of passes over ``training_dataloader``.

  Returns:
    The same ``model`` object, whose parameters were updated in place.

  Side effects:
    Prints the metrics, plots the per-batch training loss, writes the metrics
    to ``logs.txt`` and asks Colab to download that file.
  """
  # BERT fine-tuning parameters
  param_optimizer = list(model.named_parameters())
  # Biases and LayerNorm parameters are excluded from weight decay.
  # 'gamma' / 'beta' are kept for checkpoints that still use those names.
  no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
  # The per-group key must be 'weight_decay': that is the option BertAdam reads.
  # With any other key the optimizer default is applied to both groups.
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
       'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
       'weight_decay': 0.0}
  ]
  # The warmup schedule is only applied when the total number of optimisation
  # steps is known; -1 keeps the "no schedule" behaviour when nothing will run.
  total_steps = len(training_dataloader) * epochs
  optimizer = BertAdam(optimizer_grouped_parameters,
                      lr=2e-5,
                      warmup=.1,
                      t_total=total_steps if total_steps > 0 else -1)

  # Per-batch losses, kept for the final plot
  train_loss_set = []
  # Lines written to logs.txt
  analysis = []

  # BERT training loop
  for _ in trange(epochs, desc="Epoch"):

    ## TRAINING
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    # Iterate over the dataloader received as argument (not a notebook global)
    for batch in training_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch
      # Clear out the gradients (by default they accumulate)
      optimizer.zero_grad()
      # Forward pass: with labels the model returns the loss
      loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      batch_loss = loss.item()
      train_loss_set.append(batch_loss)
      # Backward pass, then parameter update
      loss.backward()
      optimizer.step()
      tr_loss += batch_loss
      nb_tr_steps += 1
    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader
    train_line = "Train loss: {}".format(tr_loss/max(nb_tr_steps, 1))
    analysis.append(train_line)
    print(train_line)

    ## VALIDATION
    model.eval()
    # Count correct predictions over all examples so that a smaller last batch
    # does not weigh as much as a full one
    nb_correct, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch
      # No gradients needed for validation: saves memory and time
      with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy().flatten()
      pred_ids = np.argmax(logits, axis=1).flatten()
      nb_correct += int(np.sum(pred_ids == label_ids))
      nb_eval_examples += len(label_ids)
    validation_line = "Validation Accuracy: {}".format(nb_correct/max(nb_eval_examples, 1))
    analysis.append(validation_line)
    print(validation_line)

  # plot training performance
  plt.figure(figsize=(15,8))
  plt.title("Training loss")
  plt.xlabel("Batch")
  plt.ylabel("Loss")
  plt.plot(train_loss_set)
  plt.show()
  with open("logs.txt","w") as f :
    for line in analysis:
      f.write(line+"\n")
  files.download("logs.txt")
  return model

### BERT Evaluation

In [None]:
def evaluation_model(model,test_dataloader) :
  """Evaluate ``model`` on ``test_dataloader`` and return a one-line report.

  The report contains the classification accuracy (share of exactly matching
  predictions) and the Matthews correlation coefficient, both computed once
  over the whole set. Uses the notebook-level names ``device``, ``torch`` and
  ``np``.

  Args:
    model: called without labels it must return the logits of a batch.
    test_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.

  Returns:
    The report string, which is also printed.

  Raises:
    ValueError: if ``test_dataloader`` yields no batch.
  """
  # Kept as a local import: only this function needs it
  from sklearn.metrics import matthews_corrcoef

  # Put model in evaluation mode
  model.eval()
  predictions , true_labels = [], []
  for batch in test_dataloader:
    # Add batch to the device used by the model
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients needed for prediction: saves memory and time
    with torch.no_grad():
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    # Keep only the predicted class of each example, on CPU
    predictions.append(np.argmax(logits.detach().cpu().numpy(), axis=1).flatten())
    true_labels.append(b_labels.to('cpu').numpy().flatten())

  if not predictions:
    raise ValueError("test_dataloader yielded no batch, nothing to evaluate")

  flat_predictions = np.concatenate(predictions)
  flat_true_labels = np.concatenate(true_labels)
  accuracy = float(np.mean(flat_predictions == flat_true_labels))
  matthews = matthews_corrcoef(flat_true_labels, flat_predictions)
  # The figure reported as "accuracy" is now a real accuracy; the Matthews
  # coefficient that used to be printed under that label is reported next to it
  report = ('Classification accuracy using BERT Fine Tuning: {0:0.2%} '
            '(Matthews correlation coefficient: {1:0.4f})').format(accuracy, matthews)
  print(report)
  return report

### Save BERT

In [None]:
# Download the fine-tuned files to the local computer, so that hours of work
# are not lost when the Google Colab runtime resets.
def save_model_directory(model,tokenizer) :
  """Save weights, configuration and vocabulary under ``./models/`` and download them.

  Args:
    model: fine-tuned model; when it is wrapped (it has a ``module``
      attribute, as with DataParallel / DistributedDataParallel) only the
      wrapped model is saved.
    tokenizer: tokenizer whose vocabulary is saved with ``save_vocabulary``.
  """
  output_dir = "./models/"
  # Plain Python instead of the `%mkdir` magic: valid outside IPython and
  # silent when the directory already exists
  os.makedirs(output_dir, exist_ok=True)

  # If we have a distributed model, save only the encapsulated model
  model_to_save = model.module if hasattr(model, 'module') else model

  # If we save using the predefined names, we can load using `from_pretrained`
  output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
  output_config_file = os.path.join(output_dir, CONFIG_NAME)

  torch.save(model_to_save.state_dict(), output_model_file)
  model_to_save.config.to_json_file(output_config_file)
  tokenizer.save_vocabulary(output_dir)

  # Download exactly the files that were just written
  files.download("./models/vocab.txt")
  files.download(output_config_file)
  files.download(output_model_file)

In [None]:
# Persist the evaluation summary obtained for the model
def save_accuracy_validation(validationStr):
  """Write ``validationStr`` to ``./models/validation.txt`` and download the file.

  The ``./models`` directory is created when missing, so the function also
  works when no model has been saved yet.

  Args:
    validationStr: text to store; a newline is appended.
  """
  output_file = "./models/validation.txt"
  os.makedirs(os.path.dirname(output_file), exist_ok=True)
  with open(output_file,"w") as f :
    f.write(validationStr+"\n")
  files.download(output_file)

# BERT Processing

In [None]:
# install
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
!pip install transformers
from transformers import WEIGHTS_NAME, CONFIG_NAME
from google.colab import files
import os
% matplotlib inline

# specify GPU device (falls back to the CPU when no GPU is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists: get_device_name fails on a CPU-only runtime
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"



'Tesla T4'

In [None]:
# create inputs:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
learning_inputs_ids = create_inputs_ids(trainingXSet,tokenizer,128)
learning_masks = create_attention_masks(learning_inputs_ids)

# split ids, labels and masks into training and validation parts in ONE call,
# so the three stay aligned by construction instead of relying on two
# separate calls sharing the same seed:
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    learning_inputs_ids, trainingYSet, learning_masks,
    random_state=2018, test_size=0.1)

# Create data loader:
train_dataloader = create_dataloader(train_inputs,train_labels,train_masks)
validation_dataloader = create_dataloader(validation_inputs,validation_labels,validation_masks)

# init model (28 target categories)
model = model_creation(28)

# training model with 10 epochs
model = model_training(model,train_dataloader,validation_dataloader,10)

# download the fine-tuned weights, configuration and vocabulary onto the computer
save_model_directory(model,tokenizer)

#############
# create the inputs of the held-out test set:
testing_inputs_ids = create_inputs_ids(testingXSet,tokenizer,128)
testing_masks = create_attention_masks(testing_inputs_ids)
# Create data loader:
test_dataloader = create_dataloader(testing_inputs_ids,testingYSet,testing_masks)
# evaluation on the test set
validation_str = evaluation_model(model,test_dataloader)
# saved onto computer
save_accuracy_validation(validation_str)

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (595 > 512). Running this sequence through BERT will result in indexing errors
t_total value of -1 results in schedule not being applied
Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: ignored