<a href="https://colab.research.google.com/github/Nour-Mws/french_sentiment_analysis_on_tweets/blob/main/training_allocine_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and functions

In [4]:
!pip install transformers>=4.0
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 16.6MB/s eta 0:00:01[K     |▌                               | 20kB 14.8MB/s eta 0:00:01[K     |▉                               | 30kB 9.2MB/s eta 0:00:01[K     |█                               | 40kB 9.2MB/s eta 0:00:01[K     |█▍                              | 51kB 5.6MB/s eta 0:00:01[K     |█▋                              | 61kB 5.3MB/s eta 0:00:01[K     |██                              | 71kB 5.7MB/s eta 0:00:01[K     |██▏                             | 81kB 6.3MB/s eta 0:00:01[K     |██▌                             | 92kB 6.7MB/s eta 0:00:01[K     |██▊                             | 102kB 5.3MB/s eta 0:00:01[K     |███                             | 112kB 5.3MB/s eta 0:00:01[K     |███▎                 

In [5]:
import numpy as np
import pickle
from sklearn.base import BaseEstimator
from sklearn import metrics
from sklearn.base import TransformerMixin
import tensorflow as tf
import time
from transformers import CamembertTokenizer
from transformers import TFCamembertForSequenceClassification

In [6]:
class CamembertPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw reviews and their labels into arrays ready for the model.

    Parameters
    ----------
    tokenizer
        Tokenizer object forwarded to ``encode_reviews``.
    max_seq_length : int
        Number of token ids kept per review (forwarded to ``encode_reviews``).
    """

    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def fit(self, X=None, y=None):
        """Nothing is learned from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y):
        """Encode the reviews ``X`` and convert the labels ``y`` to a NumPy array.

        Returns
        -------
        tuple
            ``(X_encoded, y_array)`` where ``X_encoded`` is the value returned
            by ``encode_reviews`` and ``y_array`` is ``np.array(y)``.
        """
        # 1. Tokenize
        X_encoded = encode_reviews(self.tokenizer, X, self.max_seq_length)
        # 2. Labels
        y_array = np.array(y)
        return X_encoded, y_array

    def fit_transform(self, X, y):
        """Equivalent to ``transform`` because fitting is a no-op."""
        return self.transform(X, y)


class EarlyStoppingModel(BaseEstimator):
    """Estimator-style wrapper that trains a Keras model with early stopping.

    Parameters
    ----------
    transformers_model
        Compiled model exposing Keras-style ``fit`` and ``predict``.
    max_epoches : int
        Upper bound on the number of training epochs.
    batch_size : int
        Batch size passed to ``model.fit``.
    validation_data
        Validation data passed to ``model.fit``; its loss drives early stopping.
    """

    def __init__(
        self, transformers_model, max_epoches, batch_size, validation_data):
        self.model = transformers_model
        self.max_epoches = max_epoches
        self.batch_size = batch_size
        self.validation_data = validation_data

    def fit(self, X, y):
        """Train the wrapped model on ``(X, y)`` and return ``self``."""
        # Stop once val_loss has not improved for 2 consecutive epochs and
        # roll the model back to the best weights seen.
        early_stopper = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', mode='auto', patience=2,
            verbose=1, restore_best_weights=True
        )

        # Train model on data subset
        self.model.fit(
            X, y,
            validation_data=self.validation_data,
            epochs=self.max_epoches,
            batch_size=self.batch_size,
            callbacks=[early_stopper],
            verbose=1
        )
        return self

    def predict(self, X):
        """Return the index of the highest-scoring class for each sample in ``X``."""
        scores = self._extract_scores(self.model.predict(X))
        y_pred = np.argmax(scores, axis=1)
        return y_pred

    @staticmethod
    def _extract_scores(outputs):
        """Pull the per-class score array out of whatever ``model.predict`` returned."""
        # Depending on the library version, predict() may return a bare array,
        # a dict / output object carrying a "logits" entry, or a tuple whose
        # first element holds the logits.
        if isinstance(outputs, dict):
            return outputs["logits"]
        if hasattr(outputs, "logits"):
            return outputs.logits
        if isinstance(outputs, (tuple, list)):
            return outputs[0]
        return outputs


def accuracy_vs_training_data(camembert_model, initial_weights,
                              preprocessor, sizes,
                              train_reviews, train_labels,
                              val_reviews, val_labels,
                              test_reviews, test_labels,
                              max_epoches=20, batch_size=4):
    """Measure test accuracy as a function of the number of training reviews.

    For every value in ``sizes`` the model is reset to ``initial_weights``,
    trained with early stopping on the first ``size`` training reviews, and
    scored on the full test set.

    Parameters
    ----------
    camembert_model
        Compiled model; it is re-trained in place for every size.
    initial_weights
        Weights restored through ``set_weights`` before each training run.
    preprocessor
        Object whose ``transform`` / ``fit_transform`` take ``(reviews, labels)``
        and return ``(X, y)``.
    sizes : iterable of int
        Training-subset sizes to evaluate.
    train_reviews, train_labels, val_reviews, val_labels, test_reviews, test_labels
        Dataset splits; the training split must support slicing.
    max_epoches : int, optional
        Maximum number of epochs per run (default 20, the previous fixed value).
    batch_size : int, optional
        Training batch size (default 4, the previous fixed value).

    Returns
    -------
    list of float
        One test accuracy per entry of ``sizes``, in the same order.
    """
    test_accuracies = []
    # Validation and test sets do not depend on the subset size: encode once.
    X_val, y_val = preprocessor.transform(val_reviews, val_labels)
    X_test, y_test = preprocessor.transform(test_reviews, test_labels)

    for size in sizes:
        # Preprocess data
        X_train, y_train = preprocessor.fit_transform(
            train_reviews[:size], train_labels[:size]
        )

        # Reset weights to initial value.
        # NOTE(review): only model weights are restored here; optimizer state
        # presumably carries over between runs. Confirm this is intended.
        camembert_model.set_weights(initial_weights)

        best_model = EarlyStoppingModel(
            camembert_model, max_epoches=max_epoches, batch_size=batch_size,
            validation_data=(X_val, y_val)
        )

        # Train model
        best_model.fit(X_train, y_train)

        # Evaluate on test set
        y_pred = best_model.predict(X_test)
        test_acc = metrics.accuracy_score(y_test, y_pred)
        test_accuracies.append(test_acc)
        print("Test acc: " + str(test_acc))

    return test_accuracies


def encode_reviews(tokenizer, reviews, max_length):
    """Encode reviews into fixed-width ``input_ids`` / ``attention_mask`` arrays.

    Parameters
    ----------
    tokenizer
        Object whose ``encode(text, max_length=..., truncation=...)`` returns a
        sequence of integer token ids.
    reviews : sequence of str
        Texts to encode.
    max_length : int
        Width of the output arrays; longer encodings are truncated and shorter
        ones are right-padded with zeros.

    Returns
    -------
    dict
        ``{"input_ids": ..., "attention_mask": ...}``, both ``int32`` arrays of
        shape ``(len(reviews), max_length)``. The mask is 1 on every position
        filled from the tokenizer output and 0 on padding.
    """
    token_ids = np.zeros(shape=(len(reviews), max_length), dtype=np.int32)
    attention_mask = np.zeros_like(token_ids)
    for i, review in enumerate(reviews):
        # Request truncation explicitly instead of relying on the tokenizer's
        # implicit fallback when only max_length is given.
        encoded = tokenizer.encode(
            review, max_length=max_length, truncation=True
        )
        # Defensive slice: an over-long encoding must never overflow the row.
        encoded = encoded[:max_length]
        n_tokens = len(encoded)
        token_ids[i, :n_tokens] = encoded
        # Build the mask from the real sequence length, so a genuine token
        # whose id happens to be 0 is not mistaken for padding.
        attention_mask[i, :n_tokens] = 1
    return {"input_ids": token_ids, "attention_mask": attention_mask}


def load_dataset(dataset_path):
    """Read the pickled dataset and unpack its three splits.

    The pickle is expected to hold a mapping with the keys ``"train_set"``,
    ``"val_set"``, ``"test_set"`` (each providing ``'review'`` and
    ``'polarity'``) and ``'class_names'``.

    Returns
    -------
    tuple
        ``(train_reviews, train_labels, val_reviews, val_labels, test_reviews,
        test_labels, class_names)``. Reviews are wrapped in NumPy arrays;
        labels and class names are returned as stored.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(dataset_path, 'rb') as reader:
        data = pickle.load(reader)

    split_keys = ("train_set", "val_set", "test_set")
    reviews = [np.array(data[key]['review']) for key in split_keys]
    labels = [data[key]['polarity'] for key in split_keys]
    class_names = data['class_names']

    return (reviews[0], labels[0], reviews[1], labels[1], reviews[2],
            labels[2], class_names)



In [7]:
# Ask TensorFlow for the name of the GPU device visible to this runtime.
# The bare expression on the last line displays it as the cell output.
device_name = tf.test.gpu_device_name()
device_name

''

In [9]:
# Fail fast when the runtime does not expose the expected GPU device:
# print a hint on how to enable it, then abort the cell with an error.
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

In [10]:
# Display whether the installed TensorFlow build reports CUDA support.
tf.test.is_built_with_cuda()

True

# Load data

In [11]:
# Mount Google Drive under /content/drive (Colab only); the dataset pickle
# is read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Location of the pickled dataset on the mounted Google Drive.
PICKLE_PATH = "/content/drive/My Drive/allocine_dataset.pickle"
# Maximum sequence length fed to the model.
MAX_SEQ_LEN = 400 # in terms of generated tokens (not words)

In [12]:
# Load the train / validation / test splits and the class names from the pickle.
(train_reviews, train_labels, val_reviews, val_labels, test_reviews,
  test_labels, class_names) = load_dataset(PICKLE_PATH)

# Load classification and pre-processing models

In [16]:
def create_model(learning_rate=5e-6):
    """Build and compile the CamemBERT sequence classifier.

    The pretrained weights are loaded from the 'jplu/tf-camembert-base'
    checkpoint and the model is compiled with Adam and a sparse categorical
    cross-entropy loss computed on logits.

    Parameters
    ----------
    learning_rate : float, optional
        Adam learning rate (default 5e-6, the previous fixed value).

    Returns
    -------
    The compiled model. No tokenizer or preprocessor is created here; callers
    that need one must build it themselves.
    """
    model = TFCamembertForSequenceClassification.from_pretrained('jplu/tf-camembert-base')
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=opt, loss=loss_fn, metrics=['accuracy'])
    return model

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_camembert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  110031360 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  592130    
Total params: 110,623,490
Trainable params: 110,623,490
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Only show TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')
model = create_model()
# Snapshot of the freshly loaded weights, presumably kept so the model can be
# reset later via set_weights (as accuracy_vs_training_data does).
initial_weights = model.get_weights()
model.summary()

# Training

## Text encoding

In [23]:
# Encode the inputs.
size = 10000

# `preprocessor` is not defined anywhere at notebook scope (create_model only
# returns the model), so build it here to keep this cell runnable on a fresh
# kernel.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
preprocessor = CamembertPreprocessor(tokenizer, MAX_SEQ_LEN)

# Keep at most `size` examples from each split.
X_train, y_train = preprocessor.fit_transform(
        train_reviews[:size], train_labels[:size]
    )
X_val, y_val = preprocessor.transform(val_reviews[:size], val_labels[:size])
X_test, y_test = preprocessor.transform(test_reviews[:size], test_labels[:size])

## Training time with GPU

In [16]:
# Time a full training run pinned to the first GPU device.
epochs = 5
batch_size = 16
with tf.device('/device:GPU:0'):
  tic = time.time()
  model.fit(
            X_train, y_train,
            batch_size = batch_size,
            validation_data=(X_val, y_val),
            epochs=epochs,
            #callbacks=[early_stopper],
            #verbose=1
        )
  # Wall-clock seconds spent in fit(), validation passes included.
  gpu_time = time.time() - tic 
  print('Total GPU time for {} texts on {} epochs (batch size {}) is {}'.format(
      size, epochs, batch_size, str(round(gpu_time, 2))))  

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Total GPU time for 100 texts on 5 epochs (batch size 16) is 68.07


In [39]:
# Re-print the timing summary from the current values of size, epochs,
# batch_size and gpu_time (whatever cell set them last).
print('Total GPU time for {} texts on {} epochs (batch size {}) is {}'.format(size, epochs, batch_size, str(round(gpu_time, 2))))

Total GPU time for 100 texts on 5 epochs (batch size 4) is 61.38


## Inference time with GPU

In [50]:
# Time inference on the encoded test set, pinned to the first GPU device.
with tf.device('/device:GPU:0'):
  tic = time.time()
  pred = model.predict(X_test)
  # Use a dedicated variable so the training time held in `gpu_time` is not
  # overwritten by the inference measurement.
  gpu_inference_time = time.time() - tic
  # Report the number of texts actually predicted; the test split may hold
  # fewer than `size` examples.
  print('Total GPU time for {} texts is {}'.format(
      len(y_test), str(round(gpu_inference_time, 2))))

Total GPU time for 1000 texts is 30.74


## Training time with CPU

In [None]:
# Time training pinned to the CPU. This run uses 2 epochs, Keras' default
# batch size and no validation data, so the figure is not directly comparable
# with the GPU measurement.
with tf.device('/cpu:0'):
  tic = time.time()
  model.fit(
            X_train, y_train,
            epochs=2,
        )
  # Keep the CPU measurement in its own variable instead of clobbering gpu_time.
  cpu_time = time.time() - tic
  print('Total CPU time for {} texts is {}'.format(size, str(round(cpu_time, 2))))
  

Epoch 1/2


## Training time with TPU

The code below is very slow on the first epoch (much slower than on GPU: about 77 seconds for 100 texts) but lightning fast on the following ones.
Code adapted from [TPUs in Colab](https://colab.research.google.com/notebooks/tpu.ipynb#scrollTo=hJl3vNtJOB-x). 

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [19]:
# Re-create (and compile) the model inside the TPU strategy scope; this
# replaces the `model` built earlier in the notebook.
with tpu_strategy.scope():
  model = create_model()

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Time a full training run of the model built under the TPU strategy.
epochs = 5
batch_size = 16
tic = time.time()
model.fit(
          X_train, y_train,
          batch_size = batch_size,
          validation_data=(X_val, y_val),
          epochs=epochs,
      )
# This run executes on the TPU, so store and report it as TPU time rather
# than overwriting the GPU measurement and mislabelling the output.
tpu_time = time.time() - tic
print('Total TPU time for {} texts on {} epochs (batch size {}) is {}'.format(
    size, epochs, batch_size, str(round(tpu_time, 2))))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Total GPU time for 10000 texts on 5 epochs (batch size 16) is 428.62
