In [None]:
# Project folders are resolved relative to the parent of the current working
# directory, which is treated as the repository root.
from os import getcwd
from os.path import join, dirname

PATH_REPO = dirname(getcwd())
PATH_UTILS = join(PATH_REPO, 'utils')
PATH_DATA = join(PATH_REPO, 'data')
PATH_DESTINATION = join(PATH_REPO, 'models')

# Put the utils folder first on the import path so that the project-local
# modules imported at the bottom of this cell can be found.
import sys
sys.path.insert(0,PATH_UTILS)


import re

from os.path import join
import pandas as pd
import numpy as np


import nlpaug.augmenter.word as naw

import matplotlib.pyplot as plt

#from transformers import DistilBertTokenizer, TFDistilBertModel
from transformers import AutoTokenizer, AutoModelForMaskedLM


import tensorflow as tf
import logging

# NLTK resources downloaded up front; the synonym augmenter created later uses
# aug_src='wordnet'.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Only log TensorFlow errors. The GPU device list is queried but its result is
# neither stored nor displayed (it is not the last expression of the cell).
tf.get_logger().setLevel('ERROR')
tf.config.list_physical_devices('GPU')

# Do not truncate long text cells when DataFrames are displayed.
pd.set_option('max_colwidth', None)

# Project-local helpers, importable because PATH_UTILS was added to sys.path above.
from bert_training import train_bert, get_model, get_inputs
from preprocessing import additional_preprocessing

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Load the datasets

In [None]:
def _read_split(filename):
    """Load one split CSV from PATH_DATA and return its cleaned rows.

    Only the English text, the label and the three translated columns are
    read. The row whose index label is 0 is dropped, followed by every row
    that still contains a missing value.
    """
    frame = pd.read_csv(
        join(PATH_DATA, filename),
        usecols=['text', 'label', 'french', 'spanish', 'arabic'],
        engine="python",
    )
    return frame.drop(0).dropna()


train = _read_split("training_set.csv")
validation = _read_split("validation_set.csv")
test = _read_split("test_set.csv")

In [None]:
# Display the loaded test split for a visual sanity check.
test

## Apply additional preprocessing

In [None]:
# The same three steps are applied to every split and every text column:
#   1. force the column to str,
#   2. rewrite the language-specific user-mention word to one placeholder,
#   3. run the project helper `additional_preprocessing`.
# Driving this from loops keeps the splits consistent. The previous
# copy-pasted version filled validation['arabic'] from validation['text'],
# replacing the Arabic validation data with English.
splits = (train, validation, test)
text_columns = ["text", "french", "spanish", "arabic"]

# Words standing for a user mention in each translated column (presumably how
# the mention placeholder came out after translation - TODO confirm).
# The Spanish pattern accepts the correct spelling "Usuario"/"usuario" as well
# as the misspelt "Usario"/"usario" that was matched before, so anything that
# was replaced previously is still replaced.
user_mention_patterns = {
    "arabic": re.compile('(مستخدم)|(المستخدم)'),
    "spanish": re.compile('(Usu?ario)|(usu?ario)'),
    "french": re.compile('(Utilisateur)|(utilisateur)'),
}

# NOTE(review): the replacement is inserted literally, forward slashes
# included. Confirm that `additional_preprocessing` expects this exact form.
user_placeholder = '/[USER/]'

# Step 1: make sure every text column holds plain strings.
for frame in splits:
    for column in text_columns:
        frame[column] = frame[column].astype(str)

# Step 2: normalise user mentions in the translated columns.
for frame in splits:
    for column, pattern in user_mention_patterns.items():
        frame[column] = frame[column].apply(lambda x: pattern.sub(user_placeholder, x))

# Step 3: project-specific clean-up, each column derived from itself.
for frame in splits:
    for column in text_columns:
        frame[column] = frame[column].apply(additional_preprocessing)

In [None]:
# Display the validation split after preprocessing for a visual sanity check.
validation

## Initialize augmenters

In [None]:
# Synonym augmenter backed by WordNet; 'USER' is listed as a stopword and
# aug_max is set to 5.
syn = naw.SynonymAug(aug_src='wordnet', stopwords=['USER'], aug_max=5)
# Random word augmenter created with the library's default settings.
# NOTE(review): no random seed is set in the visible cells, so the augmented
# text presumably differs between runs - confirm whether that is intended.
rand = naw.RandomWordAug()

## Apply augmentation

In [None]:
def _augment_once(augmenter, text, **kwargs):
    """Run ``augmenter.augment`` on one text and return a single string.

    Depending on the installed nlpaug release, ``augment`` returns either a
    string or a list of strings. A list result is reduced to its first
    element so the text column keeps holding plain strings. If the list is
    empty, the unmodified input text is returned.
    """
    augmented = augmenter.augment(text, **kwargs)
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if augmented else text
    return augmented


# Only the English training column is augmented, in two full passes:
# first the synonym augmenter, then the random word augmenter on its output.
train['text'] = train['text'].apply(lambda x: _augment_once(syn, x, n=1))
train['text'] = train['text'].apply(lambda x: _augment_once(rand, x))

## Merge different language text columns to prepare the arrays

In [None]:
# Order in which the language columns are stacked. Anything that splits the
# stacked arrays back into languages must use this same order:
# English, French, Arabic, Spanish.
stacked_columns = ["text", "french", "arabic", "spanish"]


def _stack_languages(frame):
    """Return ``(texts, labels)`` with the four language columns stacked.

    ``texts`` is the concatenation of the columns in ``stacked_columns``
    order. ``labels`` repeats the frame's integer labels once per column, so
    position ``i`` in both arrays refers to the same example.

    The frame's ``label`` column is cast to int in place and otherwise left
    untouched. The stacked labels are deliberately NOT written back into the
    frame: they are four times longer and 0-indexed, so pandas would align
    them by index and silently shift the labels of the original rows.
    """
    frame["label"] = frame["label"].astype(int)
    row_labels = frame.loc[:, "label"].values
    texts = np.concatenate([frame.loc[:, column].values for column in stacked_columns])
    labels = np.concatenate([row_labels] * len(stacked_columns))
    return texts, labels


X_train, Y_train = _stack_languages(train)
X_dev, Y_dev = _stack_languages(validation)
X_test, Y_test = _stack_languages(test)

## Load the tokenizer and the transformer

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel

# The tokenizer and the TensorFlow encoder are loaded from the same pretrained
# checkpoint, so the name is written only once.
checkpoint_name = 'distilbert-base-multilingual-cased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
transformer_model = TFDistilBertModel.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/869M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Empty list; no later cell visible in this notebook reads it.
special_tokens = []

## Prepare inputs

In [None]:
# Convert the stacked validation and test texts into model inputs with the
# project helper `get_inputs`. 128 is passed as its third argument
# (presumably the maximum sequence length - confirm in bert_training).
dev_inputs = get_inputs(tokenizer, X_dev, 128)
test_inputs = get_inputs(tokenizer, X_test, 128)

## Build the model

In [None]:
# Build a model around the pretrained transformer with the project helper
# `get_model` and print its layer summary.
# NOTE(review): the training cell assigns `model` again from `train_bert`, so
# this instance appears to be used only for the summary - confirm.
model = get_model(128, transformer_model, 1)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_token (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 masked_token (InputLayer)      [(None, 128)]        0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  134734080  ['input_token[0][0]',            
 BertModel)                     ast_hidden_state=(N               'masked_token[0][0]']           
                                one, 128, 768),                                                   
                                 hidden_states=None                                           

## Train the model

In [None]:
# Train on the stacked multilingual training data, with the stacked validation
# data passed alongside, then save the returned model under PATH_DESTINATION.
# NOTE(review): the meaning of the positional values 16, 64, 1e-5 and 16 is
# defined by bert_training.train_bert and is not visible here. The save name
# says "16_128_1e-5_16" and 128 is used for get_inputs/get_model, while 64 is
# passed here - confirm the sequence length used for training matches.
model = train_bert(np.array(X_train), np.array(Y_train), np.array(X_dev), np.array(Y_dev), tokenizer, 16, 64, transformer_model, ["Gratitude", "No Gratitude"], 1e-5, 16)
model.save(join(PATH_DESTINATION, 'bert_multi_16_128_1e-5_16'))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16




# Look at the results

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score

#new_model = tf.keras.models.load_model('saved_model/my_model')

# Predict on the stacked test inputs; the predicted class of each row is the
# index of its largest output value.
preds_test = model.predict(test_inputs)
y_pred_bool_test = np.argmax(preds_test, axis=1)

# Macro-averaged precision, recall and F1 over all languages together.
precision, recall, fscore, support = score(Y_test, y_pred_bool_test, average='macro')
print(" Precision:\t", precision, "\n", "Recall:\t", recall, "\n", "F1:\t", fscore)

Precision:	 0.700027570995313 
 Recall:	 0.6476463313451994 
 F1:	 0.5641620777014215


In [None]:
from sklearn.metrics import classification_report

# Predict on both the validation and the test inputs.
preds_dev = model.predict(dev_inputs)
preds_test = model.predict(test_inputs)

# Predicted class = index of the largest output value in each row.
y_pred_bool_dev = np.argmax(preds_dev, axis=1)
y_pred_bool_test = np.argmax(preds_test, axis=1)

# Per-class report on the stacked validation set.
print(classification_report(Y_dev, y_pred_bool_dev))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       404
           1       0.98      0.99      0.99       396

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800



In [None]:
# Per-class report on the stacked test set (all languages together).
print(classification_report(Y_test, y_pred_bool_test))

              precision    recall  f1-score   support

           0       0.93      0.34      0.50      1284
           1       0.47      0.96      0.63       796

    accuracy                           0.57      2080
   macro avg       0.70      0.65      0.56      2080
weighted avg       0.75      0.57      0.55      2080



## Prepare lists for Error Analysis by language

In [None]:
# Split the stacked test texts, gold labels and predictions back into one
# bucket per language for the error analysis below.
#
# X_test / Y_test are built by concatenating four equally long blocks in the
# fixed order English, French, Arabic, Spanish, so the language of every
# position is already known. Slicing by position is exact and needs no
# language detector. The previous loop relied on `langid`, which is never
# imported in this notebook (NameError), would drop or mis-bucket any text the
# detector got wrong, and appended the Arabic predictions to `fr_p`.
labels = Y_test
preds = y_pred_bool_test
texts = X_test

language_count = 4
if not (len(texts) == len(labels) == len(preds)):
    raise ValueError("texts, labels and predictions must have the same length")
if len(texts) % language_count != 0:
    raise ValueError("stacked test set is not made of four equally sized language blocks")
block_size = len(texts) // language_count


def _language_block(values, position):
    """Return the `position`-th language block of a stacked array as a list."""
    start = position * block_size
    return list(values[start:start + block_size])


# Block positions follow the stacking order: 0 = en, 1 = fr, 2 = ar, 3 = es.
en_t, fr_t, ar_t, es_t = (_language_block(texts, position) for position in range(language_count))
en_l, fr_l, ar_l, es_l = (_language_block(labels, position) for position in range(language_count))
en_p, fr_p, ar_p, es_p = (_language_block(preds, position) for position in range(language_count))

NameError: ignored

## English results

In [None]:
# Macro-averaged precision, recall and F1 for the English bucket
# (en_l = gold labels, en_p = predictions).
precision, recall, fscore, support = score(en_l, en_p, average='macro')
print("Precision:\t", precision, "\n", "Recall:\t", recall, "\n", "F1:\t", fscore)

In [None]:
# Per-class report for the English bucket.
print(classification_report(en_l, en_p))

## French results

In [None]:
# Macro-averaged precision, recall and F1 for the French bucket
# (fr_l = gold labels, fr_p = predictions).
precision, recall, fscore, support = score(fr_l, fr_p, average='macro')
print("Precision:\t", precision, "\n", "Recall:\t", recall, "\n", "F1:\t", fscore)

In [None]:
# Per-class report for the French bucket.
print(classification_report(fr_l, fr_p))

## Spanish results

In [None]:
# Macro-averaged precision, recall and F1 for the Spanish bucket
# (es_l = gold labels, es_p = predictions).
precision, recall, fscore, support = score(es_l, es_p, average='macro')
print("Precision:\t", precision, "\n", "Recall:\t", recall, "\n", "F1:\t", fscore)

In [None]:
# Per-class report for the Spanish bucket.
print(classification_report(es_l, es_p))

## Arabic results

In [None]:
# Macro-averaged precision, recall and F1 for the Arabic bucket
# (ar_l = gold labels, ar_p = predictions).
precision, recall, fscore, support = score(ar_l, ar_p, average='macro')
print("Precision:\t", precision, "\n", "Recall:\t", recall, "\n", "F1:\t", fscore)

In [None]:
# Per-class report for the Arabic bucket.
print(classification_report(ar_l, ar_p))

# Error Analysis English

In [None]:
# False positives: English texts with gold label 0 that were predicted as 1.
[text for i, text in enumerate(en_t) if en_l[i] == 0 and en_p[i] == 1]

In [None]:
# False negatives: English texts with gold label 1 that were predicted as 0.
[text for i, text in enumerate(en_t) if en_l[i] == 1 and en_p[i] == 0]

In [None]:
# True negatives: English texts with gold label 0 that were predicted as 0.
[text for i, text in enumerate(en_t) if en_l[i] == 0 and en_p[i] == 0]

In [None]:
# True positives: English texts with gold label 1 that were predicted as 1.
[text for i, text in enumerate(en_t) if en_l[i] == 1 and en_p[i] == 1]

# Error Analysis French

In [None]:
# False positives: French texts with gold label 0 that were predicted as 1.
[text for i, text in enumerate(fr_t) if fr_l[i] == 0 and fr_p[i] == 1]

In [None]:
# False negatives: French texts with gold label 1 that were predicted as 0.
[text for i, text in enumerate(fr_t) if fr_l[i] == 1 and fr_p[i] == 0]

In [None]:
# True negatives: French texts with gold label 0 that were predicted as 0.
[text for i, text in enumerate(fr_t) if fr_l[i] == 0 and fr_p[i] == 0]

In [None]:
# True positives: French texts with gold label 1 that were predicted as 1.
[text for i, text in enumerate(fr_t) if fr_l[i] == 1 and fr_p[i] == 1]

# Error Analysis Spanish

In [None]:
# False positives: Spanish texts with gold label 0 that were predicted as 1.
[text for i, text in enumerate(es_t) if es_l[i] == 0 and es_p[i] == 1]

In [None]:
# False negatives: Spanish texts with gold label 1 that were predicted as 0.
[text for i, text in enumerate(es_t) if es_l[i] == 1 and es_p[i] == 0]

In [None]:
# True negatives: Spanish texts with gold label 0 that were predicted as 0.
[text for i, text in enumerate(es_t) if es_l[i] == 0 and es_p[i] == 0]

In [None]:
# True positives: Spanish texts with gold label 1 that were predicted as 1.
[text for i, text in enumerate(es_t) if es_l[i] == 1 and es_p[i] == 1]

# Error Analysis Arabic

In [None]:
# False positives: Arabic texts with gold label 0 that were predicted as 1.
[text for i, text in enumerate(ar_t) if ar_l[i] == 0 and ar_p[i] == 1]

In [None]:
# False negatives: Arabic texts with gold label 1 that were predicted as 0.
[text for i, text in enumerate(ar_t) if ar_l[i] == 1 and ar_p[i] == 0]

In [None]:
# True negatives: Arabic texts with gold label 0 that were predicted as 0.
[text for i, text in enumerate(ar_t) if ar_l[i] == 0 and ar_p[i] == 0]

In [None]:
# True positives: Arabic texts with gold label 1 that were predicted as 1.
[text for i, text in enumerate(ar_t) if ar_l[i] == 1 and ar_p[i] == 1]