In [1]:
!pip install transformers
!pip install emoji
!pip install contractions

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 5.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 24.3MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |███████

In [1]:
# Data manipulation libraries
import sys, os
import pandas as pd
import numpy as np
import json

import emoji
import contractions
import re

# Scikit-learn packages
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

# Packages to define a BERT model
from transformers import TFBertModel, BertTokenizerFast, BertConfig

# Keras and TensorFlow packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal

In [22]:
# Importing train, validation and test datasets with preprocessed texts and labels
train_GE = pd.read_excel("/content/train.xlsx")
val_GE = pd.read_excel("/content/val.xlsx")
test_GE = pd.read_excel("/content/test.xlsx")
# Shape validation
print(train_GE.shape)


(16000, 2)


In [23]:
data = train_GE.copy()
data.drop(["review"],axis=1,inplace=True)
data=pd.get_dummies(data)
train = pd.concat([train_GE,data], axis=1, join='inner')
train.drop(["sentiment"],axis=1,inplace=True)

In [24]:
data1 = val_GE.copy()
data1.drop(["review"],axis=1,inplace=True)
data1=pd.get_dummies(data1)
val = pd.concat([val_GE,data1], axis=1, join='inner')
val.drop(["sentiment"],axis=1,inplace=True)

In [25]:
data2 = test_GE.copy()
data2.drop(["review"],axis=1,inplace=True)
data2=pd.get_dummies(data2)
test = pd.concat([test_GE,data2], axis=1, join='inner')
test.drop(["sentiment"],axis=1,inplace=True)

In [26]:
full_text = pd.concat([train['review'], val['review'], test["review"]])
max_length = full_text.apply(lambda x: len(x.split())).max()
max_length

66

In [27]:
train.columns

Index(['review', 'sentiment_anger', 'sentiment_fear', 'sentiment_joy',
       'sentiment_love', 'sentiment_sadness', 'sentiment_surprise'],
      dtype='object')

In [28]:
GE_taxonomy= ['sentiment_anger', 'sentiment_fear', 'sentiment_joy','sentiment_love', 'sentiment_sadness', 'sentiment_surprise']

In [29]:

#Importing BERT pre-trained model and tokenizer
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [30]:
# function for creating BERT based model
def create_model(nb_labels):

  # Load the MainLayer
  bert = transformer_model.layers[0]

  # Build the model inputs
  input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
  attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
  token_ids = Input(shape=(max_length,), name='token_ids', dtype='int32')
  inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_ids': token_ids}

  # Load the Transformers BERT model as a layer in a Keras model
  bert_model = bert(inputs)[1]
  dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
  pooled_output = dropout(bert_model, training=False)

  # Then build the model output
  emotion = Dense(units=nb_labels, activation="softmax", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(pooled_output)
  outputs = emotion

  # And combine it all in a model object
  model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel')

  return model

In [31]:
# Creating a model instance
model = create_model(6)


# Take a look at the model
model.summary

<bound method Model.summary of <tensorflow.python.keras.engine.functional.Functional object at 0x7fa8fad30b90>>

In [32]:
# Creating train, validation and test variables
X_train = train['review']
y_train = train.loc[:, GE_taxonomy].values.astype(float)

X_val = val['review']
y_val  = val.loc[:, GE_taxonomy].values.astype(float)

X_test = test['review']
y_test = test.loc[:, GE_taxonomy].values.astype(float)

In [33]:
# Tokenizing train data
train_token = tokenizer(
    text = X_train.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

# Tokenizing valisation data
val_token = tokenizer(
    text = X_val.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

# Tokenizing test data
test_token = tokenizer(
    text = X_test.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

In [34]:
# Creating BERT compatible inputs with Input Ids, attention masks and token Ids 
train = {'input_ids': train_token['input_ids'], 'attention_mask': train_token['attention_mask'],'token_ids': train_token['token_type_ids']}
val = {'input_ids': val_token['input_ids'], 'attention_mask': val_token['attention_mask'],'token_ids': val_token['token_type_ids']}
test = {'input_ids': test_token['input_ids'], 'attention_mask': test_token['attention_mask'],'token_ids': test_token['token_type_ids']}

In [35]:
# Creating TF tensors
train_tensor = tf.data.Dataset.from_tensor_slices((train, y_train)).shuffle(len(train)).batch(32)
val_tensor = tf.data.Dataset.from_tensor_slices((val, y_val)).shuffle(len(val)).batch(32)
test_tensor = tf.data.Dataset.from_tensor_slices((test, y_test)).shuffle(len(test)).batch(32)

In [36]:
# Function for calculating multilabel class weights
def calculating_class_weights(y_true):
    number_dim = np.shape(y_true)[1]
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight('balanced', [0.,1.], y_true[:, i])
    return weights

class_weights = calculating_class_weights(y_train)


In [37]:
# Custom loss function for multilabel
def get_weighted_loss(weights):
    def weighted_loss(y_true, y_pred):
        return K.mean((weights[:,0]**(1-y_true))*(weights[:,1]**(y_true))*K.binary_crossentropy(y_true, y_pred), axis=-1)
    return weighted_loss

In [18]:
from keras.callbacks import ModelCheckpoint

In [20]:
filepath = "/content/drive/MyDrive/samays approch/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"

In [21]:
checkpoint = ModelCheckpoint(filepath,monitor="val_loss",verbose=1,save_best_only=True,mode="max")
callback_lists = [checkpoint]

In [22]:
# Set an optimizer
optimizer = Adam(
    learning_rate=3.e-05,
    )

# Set loss
loss = get_weighted_loss(class_weights)

# Compile the model

model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# train the model
history = model.fit(train_tensor, 
                    epochs=15, 
                    validation_data=val_tensor,
                    callbacks = callback_lists)

Epoch 1/15

KeyboardInterrupt: ignored

In [None]:
# Save model weights
model.save_weights('/content/emotional_bert-weights.h5')

In [None]:
model.save_weights('/content/drive/MyDrive/samays approch/emotional_bert-weights.h5')