In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv("data/train_data.csv", usecols=["reviewText", "sentiment"])
val_data = pd.read_csv("data/validation_data.csv", usecols=["reviewText", "sentiment"])
test_data = pd.read_csv("data/test_data.csv", usecols=["reviewText", "sentiment"])

In [3]:
train_data.dropna(inplace=True)
train_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

val_data.dropna(inplace=True)
val_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

test_data.dropna(inplace=True)
test_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

In [4]:
from datasets import Dataset

In [5]:
train_data_ds = Dataset.from_pandas(train_data)
val_data_ds = Dataset.from_pandas(val_data)
test_data_ds = Dataset.from_pandas(test_data)


In [22]:
from transformers import RobertaTokenizer, RobertaTokenizerFast, TFRobertaModel, DataCollatorWithPadding
import tensorflow as tf
checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert_model = TFRobertaModel.from_pretrained('roberta-base')

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [24]:
from transformers import AutoConfig

In [31]:
config = AutoConfig.from_pretrained(checkpoint).to_dict()

In [32]:
config["attention_probs_dropout_prob"] = 0.2
config["classifier_dropout"] = 0.2

In [33]:
class Classifier(tf.keras.Model):
    
    def __init__(self, bert_encoder):
        super().__init__()
        self.encoder = bert_encoder
        self.classifier_1 = tf.keras.layers.Dense(32, activation="relu")
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.classifier_2 = tf.keras.layers.Dense(1)

    def call(self, inputs, training):
        x = self.encoder(inputs)["pooler_output"]
        x = self.classifier_1(x)
        x = self.dropout(x, training=training)
        x = self.classifier_2(x)
        return x

In [34]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


train_datasets = train_data_ds.map(tokenize_function, batched=True)
val_datasets = val_data_ds.map(tokenize_function, batched=True)
test_datasets = test_data_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


tf_train_dataset = train_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)

tf_validation_dataset = val_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)

tf_test_dataset = test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [36]:
bert_encoder = TFRobertaModel.from_pretrained('roberta-base', config)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [37]:
from tensorflow.keras.losses import BinaryCrossentropy

In [38]:
model = Classifier(bert_encoder)

In [39]:
data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
total = len(data)
data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
classes = data["sentiment"].value_counts().to_dict()
total = len(data)
weight_for_0 = (1 / classes[0]) * (total / 2.0)
weight_for_1 = (1 / classes[1]) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

In [40]:
class_weight

{0: 5.270777479892762, 1: 0.5524023602135432}

In [41]:
model.compile(
    optimizer="adam",
    loss=BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    class_weight=class_weight,
    epochs=10
)

Epoch 1/10


2022-12-14 20:28:31.841593: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-12-14 21:07:04.344641: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2fabd4e80>

In [42]:
model.evaluate(tf_test_dataset)



[0.6942139863967896, 0.09368635714054108]

In [18]:
model.encoder.save_pretrained("")

<transformers.models.roberta.modeling_tf_roberta.TFRobertaModel at 0x2890921c0>