In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaTokenizerFast, TFRobertaModel, DataCollatorWithPadding
import tensorflow as tf
from datasets import Dataset
from transformers import AutoConfig
from tensorflow.keras.losses import BinaryCrossentropy
from transformers import RobertaConfig

In [2]:
train_data_3 = pd.read_csv("data_v2/Aug_BackTranslation.csv")

from_french = train_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = train_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = train_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = train_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

train_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [3]:
val_data_3 = pd.read_csv("data_v2/Aug_Val_BackTranslation.csv")

from_french = val_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = val_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = val_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = val_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

val_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [4]:
train_data_1 = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
train_data_2 = pd.read_csv("data_v2/Aug_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
train_data = pd.concat([train_data_1, train_data_2, train_data_3], ignore_index=True)

val_data_1 = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])
val_data_2 = pd.read_csv("data_v2/Aug_Val_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
val_data = pd.concat([val_data_1, val_data_2, val_data_3], ignore_index=True)


test_data = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

In [5]:
val_data.shape

(721, 2)

In [6]:
train_data.shape

(5748, 2)

In [7]:
train_data.isna().sum()

reviewText    1
sentiment     0
dtype: int64

In [8]:
train_data.dropna(inplace=True)
train_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

val_data.dropna(inplace=True)
val_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

test_data.dropna(inplace=True)
test_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

In [9]:
train_data.shape

(5747, 2)

In [10]:
train_data_ds = Dataset.from_pandas(train_data)
val_data_ds = Dataset.from_pandas(val_data)
test_data_ds = Dataset.from_pandas(test_data)


In [11]:

checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)


In [12]:
class Classifier(tf.keras.Model):
    
    def __init__(self, bert_encoder):
        super().__init__()
        self.encoder = bert_encoder
        self.classifier_1 = tf.keras.layers.Dense(128, activation="relu")
        self.dropout_1 = tf.keras.layers.Dropout(0.4)
        self.classifier_2 = tf.keras.layers.Dense(32, activation="tanh")
        self.dropout_2 = tf.keras.layers.Dropout(0.4)
        self.classifier_3 = tf.keras.layers.Dense(1)

    def call(self, inputs, training):
        x = self.encoder(inputs, training=training)["last_hidden_state"][:,0]
        x = self.classifier_1(x)
        x = self.dropout_1(x, training=training)
        x = self.classifier_2(x)
        x = self.dropout_2(x, training=training)
        x = self.classifier_3(x)
        return x

In [13]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


train_datasets = train_data_ds.map(tokenize_function, batched=True)
val_datasets = val_data_ds.map(tokenize_function, batched=True)
test_datasets = test_data_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


tf_train_dataset = train_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset = tf_train_dataset.cache()

tf_validation_dataset = val_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)
tf_validation_dataset = tf_validation_dataset.cache()

tf_test_dataset = test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)



  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Metal device set to: Apple M1 Max


2022-12-19 22:07:01.634280: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-19 22:07:01.634481: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [14]:
bert_encoder = TFRobertaModel.from_pretrained(checkpoint, trainable=True)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [15]:
model = Classifier(bert_encoder)

In [16]:
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# total = len(data)
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# classes = data["sentiment"].value_counts().to_dict()
# total = len(data)
# weight_for_0 = (1 / classes[0]) * (total / 2.0)
# weight_for_1 = (1 / classes[1]) * (total / 2.0)

# class_weight = {0: weight_for_0, 1: weight_for_1}

In [17]:
checkpoint_filepath = 'classifier_checkpoint_v0/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model_checkpoint_earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=2)

In [18]:
model.compile(
    optimizer="adam",
    loss=BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    callbacks=[model_checkpoint_callback, model_checkpoint_earlyStopping],
    epochs=10
)

2022-12-19 22:07:05.767633: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


2022-12-19 22:07:10.417312: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-12-19 22:23:47.766957: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x2971d8610>

In [19]:
model.evaluate(tf_test_dataset)



[0.6378714442253113, 0.10478360205888748]

In [20]:
from sklearn.metrics import f1_score, precision_recall_curve

In [21]:
train_data_for_pred = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
val_data_for_pred = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred = pd.concat([train_data_for_pred, val_data_for_pred], ignore_index=True)

test_data_for_pred = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred.dropna(inplace=True)
train_data_for_pred.rename(columns={"reviewText": "text", 
                                    "sentiment": "labels"}, 
                           inplace=True)


test_data_for_pred.dropna(inplace=True)
test_data_for_pred.rename(columns={"reviewText": "text", 
                                   "sentiment": "labels"}, 
                         inplace=True)


train_data_for_pred_ds = Dataset.from_pandas(train_data_for_pred)
test_data_for_pred_ds = Dataset.from_pandas(test_data_for_pred)

train_datasets_for_pred = train_data_for_pred_ds.map(tokenize_function, batched=True)
test_datasets_for_pred = test_data_for_pred_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")



tf_train_dataset_for_pred = train_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset_for_pred = tf_train_dataset_for_pred.cache()


tf_test_dataset_for_pred = test_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)
tf_test_dataset_for_pred = tf_test_dataset_for_pred.cache()

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
train_predict = model.predict(tf_train_dataset_for_pred)
test_predict = model.predict(tf_test_dataset_for_pred)

2022-12-19 23:46:12.276522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [25]:
train_predict_logits = train_predict.ravel()
test_predict_logits = test_predict.ravel()


In [28]:
import numpy as np

In [26]:
train_labels = train_data_for_pred["labels"].values
test_labels = test_data_for_pred["labels"].values

In [29]:
precision, recall, thresholds = precision_recall_curve(train_labels, train_predict_logits)

# convert to f score
fscore = (2 * precision * recall) / (precision + recall+0.000000001)
ix = np.argmax(fscore)
print('Best Threshold=%f, Training F-Score=%.3f' % (thresholds[ix], fscore[ix]))

Best Threshold=0.146658, Training F-Score=0.944


In [30]:
precision, recall, thresholds

(array([0.89361702, 0.89359007, 0.89378961, 0.89418929, 0.89410868,
        0.89433384, 0.89442589, 0.89454545, 0.8939314 , 0.89449291,
        0.89420085, 0.8967587 , 0.89777491, 0.89618266, 0.8927571 ,
        0.89343881, 0.88957055, 0.89280677, 0.89620019, 0.88578372,
        0.89695946, 0.89356436, 0.89411765, 0.8972973 , 0.90909091,
        0.89285714, 0.84      , 0.75      , 1.        , 1.        ]),
 array([1.00000000e+00, 9.99716553e-01, 9.99433107e-01, 9.98866213e-01,
        9.98015873e-01, 9.93197279e-01, 9.86961451e-01, 9.76190476e-01,
        9.60317460e-01, 9.29988662e-01, 8.95975057e-01, 8.46938776e-01,
        7.89115646e-01, 7.12018141e-01, 6.32369615e-01, 5.44217687e-01,
        4.52097506e-01, 3.58843537e-01, 2.74092971e-01, 2.06632653e-01,
        1.50510204e-01, 1.02324263e-01, 6.46258503e-02, 4.70521542e-02,
        2.55102041e-02, 1.41723356e-02, 5.95238095e-03, 2.55102041e-03,
        5.66893424e-04, 0.00000000e+00]),
 array([0.14665829, 0.14665835, 0.14665836, 

In [32]:
from sklearn.metrics import classification_report

In [48]:
train_predict_labels = [int(x>=0.14665857) for x in train_predict_logits]

print(classification_report(train_labels, train_predict_labels))

              precision    recall  f1-score   support

           0       0.11      0.64      0.18       420
           1       0.89      0.36      0.51      3528

    accuracy                           0.39      3948
   macro avg       0.50      0.50      0.35      3948
weighted avg       0.81      0.39      0.48      3948



In [49]:
test_predict_labels = [int(x>=0.14665857) for x in test_predict_logits]

print(classification_report(test_labels, test_predict_labels))

              precision    recall  f1-score   support

           0       0.10      0.65      0.18        46
           1       0.89      0.35      0.50       393

    accuracy                           0.38       439
   macro avg       0.50      0.50      0.34       439
weighted avg       0.81      0.38      0.47       439



In [50]:
model.encoder.save_pretrained("roberta-finetuned-V1/")

In [51]:
def f1_score(y, y_pred):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for actual_value, predicted_value in zip(y, y_pred):
        if predicted_value == actual_value:  # t?
            if predicted_value:  # tp
                tp += 1
            else:  # tn
                tn += 1
        else:  # f?
            if predicted_value:  # fp
                fp += 1
            else:  # fn
                fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    print('----------------------------------')
    print('                 Actual Value')
    print('----------------------------------')
    print(f'            Positive    Negative')
    print(f'Positive    {tp:^8}    {fp:^8}')
    print(f'Negative    {fn:^8}    {tn:^8}')
    print('----------------------------------')
    return f1


In [52]:
f1_score(test_labels, test_predict_labels)

----------------------------------
                 Actual Value
----------------------------------
            Positive    Negative
Positive      136          16   
Negative      257          30   
----------------------------------


0.49908256880733937