In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaTokenizerFast, TFAutoModelForSequenceClassification, DataCollatorWithPadding
import tensorflow as tf
from datasets import Dataset
from transformers import AutoConfig
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import RobertaConfig
import numpy as np

In [2]:
train_data_3 = pd.read_csv("data_v2/Aug_BackTranslation.csv")

from_french = train_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = train_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = train_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = train_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

train_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [3]:
val_data_3 = pd.read_csv("data_v2/Aug_Val_BackTranslation.csv")

from_french = val_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = val_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = val_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = val_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

val_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [4]:
train_data_1 = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
train_data_2 = pd.read_csv("data_v2/Aug_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
train_data = pd.concat([train_data_1, train_data_2, train_data_3], ignore_index=True)

val_data_1 = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])
val_data_2 = pd.read_csv("data_v2/Aug_Val_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
val_data = pd.concat([val_data_1, val_data_2, val_data_3], ignore_index=True)


test_data = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

In [5]:
val_data.shape

(721, 2)

In [6]:
train_data.shape

(5748, 2)

In [7]:
train_data.isna().sum()

reviewText    1
sentiment     0
dtype: int64

In [8]:
train_data.dropna(inplace=True)
train_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

val_data.dropna(inplace=True)
val_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

test_data.dropna(inplace=True)
test_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

In [9]:
train_data.shape

(5747, 2)

In [10]:
train_data_ds = Dataset.from_pandas(train_data)
val_data_ds = Dataset.from_pandas(val_data)
test_data_ds = Dataset.from_pandas(test_data)


In [11]:

checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)


In [13]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


train_datasets = train_data_ds.map(tokenize_function, batched=True)
val_datasets = val_data_ds.map(tokenize_function, batched=True)
test_datasets = test_data_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


tf_train_dataset = train_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset = tf_train_dataset.cache()

tf_validation_dataset = val_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)
tf_validation_dataset = tf_validation_dataset.cache()

tf_test_dataset = test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)



  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
# bert_encoder = TFRobertaModel.from_pretrained(checkpoint, trainable=True)

In [16]:
# model = Classifier(bert_encoder)

In [17]:
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# total = len(data)
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# classes = data["sentiment"].value_counts().to_dict()
# total = len(data)
# weight_for_0 = (1 / classes[0]) * (total / 2.0)
# weight_for_1 = (1 / classes[1]) * (total / 2.0)

# class_weight = {0: weight_for_0, 1: weight_for_1}

In [19]:
checkpoint_filepath = 'classifier_checkpoint_v2/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model_checkpoint_earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=3)

In [20]:
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    callbacks=[model_checkpoint_callback, model_checkpoint_earlyStopping],
    epochs=10
)

2022-12-18 17:56:20.566235: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


2022-12-18 17:56:26.120824: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-12-18 18:36:20.828702: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x28fb9e820>

In [21]:
model.evaluate(tf_test_dataset)



[0.6014251708984375, 0.8952164649963379]

In [23]:
from sklearn.metrics import f1_score, precision_recall_curve

In [24]:
train_data_for_pred = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
val_data_for_pred = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred = pd.concat([train_data_for_pred, val_data_for_pred], ignore_index=True)

test_data_for_pred = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred.dropna(inplace=True)
train_data_for_pred.rename(columns={"reviewText": "text", 
                                    "sentiment": "labels"}, 
                           inplace=True)


test_data_for_pred.dropna(inplace=True)
test_data_for_pred.rename(columns={"reviewText": "text", 
                                   "sentiment": "labels"}, 
                         inplace=True)


train_data_for_pred_ds = Dataset.from_pandas(train_data_for_pred)
test_data_for_pred_ds = Dataset.from_pandas(test_data_for_pred)

train_datasets_for_pred = train_data_for_pred_ds.map(tokenize_function, batched=True)
test_datasets_for_pred = test_data_for_pred_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")



tf_train_dataset_for_pred = train_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset_for_pred = tf_train_dataset_for_pred.cache()


tf_test_dataset_for_pred = test_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)
tf_test_dataset_for_pred = tf_test_dataset_for_pred.cache()

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
train_predict = model.predict(tf_train_dataset_for_pred)
test_predict = model.predict(tf_test_dataset_for_pred)

In [31]:
train_predict_logits = train_predict.logits[:,1]
test_predict_logits = test_predict.logits[:,1]

In [32]:
train_labels = train_data_for_pred["labels"].values
test_labels = test_data_for_pred["labels"].values

In [36]:
precision, recall, thresholds = precision_recall_curve(train_labels, train_predict_logits)

# convert to f score
fscore = (2 * precision * recall) / (precision + recall+0.000000001)
ix = np.argmax(fscore)
print('Best Threshold=%f, Training F-Score=%.3f' % (thresholds[ix], fscore[ix]))

Best Threshold=0.185000, Training F-Score=0.944


In [37]:
precision, recall, thresholds

(array([0.89361702, 0.89485861, 0.89538542, 0.89742605, 0.89785228,
        0.88518519, 0.87037037, 1.        , 1.        , 1.        ]),
 array([1.00000000e+00, 9.86678005e-01, 9.67970522e-01, 6.62131519e-01,
        4.85827664e-01, 6.77437642e-02, 2.66439909e-02, 8.50340136e-04,
        2.83446712e-04, 0.00000000e+00]),
 array([0.18499951, 0.18499953, 0.18499954, 0.18499956, 0.18499957,
        0.18499959, 0.1849996 , 0.18499961, 0.18499963], dtype=float32))

In [52]:
test_predict_labels = [int(x>=0.18499956) for x in test_predict_logits]

print(classification_report(test_labels, test_predict_labels))

              precision    recall  f1-score   support

           0       0.12      0.65      0.20        46
           1       0.91      0.43      0.59       393

    accuracy                           0.46       439
   macro avg       0.52      0.54      0.39       439
weighted avg       0.83      0.46      0.55       439



In [53]:
model.save_pretrained("roberta-finetuned-V3/")

In [54]:
def f1_score(y, y_pred):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for actual_value, predicted_value in zip(y, y_pred):
        if predicted_value == actual_value:  # t?
            if predicted_value:  # tp
                tp += 1
            else:  # tn
                tn += 1
        else:  # f?
            if predicted_value:  # fp
                fp += 1
            else:  # fn
                fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    print('----------------------------------')
    print('                 Actual Value')
    print('----------------------------------')
    print(f'            Positive    Negative')
    print(f'Positive    {tp:^8}    {fp:^8}')
    print(f'Negative    {fn:^8}    {tn:^8}')
    print('----------------------------------')
    return f1


In [55]:
f1_score(test_labels, test_predict_labels)

----------------------------------
                 Actual Value
----------------------------------
            Positive    Negative
Positive      170          16   
Negative      223          30   
----------------------------------


0.5872193436960277