In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaTokenizerFast, TFAutoModelForSequenceClassification, DataCollatorWithPadding
import tensorflow as tf
from datasets import Dataset
from transformers import AutoConfig
from tensorflow.keras.losses import BinaryCrossentropy
from transformers import RobertaConfig

In [3]:
train_data_3 = pd.read_csv("data_v2/Aug_BackTranslation.csv")

from_french = train_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = train_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = train_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = train_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

train_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [4]:
val_data_3 = pd.read_csv("data_v2/Aug_Val_BackTranslation.csv")

from_french = val_data_3[["from_french", "sentiment"]].dropna()
from_french.rename(columns={"from_french": "reviewText"}, inplace=True)

from_italian = val_data_3[["from_italian", "sentiment"]].dropna()
from_italian.rename(columns={"from_italian": "reviewText"}, inplace=True)

from_german = val_data_3[["from_german", "sentiment"]].dropna()
from_german.rename(columns={"from_german": "reviewText"}, inplace=True)

from_chinese = val_data_3[["from_chinese", "sentiment"]].dropna()
from_chinese.rename(columns={"from_chinese": "reviewText"}, inplace=True)

val_data_3 = pd.concat([from_french, from_italian, from_german, from_chinese], ignore_index=True)

In [5]:
train_data_1 = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
train_data_2 = pd.read_csv("data_v2/Aug_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
train_data = pd.concat([train_data_1, train_data_2, train_data_3], ignore_index=True)

val_data_1 = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])
val_data_2 = pd.read_csv("data_v2/Aug_Val_RandomInsertion.csv", usecols=["reviewText", "sentiment"])
val_data = pd.concat([val_data_1, val_data_2, val_data_3], ignore_index=True)


test_data = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

In [6]:
val_data.shape

(721, 2)

In [7]:
train_data.shape

(5748, 2)

In [8]:
train_data.isna().sum()

reviewText    1
sentiment     0
dtype: int64

In [9]:
train_data.dropna(inplace=True)
train_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

val_data.dropna(inplace=True)
val_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

test_data.dropna(inplace=True)
test_data.rename(columns={"reviewText": "text", 
                           "sentiment": "labels"}, 
                 inplace=True)

In [10]:
train_data.shape

(5747, 2)

In [11]:
train_data_ds = Dataset.from_pandas(train_data)
val_data_ds = Dataset.from_pandas(val_data)
test_data_ds = Dataset.from_pandas(test_data)


In [12]:

checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)


In [21]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


train_datasets = train_data_ds.map(tokenize_function, batched=True)
val_datasets = val_data_ds.map(tokenize_function, batched=True)
test_datasets = test_data_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


tf_train_dataset = train_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset = tf_train_dataset.cache()

tf_validation_dataset = val_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=16,
)
tf_validation_dataset = tf_validation_dataset.cache()

tf_test_dataset = test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
# bert_encoder = TFRobertaModel.from_pretrained(checkpoint, trainable=True)

In [24]:
# model = Classifier(bert_encoder)

In [25]:
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# total = len(data)
# data = pd.read_csv("data/train_data.csv", usecols=["sentiment"])
# classes = data["sentiment"].value_counts().to_dict()
# total = len(data)
# weight_for_0 = (1 / classes[0]) * (total / 2.0)
# weight_for_1 = (1 / classes[1]) * (total / 2.0)

# class_weight = {0: weight_for_0, 1: weight_for_1}

In [26]:
checkpoint_filepath = 'classifier_checkpoint/'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model_checkpoint_earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=3)

In [27]:
model.compile(
    optimizer="adam",
    loss=BinaryCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    callbacks=[model_checkpoint_callback, model_checkpoint_earlyStopping],
    epochs=10
)

Epoch 1/10


2022-12-18 16:49:59.413713: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-12-18 17:05:25.371746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x2e654bee0>

In [28]:
model.evaluate(tf_test_dataset)



[0.5329323410987854, 0.10478360205888748]

In [44]:
from sklearn.metrics import f1_score, precision_recall_curve

In [45]:
train_data_for_pred = pd.read_csv("data_v2/train_data.csv", usecols=["reviewText", "sentiment"])
val_data_for_pred = pd.read_csv("data_v2/validation_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred = pd.concat([train_data_for_pred, val_data_for_pred], ignore_index=True)

test_data_for_pred = pd.read_csv("data_v2/test_data.csv", usecols=["reviewText", "sentiment"])

train_data_for_pred.dropna(inplace=True)
train_data_for_pred.rename(columns={"reviewText": "text", 
                                    "sentiment": "labels"}, 
                           inplace=True)


test_data_for_pred.dropna(inplace=True)
test_data_for_pred.rename(columns={"reviewText": "text", 
                                   "sentiment": "labels"}, 
                         inplace=True)


train_data_for_pred_ds = Dataset.from_pandas(train_data_for_pred)
test_data_for_pred_ds = Dataset.from_pandas(test_data_for_pred)

train_datasets_for_pred = train_data_for_pred_ds.map(tokenize_function, batched=True)
test_datasets_for_pred = test_data_for_pred_ds.map(tokenize_function, batched=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")



tf_train_dataset_for_pred = train_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)
tf_train_dataset_for_pred = tf_train_dataset_for_pred.cache()


tf_test_dataset_for_pred = test_datasets_for_pred.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=16,
)
tf_test_dataset_for_pred = tf_test_dataset_for_pred.cache()

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [46]:
train_predict = model.predict(tf_train_dataset_for_pred)
test_predict = model.predict(tf_test_dataset_for_pred)

In [47]:
train_predict_logits = train_predict.logits.ravel()
test_predict_logits = test_predict.logits.ravel()


In [52]:
train_labels = train_data_for_pred["labels"].values

In [53]:
test_labels = test_data_for_pred["labels"].values

In [54]:
precision, recall, thresholds = precision_recall_curve(train_labels, train_predict_logits)

# convert to f score
fscore = (2 * precision * recall) / (precision + recall+0.000000001)
ix = np.argmax(fscore)
print('Best Threshold=%f, Training F-Score=%.3f' % (thresholds[ix], fscore[ix]))

Best Threshold=0.476540, Training F-Score=0.944


In [79]:
precision, recall, thresholds

(array([0.89361702, 0.89357379, 0.89222193, 0.88950073, 0.89281642,
        0.88333333, 0.85714286, 1.        ]),
 array([1.        , 0.99716553, 0.9526644 , 0.68679138, 0.44387755,
        0.03004535, 0.00680272, 0.        ]),
 array([0.47653988, 0.4765399 , 0.47653994, 0.47653997, 0.47654   ,
        0.47654003, 0.47654006], dtype=float32))

In [80]:
ix

0

In [101]:
train_predict_labels = [int(x>=0.47653997) for x in train_predict_logits]

print(classification_report(train_labels, train_predict_labels))

              precision    recall  f1-score   support

           0       0.11      0.55      0.18       420
           1       0.89      0.44      0.59      3528

    accuracy                           0.46      3948
   macro avg       0.50      0.50      0.39      3948
weighted avg       0.81      0.46      0.55      3948



In [78]:
from sklearn.metrics import classification_report

In [102]:
test_predict_labels = [int(x>=0.47653997) for x in test_predict_logits]

print(classification_report(test_labels, test_predict_labels))

              precision    recall  f1-score   support

           0       0.09      0.46      0.15        46
           1       0.88      0.45      0.59       393

    accuracy                           0.45       439
   macro avg       0.48      0.45      0.37       439
weighted avg       0.79      0.45      0.55       439



In [96]:
model.save_pretrained("roberta-finetuned-V2/")

In [103]:
def f1_score(y, y_pred):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for actual_value, predicted_value in zip(y, y_pred):
        if predicted_value == actual_value:  # t?
            if predicted_value:  # tp
                tp += 1
            else:  # tn
                tn += 1
        else:  # f?
            if predicted_value:  # fp
                fp += 1
            else:  # fn
                fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    print('----------------------------------')
    print('                 Actual Value')
    print('----------------------------------')
    print(f'            Positive    Negative')
    print(f'Positive    {tp:^8}    {fp:^8}')
    print(f'Negative    {fn:^8}    {tn:^8}')
    print('----------------------------------')
    return f1


In [104]:
f1_score(test_labels, test_predict_labels)

----------------------------------
                 Actual Value
----------------------------------
            Positive    Negative
Positive      177          25   
Negative      216          21   
----------------------------------


0.5949579831932773