In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from transformers import TFAutoModelForSequenceClassification

from sklearn.metrics import classification_report, f1_score

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("./../inp/sinhala/train.csv")
test_data = pd.read_csv("./../inp/sinhala/test.csv")

In [3]:
id2label = {0: "NOT", 1: "HOF"}
label2id = {"NOT": 0, "HOF": 1}

In [4]:
def data_clean(data_df):
    # Removing Unwanted Columns
    data_df.drop(["post_id"], axis=1, inplace=True)
    
    # Removing @tags 
    pattern = r'@\w+'
    data_df["text"] = data_df["text"].apply(lambda x: re.sub(pattern, '', x))
    
    # Transforming Categorical Values to Numericals
    data_df["labels"] = data_df["label"].apply(lambda x: [label2id[x]])
    
    # Dropping label column
    data_df.drop("label", axis=1, inplace=True)
    
    return data_df

In [5]:
# Cleaning and Preparing Test Data
test_id = test_data["post_id"]
test_data.drop("post_id", axis=1, inplace=True)
pattern = r'@\w+'
test_data["text"] = test_data["text"].apply(lambda x: re.sub(pattern, '', x))

In [6]:
model_name = "keshan/SinhalaBERTo"
tokenizer = AutoTokenizer.from_pretrained(model_name, from_pt=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

Downloading (…)lve/main/config.json: 100%|██████████| 551/551 [00:00<00:00, 783kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.08M/1.08M [00:00<00:00, 1.28MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 721k/721k [00:00<00:00, 819kB/s]


In [7]:
def tokenize_examples(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, max_length=True)
    return tokenized_inputs

def create_model(model_name, optimizer):
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        from_pt=True
    )
    
    model.compile(
        optimizer=optimizer,
        metrics=[tf.keras.metrics.binary_crossentropy]
    )
    
    model.summary()
    return model

In [8]:
test = Dataset.from_pandas(test_data)
test_tokenized = test.map(tokenize_examples, batched=True)

Map: 100%|██████████| 2500/2500 [00:00<00:00, 22684.57 examples/s]


In [9]:
fold_path = "./../inp/sinhala/folds/2023/"
dirs = os.listdir(fold_path)

oof_preds = np.zeros((data.shape[0],))
test_preds = np.zeros((test_data.shape[0], 2))

for dir_name in dirs:
    dir_path = os.path.join(fold_path, dir_name)

    # Defining the Train and Val paths 
    train_df = pd.read_csv(os.path.join(dir_path, 'train.csv'))
    val_df = pd.read_csv(os.path.join(dir_path, 'val.csv'))
    
    # Cleaning and Prepareing the Data
    train_clean = data_clean(train_df)
    val_clean = data_clean(val_df)
    
    # Converting to HuggingFace Datasets
    train_ds = Dataset.from_pandas(train_df)
    val_ds = Dataset.from_pandas(val_df)
    
    # Tokenize the Data    
    train_tokenized = train_ds.map(tokenize_examples, batched=True)
    val_tokenized = val_ds.map(tokenize_examples, batched=True)
    
    # Defining the Parameters for Training
    batch_size = 8
    num_epochs = 10
    batches_per_epoch = len(train_tokenized) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    
    # Define the Model 
    model = create_model(model_name, optimizer)
    
    # Converting to Tf Dataset for training
    train_set = model.prepare_tf_dataset(
        train_tokenized,
        shuffle=True,
        batch_size=8,
        collate_fn=data_collator,
    )

    validation_set = model.prepare_tf_dataset(
        val_tokenized,
        shuffle=False,
        batch_size=8,
        collate_fn=data_collator,
    )
    
    test_set = model.prepare_tf_dataset(
        test_tokenized,
        shuffle=False,
        batch_size=8,
        collate_fn=data_collator,
    )
    
    # Define Model
    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=3,
        verbose=1,
        mode='auto',
        baseline=None,
        restore_best_weights=False,
        start_from_epoch=0
    )
    
    history = model.fit(x=train_set,
              validation_data=validation_set,
              epochs=15,
              callbacks=[es])

    logits = model.predict(validation_set).logits
    y_preds = tf.argmax(tf.nn.sigmoid(logits), axis=1).numpy()
    oof_preds[val_df['index'].values] += y_preds

    # predict on test
    test_logits = model.predict(test_set).logits
    test_y_preds = tf.nn.sigmoid(test_logits)
    test_preds += test_y_preds/5
    

Map: 100%|██████████| 6000/6000 [00:00<00:00, 25644.43 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 26240.75 examples/s]
Downloading pytorch_model.bin: 100%|██████████| 334M/334M [00:16<00:00, 19.7MB/s] 
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  82860288  
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 83452418 (318.35 MB)
Trainable params: 83452418 (318.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 4: early stopping


Map: 100%|██████████| 6000/6000 [00:00<00:00, 19289.31 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 20566.56 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_

Model: "tf_roberta_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  82860288  
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 83452418 (318.35 MB)
Trainable params: 83452418 (318.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 4: early stopping


Map: 100%|██████████| 6000/6000 [00:00<00:00, 19267.63 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 20527.64 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_

Model: "tf_roberta_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  82860288  
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 83452418 (318.35 MB)
Trainable params: 83452418 (318.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 4: early stopping


Map: 100%|██████████| 6000/6000 [00:00<00:00, 20207.86 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 19974.72 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_

Model: "tf_roberta_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  82860288  
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 83452418 (318.35 MB)
Trainable params: 83452418 (318.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 5: early stopping


Map: 100%|██████████| 6000/6000 [00:00<00:00, 20089.01 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 20922.00 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_

Model: "tf_roberta_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  82860288  
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 83452418 (318.35 MB)
Trainable params: 83452418 (318.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 5: early stopping


In [10]:
y_true = data['label'].map(label2id)
print(classification_report(y_true, oof_preds))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82      4324
           1       0.75      0.79      0.77      3176

    accuracy                           0.80      7500
   macro avg       0.80      0.80      0.80      7500
weighted avg       0.80      0.80      0.80      7500



In [11]:
y_preds_test = tf.argmax(test_preds, axis=1).numpy()
test_data['label'] = y_preds_test
test_data['label'] = test_data['label'].map(id2label)
test_data['label'].value_counts()

label
NOT    1447
HOF    1053
Name: count, dtype: int64

In [12]:
test_data['id'] = test_id
test_data[['id', 'label']].head()

Unnamed: 0,id,label
0,task1a_test_1,NOT
1,task1a_test_2,HOF
2,task1a_test_3,HOF
3,task1a_test_4,NOT
4,task1a_test_5,HOF


In [13]:
test_data[['id', 'label']].to_csv('./../out/sinhala_bert_baseline.csv', index=False)