In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification

from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("./../inp/gujju/train.csv")
test_data = pd.read_csv("./../inp/gujju/test.csv")

In [3]:
id2label = {0: "NOT", 1: "HOF"}
label2id = {"NOT": 0, "HOF": 1}

In [4]:
def data_clean(data_df):
    # Removing Unwanted Columns
    data_df.drop(["tweet_id", "created_at", "user_screen_name"], axis=1, inplace=True)
    
    # Removing @tags 
    pattern = r'@\w+'
    data_df["text"] = data_df["text"].apply(lambda x: re.sub(pattern, '', x))
    
    # Transforming Categorical Values to Numericals
    data_df["labels"] = data_df["label"].apply(lambda x: [label2id[x]])
    
    # Dropping label column
    data_df.drop("label", axis=1, inplace=True)
    
    return data_df

In [5]:
# Cleaning and Preparing Test Data
test_id = test_data["tweet_id"]
test_data.drop("tweet_id", axis=1, inplace=True)
pattern = r'@\w+'
test_data["text"] = test_data["text"].apply(lambda x: re.sub(pattern, '', x))

In [6]:
model_name = "l3cube-pune/gujarati-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name, from_pt=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

In [7]:
def tokenize_examples(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, max_length=True)
    return tokenized_inputs

def create_model(model_name, optimizer):
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        from_pt=True
    )
    
    model.compile(
        optimizer=optimizer,
        metrics=[tf.keras.metrics.binary_crossentropy]
    )
    
    model.summary()
    return model

In [8]:
test = Dataset.from_pandas(test_data)
test_tokenized = test.map(tokenize_examples, batched=True)

Map: 100%|██████████| 1196/1196 [00:00<00:00, 9605.96 examples/s]


In [9]:
fold_path = "./../inp/gujju/folds/2023/"
dirs = os.listdir(fold_path)

oof_preds = np.zeros((data.shape[0],))
test_preds = np.zeros((test_data.shape[0], 2))

for dir_name in dirs:
    dir_path = os.path.join(fold_path, dir_name)

    # Defining the Train and Val paths 
    train_df = pd.read_csv(os.path.join(dir_path, 'train.csv'))
    val_df = pd.read_csv(os.path.join(dir_path, 'val.csv'))
    
    # Cleaning and Prepareing the Data
    train_clean = data_clean(train_df)
    val_clean = data_clean(val_df)
    
    # Converting to HuggingFace Datasets
    train_ds = Dataset.from_pandas(train_df)
    val_ds = Dataset.from_pandas(val_df)
    
    # Tokenize the Data    
    train_tokenized = train_ds.map(tokenize_examples, batched=True)
    val_tokenized = val_ds.map(tokenize_examples, batched=True)
    
    # Defining the Parameters for Training
    batch_size = 8
    num_epochs = 10
    batches_per_epoch = len(train_tokenized) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
    
    # Define the Model 
    model = create_model(model_name, optimizer)
    
    # Converting to Tf Dataset for training
    train_set = model.prepare_tf_dataset(
        train_tokenized,
        shuffle=True,
        batch_size=8,
        collate_fn=data_collator,
    )

    validation_set = model.prepare_tf_dataset(
        val_tokenized,
        shuffle=False,
        batch_size=8,
        collate_fn=data_collator,
    )
    
    test_set = model.prepare_tf_dataset(
        test_tokenized,
        shuffle=False,
        batch_size=8,
        collate_fn=data_collator,
    )
    
    # Define Model
    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=3,
        verbose=1,
        mode='auto',
        baseline=None,
        restore_best_weights=False,
        start_from_epoch=0
    )
    
    history = model.fit(x=train_set,
              validation_data=validation_set,
              epochs=15,
              callbacks=[es])

    logits = model.predict(validation_set).logits
    y_preds = tf.argmax(tf.nn.sigmoid(logits), axis=1).numpy()
    oof_preds[val_df['index'].values] += y_preds

    # predict on test
    test_logits = model.predict(test_set).logits
    test_y_preds = tf.nn.sigmoid(test_logits)
    test_preds += test_y_preds/5
    

Map: 100%|██████████| 160/160 [00:00<00:00, 24648.82 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 12678.32 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bi

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 237557762 (906.21 MB)
Trainable params: 237557762 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 11: early stopping


Map: 100%|██████████| 160/160 [00:00<00:00, 6851.41 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 10663.71 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bia

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 237557762 (906.21 MB)
Trainable params: 237557762 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 13: early stopping


Map: 100%|██████████| 160/160 [00:00<00:00, 6088.96 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 6829.17 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias

Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_113 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 237557762 (906.21 MB)
Trainable params: 237557762 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 10: early stopping


Map: 100%|██████████| 160/160 [00:00<00:00, 14780.39 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 8512.03 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bia

Model: "tf_bert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_151 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 237557762 (906.21 MB)
Trainable params: 237557762 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 4: early stopping


Map: 100%|██████████| 160/160 [00:00<00:00, 5588.17 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 7259.72 examples/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias

Model: "tf_bert_for_sequence_classification_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  237556224 
                                                                 
 dropout_189 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 237557762 (906.21 MB)
Trainable params: 237557762 (906.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 11: early stopping


In [10]:
y_true = data['label'].map(label2id)
print(classification_report(y_true, oof_preds))

              precision    recall  f1-score   support

           0       0.60      0.67      0.63       100
           1       0.62      0.55      0.59       100

    accuracy                           0.61       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.61      0.61      0.61       200



In [11]:
y_preds_test = tf.argmax(test_preds, axis=1).numpy()
test_data['label'] = y_preds_test
test_data['label'] = test_data['label'].map(id2label)
test_data['label'].value_counts()

label
NOT    718
HOF    478
Name: count, dtype: int64

In [12]:
test_data['id'] = test_id
test_data[['id', 'label']].head()

Unnamed: 0,id,label
0,2023_test_main_tweet_1,NOT
1,2023_test_main_tweet_2,HOF
2,2023_test_main_tweet_3,HOF
3,2023_test_main_tweet_4,NOT
4,2023_test_main_tweet_5,HOF


In [13]:
test_data[['id', 'label']].to_csv('./../out/guj_bert_baseline.csv', index=False)