In [1]:
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
import numpy as np

# Load the training and development JSON datasets
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\dev\EXIST2024_dev.json", encoding='utf-8') as f:
    dev_data = json.load(f)
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\training\EXIST2024_training.json", encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the JSON data to DataFrames
def json_to_dataframe(json_data):
    records = []
    for key, value in json_data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task2': value['labels_task2'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_dataframe(train_data)
dev_df = json_to_dataframe(dev_data)

# Filter validation data
val_df = dev_df[dev_df['split'].str.contains('DEV')]

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)

# Possible labels for task2
possible_labels_task2 = ["DIRECT", "REPORTED", "JUDGEMENTAL", "-", "UNKNOWN"]

# Encode labels for task2
def encode_labels_task2(labels_list, possible_labels):
    encoded_labels = []
    for labels in labels_list:
        encoded_label = [0] * len(possible_labels)
        for label_set in labels:
            for label in label_set:
                if label in possible_labels:
                    encoded_label[possible_labels.index(label)] = 1
        encoded_labels.append(encoded_label)
    return encoded_labels

train_labels_task2 = encode_labels_task2(train_df['labels_task2'].tolist(), possible_labels_task2)
val_labels_task2 = encode_labels_task2(val_df['labels_task2'].tolist(), possible_labels_task2)

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels_task2)
val_dataset = HinglishDataset(val_encodings, val_labels_task2)

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid_logits = torch.sigmoid(torch.tensor(logits))
    predictions = (sigmoid_logits > 0.5).int().numpy()
    labels = np.array(labels)
    
    # Flatten the arrays
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='micro')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='micro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='micro')
    
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1": f1['f1']
    }

# Initialize the model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(possible_labels_task2))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task2',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_task2',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results for labels_task2: {val_results}")

# Separate block for testing accuracy
test_results = trainer.predict(val_dataset)
print(f"Testing Accuracy: {test_results.metrics['test_accuracy']}")
print(f"Testing Precision: {test_results.metrics['test_precision']}")
print(f"Testing Recall: {test_results.metrics['test_recall']}")
print(f"Testing F1 Score: {test_results.metrics['test_f1']}")


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.tr

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0588,0.088649,0.968593,0.968593,0.968593,0.968593
2,0.0661,0.086015,0.968593,0.968593,0.968593,0.968593
3,0.0649,0.088948,0.968593,0.968593,0.968593,0.968593
4,0.0391,0.118734,0.965896,0.965896,0.965896,0.965896
5,0.0097,0.140636,0.96185,0.96185,0.96185,0.96185
6,0.0086,0.179917,0.963198,0.963198,0.963198,0.963198
7,0.0011,0.185726,0.966089,0.966089,0.966089,0.966089
8,0.0026,0.188761,0.964933,0.964933,0.964933,0.964933


Validation Results for labels_task2: {'eval_loss': 0.18876051902770996, 'eval_accuracy': 0.9649325626204239, 'eval_precision': 0.9649325626204239, 'eval_recall': 0.9649325626204239, 'eval_f1': 0.9649325626204239, 'eval_runtime': 272.0498, 'eval_samples_per_second': 3.815, 'eval_steps_per_second': 0.239, 'epoch': 8.0}
Testing Accuracy: 0.9649325626204239
Testing Precision: 0.9649325626204239
Testing Recall: 0.9649325626204239
Testing F1 Score: 0.9649325626204239


In [2]:
# Save the model and tokenizer
model.save_pretrained('./saved_model_BERTtask2')
tokenizer.save_pretrained('./saved_tokenizer_BERTtask2')

('./saved_tokenizer_BERTtask2\\tokenizer_config.json',
 './saved_tokenizer_BERTtask2\\special_tokens_map.json',
 './saved_tokenizer_BERTtask2\\vocab.txt',
 './saved_tokenizer_BERTtask2\\added_tokens.json')

In [1]:
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
import numpy as np

# Load the JSON datasets
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\dev\EXIST2024_dev.json", encoding='utf-8') as f:
    dev_data = json.load(f)
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\training\EXIST2024_training.json", encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the JSON data to DataFrames
def json_to_dataframe(json_data):
    records = []
    for key, value in json_data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task2': value['labels_task2'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_dataframe(train_data)
dev_df = json_to_dataframe(dev_data)

# Combine the training and development data
combined_df = pd.concat([train_df, dev_df])

# Split the data into train and validation sets
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)

# Possible labels for task2
possible_labels_task2 = ["DIRECT", "REPORTED", "JUDGEMENTAL", "-", "UNKNOWN"]

# Encode labels for task2
def encode_labels_task2(labels_list, possible_labels):
    encoded_labels = []
    for labels in labels_list:
        encoded_label = [0] * len(possible_labels)
        for label_set in labels:
            for label in label_set:
                if label in possible_labels:
                    encoded_label[possible_labels.index(label)] = 1
        encoded_labels.append(encoded_label)
    return encoded_labels

train_labels_task2 = encode_labels_task2(train_df['labels_task2'].tolist(), possible_labels_task2)
val_labels_task2 = encode_labels_task2(val_df['labels_task2'].tolist(), possible_labels_task2)

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels_task2)
val_dataset = HinglishDataset(val_encodings, val_labels_task2)

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid_logits = torch.sigmoid(torch.tensor(logits))
    predictions = (sigmoid_logits > 0.5).int().numpy()
    labels = np.array(labels)
    
    # Flatten the arrays
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='micro')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='micro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='micro')
    
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1": f1['f1']
    }

# Initialize the model for multi-label classification
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(possible_labels_task2))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task2',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_task2',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results for labels_task2: {val_results}")

# Separate block for testing accuracy
test_results = trainer.predict(val_dataset)
print(f"Testing Accuracy: {test_results.metrics['test_accuracy']}")
print(f"Testing Precision: {test_results.metrics['test_precision']}")
print(f"Testing Recall: {test_results.metrics['test_recall']}")
print(f"Testing F1 Score: {test_results.metrics['test_f1']}")


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.tr

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0648,0.073208,0.975503,0.975503,0.975503,0.975503
2,0.0806,0.071421,0.975503,0.975503,0.975503,0.975503
3,0.0853,0.072429,0.975503,0.975503,0.975503,0.975503
4,0.0765,0.074621,0.975503,0.975503,0.975503,0.975503
5,0.0739,0.074784,0.975503,0.975503,0.975503,0.975503
6,0.0808,0.074481,0.975503,0.975503,0.975503,0.975503
7,0.0776,0.074454,0.975503,0.975503,0.975503,0.975503
8,0.0696,0.074504,0.975503,0.975503,0.975503,0.975503


Validation Results for labels_task2: {'eval_loss': 0.07450447231531143, 'eval_accuracy': 0.9755025125628141, 'eval_precision': 0.9755025125628141, 'eval_recall': 0.9755025125628141, 'eval_f1': 0.9755025125628141, 'eval_runtime': 377.65, 'eval_samples_per_second': 4.216, 'eval_steps_per_second': 0.265, 'epoch': 8.0}
Testing Accuracy: 0.9755025125628141
Testing Precision: 0.9755025125628141
Testing Recall: 0.9755025125628141
Testing F1 Score: 0.9755025125628141


In [2]:
# Save the model and tokenizer
model.save_pretrained('./saved_model_BERTtask2')
tokenizer.save_pretrained('./saved_tokenizer_BERTtask2')

('./saved_tokenizer_BERTtask2\\tokenizer_config.json',
 './saved_tokenizer_BERTtask2\\special_tokens_map.json',
 './saved_tokenizer_BERTtask2\\vocab.txt',
 './saved_tokenizer_BERTtask2\\added_tokens.json')