In [6]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
import numpy as np

# Load the JSON datasets
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\dev\EXIST2024_dev.json", encoding='utf-8') as f:
    dev_data = json.load(f)
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\training\EXIST2024_training.json", encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the JSON data to DataFrames
def json_to_dataframe(json_data):
    records = []
    for key, value in json_data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task3': value['labels_task3'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_dataframe(train_data)
dev_df = json_to_dataframe(dev_data)

# Combine the training and development data
combined_df = pd.concat([train_df, dev_df])

# Split the data into train and validation sets
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/LaBSE')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)

# Possible labels for task3
possible_labels_task3 = ["IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE", "OBJECTIFICATION", "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE", "-", "UNKNOWN"]

# Encode labels for task3
def encode_labels_task3(labels_list, possible_labels):
    encoded_labels = []
    for labels in labels_list:
        encoded_label = [0] * len(possible_labels)
        for label_set in labels:
            for label in label_set:
                if label in possible_labels:
                    encoded_label[possible_labels.index(label)] = 1
        encoded_labels.append(encoded_label)
    return encoded_labels

train_labels_task3 = encode_labels_task3(train_df['labels_task3'].tolist(), possible_labels_task3)
val_labels_task3 = encode_labels_task3(val_df['labels_task3'].tolist(), possible_labels_task3)

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels_task3)
val_dataset = HinglishDataset(val_encodings, val_labels_task3)

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid_logits = torch.sigmoid(torch.tensor(logits))
    predictions = (sigmoid_logits > 0.5).int().numpy()
    labels = np.array(labels)
    
    # Flatten the arrays
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='micro')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='micro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='micro')
    
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1": f1['f1']
    }

# Initialize the model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/LaBSE', num_labels=len(possible_labels_task3))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task3',
    num_train_epochs=10,  # Increase the number of epochs
    per_device_train_batch_size=8,  # Decrease batch size for potentially better generalization
    per_device_eval_batch_size=8,
    warmup_steps=200,  # Increase warmup steps
    learning_rate=2e-5,  # Adjust learning rate
    weight_decay=0.01,
    logging_dir='./logs_task3',
    logging_steps=10,
    evaluation_strategy="epoch",  # Ensure both strategies match
    save_strategy="epoch",  # Ensure both strategies match
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate a larger batch size
    save_total_limit=2,  # Save only the best models to avoid disk space issues
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_f1",  # Use F1 score to determine the best model
    greater_is_better=True  # Maximize the F1 score
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results for labels_task3: {val_results}")

# Separate block for testing accuracy
test_results = trainer.predict(val_dataset)
print(f"Testing Accuracy: {test_results.metrics['test_accuracy']}")
print(f"Testing Precision: {test_results.metrics['test_precision']}")
print(f"Testing Recall: {test_results.metrics['test_recall']}")
print(f"Testing F1 Score: {test_results.metrics['test_f1']}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Using the latest cached version of the module from C:\Users\KIIT\.cache\huggingface\modules\datasets_modules\metrics\accuracy\9756d5fa4a0f9da966341741fc3926eafdc604b8276add51d5abbaa8958a25f9 (last modified on Sun Jun  2 12:05:01 2024) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Using the latest cached version of the module from C:\Users\KIIT\.cache\huggingface\modules\datasets_modules\metrics\precision\26faf6607f5f6fa666ded33d9e7aa1e8818a9cc6f423514adad4623641d8751c (last modified on Mon Jun 17 12:07:47 2024) since it couldn't be found locally at prec

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.461,0.43447,0.791098,0.791098,0.791098,0.791098
2,0.3943,0.425453,0.795316,0.795316,0.795316,0.795316
3,0.3994,0.421134,0.802764,0.802764,0.802764,0.802764
4,0.343,0.436589,0.801687,0.801687,0.801687,0.801687
5,0.2724,0.457387,0.7972,0.7972,0.7972,0.7972
6,0.2537,0.479779,0.796213,0.796213,0.796213,0.796213
7,0.2284,0.50145,0.798816,0.798816,0.798816,0.798816
8,0.2053,0.519893,0.794508,0.794508,0.794508,0.794508
9,0.1686,0.532571,0.793252,0.793252,0.793252,0.793252
10,0.1634,0.540074,0.792265,0.792265,0.792265,0.792265


Validation Results for labels_task3: {'eval_loss': 0.4211338758468628, 'eval_accuracy': 0.8027638190954773, 'eval_precision': 0.8027638190954773, 'eval_recall': 0.8027638190954773, 'eval_f1': 0.8027638190954773, 'eval_runtime': 336.9649, 'eval_samples_per_second': 4.725, 'eval_steps_per_second': 0.591, 'epoch': 10.0}
Testing Accuracy: 0.8027638190954773
Testing Precision: 0.8027638190954773
Testing Recall: 0.8027638190954773
Testing F1 Score: 0.8027638190954773


In [7]:
# Save the model and tokenizer
model.save_pretrained('./saved_model_LaBSEtask3')
tokenizer.save_pretrained('./saved_tokenizer_LaBSEtask3')

('./saved_tokenizer_LaBSEtask3\\tokenizer_config.json',
 './saved_tokenizer_LaBSEtask3\\special_tokens_map.json',
 './saved_tokenizer_LaBSEtask3\\vocab.txt',
 './saved_tokenizer_LaBSEtask3\\added_tokens.json',
 './saved_tokenizer_LaBSEtask3\\tokenizer.json')

In [4]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
import numpy as np

# Load the training and development JSON datasets
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\dev\EXIST2024_dev.json", encoding='utf-8') as f:
    dev_data = json.load(f)
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\training\EXIST2024_training.json", encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the JSON data to DataFrames
def json_to_dataframe(json_data):
    records = []
    for key, value in json_data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task3': value['labels_task3'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_dataframe(train_data)
dev_df = json_to_dataframe(dev_data)

# Filter validation data
val_df = dev_df[dev_df['split'].str.contains('DEV')]

# Initialize tokenizer for LaBSE
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/LaBSE')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)

# Possible labels for task3
possible_labels_task3 = ["IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE", "OBJECTIFICATION",
                         "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE", "-", "UNKNOWN"]

# Encode labels for task3
def encode_labels_task3(labels_list, possible_labels):
    encoded_labels = []
    for labels in labels_list:
        encoded_label = [0] * len(possible_labels)
        for label_set in labels:
            for label in label_set:
                if label in possible_labels:
                    encoded_label[possible_labels.index(label)] = 1
        encoded_labels.append(encoded_label)
    return encoded_labels

train_labels_task3 = encode_labels_task3(train_df['labels_task3'].tolist(), possible_labels_task3)
val_labels_task3 = encode_labels_task3(val_df['labels_task3'].tolist(), possible_labels_task3)

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels_task3)
val_dataset = HinglishDataset(val_encodings, val_labels_task3)

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid_logits = torch.sigmoid(torch.tensor(logits))
    predictions = (sigmoid_logits > 0.5).int().numpy()
    labels = np.array(labels)
    
    # Flatten the arrays
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='micro')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='micro')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='micro')
    
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1": f1['f1']
    }

# Initialize the model for multi-label classification using LaBSE
model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/LaBSE', num_labels=len(possible_labels_task3))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task3',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_task3',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results for labels_task3: {val_results}")

# Separate block for testing accuracy
test_results = trainer.predict(val_dataset)
print(f"Testing Accuracy: {test_results.metrics['test_accuracy']}")
print(f"Testing Precision: {test_results.metrics['test_precision']}")
print(f"Testing Recall: {test_results.metrics['test_recall']}")
print(f"Testing F1 Score: {test_results.metrics['test_f1']}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.weight', 'classifier.bias']
You 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4544,0.454478,0.780897,0.780897,0.780897,0.780897
2,0.3728,0.430831,0.797275,0.797275,0.797275,0.797275
3,0.328,0.460921,0.792045,0.792045,0.792045,0.792045
4,0.2776,0.492098,0.783237,0.783237,0.783237,0.783237
5,0.179,0.567896,0.782136,0.782136,0.782136,0.782136
6,0.1154,0.635688,0.779383,0.779383,0.779383,0.779383
7,0.0696,0.692855,0.774979,0.774979,0.774979,0.774979
8,0.0561,0.707159,0.778695,0.778695,0.778695,0.778695


Validation Results for labels_task3: {'eval_loss': 0.7071593403816223, 'eval_accuracy': 0.7786952931461602, 'eval_precision': 0.7786952931461602, 'eval_recall': 0.7786952931461602, 'eval_f1': 0.77869529314616, 'eval_runtime': 207.5172, 'eval_samples_per_second': 5.002, 'eval_steps_per_second': 0.313, 'epoch': 8.0}
Testing Accuracy: 0.7786952931461602
Testing Precision: 0.7786952931461602
Testing Recall: 0.7786952931461602
Testing F1 Score: 0.77869529314616
