In [3]:
!pip install tokenizers==0.13.2

Collecting tokenizers==0.13.2
  Downloading tokenizers-0.13.2-cp39-cp39-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 2.0 MB/s eta 0:00:00
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
Successfully installed tokenizers-0.13.2




In [4]:
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

# Load the training and development JSON datasets
with open("C:\\Users\\KIIT\\Desktop\\CSV files\\EXIST 2024 Tweets Dataset\\training\\EXIST2024_training.json", 'r', encoding='utf-8') as file:
    train_data = json.load(file)

with open("C:\\Users\\KIIT\\Desktop\\CSV files\\EXIST 2024 Tweets Dataset\\dev\\EXIST2024_dev.json", 'r', encoding='utf-8') as file:
    dev_data = json.load(file)


In [5]:
# Convert the JSON data to DataFrames
def json_to_df(data):
    records = []
    for key, value in data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task1': value['labels_task1'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_df(train_data)
dev_df = json_to_df(dev_data)

In [6]:
# Split the DEV dataset into validation and test sets
val_df, test_df = train_test_split(dev_df, test_size=0.5, random_state=42)

In [7]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)
test_encodings = encode_data(tokenizer, test_df)



In [8]:
# Encode labels for task1
def encode_labels(df):
    return [1 if label == 'YES' else 0 for label in df['labels_task1'].apply(lambda x: x[0]).tolist()]

train_labels = encode_labels(train_df)
val_labels = encode_labels(val_df)
test_labels = encode_labels(test_df)

In [9]:
# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels)
val_dataset = HinglishDataset(val_encodings, val_labels)
test_dataset = HinglishDataset(test_encodings, test_labels)

In [11]:
# Load the accuracy metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to 

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task1',
    num_train_epochs=5,  # Increase the number of epochs if necessary
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,  # Adjust the learning rate
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_task1',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_training_steps = len(train_dataset) * training_args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)

trainer.optimizer = optimizer
trainer.lr_scheduler = lr_scheduler

In [14]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5877,0.639413,0.660886
2,0.6003,0.62083,0.691715
3,0.4923,0.64408,0.701349
4,0.4077,0.752499,0.674374
5,0.2706,0.932545,0.672447


TrainOutput(global_step=2165, training_loss=0.468224351015157, metrics={'train_runtime': 29028.0497, 'train_samples_per_second': 1.192, 'train_steps_per_second': 0.075, 'total_flos': 2275910628864000.0, 'train_loss': 0.468224351015157, 'epoch': 5.0})

In [15]:
# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Accuracy for labels_task1: {val_results['eval_accuracy']}")

# Make predictions on the validation set
val_predictions = trainer.predict(val_dataset)

# Get the logits and predicted labels for validation set
val_logits = val_predictions.predictions
val_pred_labels = val_logits.argmax(axis=-1)

# Compute metrics for validation set
val_f1 = f1_score(val_labels, val_pred_labels, average='weighted')
val_precision = precision_score(val_labels, val_pred_labels, average='weighted')
val_recall = recall_score(val_labels, val_pred_labels, average='weighted')

print(f"Validation F1 Score for labels_task1: {val_f1:.2f}")
print(f"Validation Precision for labels_task1: {val_precision:.2f}")
print(f"Validation Recall for labels_task1: {val_recall:.2f}")

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Accuracy for labels_task1: {test_results['eval_accuracy']}")

# Make predictions on the test set
test_predictions = trainer.predict(test_dataset)

# Get the logits and predicted labels for test set
test_logits = test_predictions.predictions
test_pred_labels = test_logits.argmax(axis=-1)

# Compute metrics for test set
test_f1 = f1_score(test_labels, test_pred_labels, average='weighted')
test_precision = precision_score(test_labels, test_pred_labels, average='weighted')
test_recall = recall_score(test_labels, test_pred_labels, average='weighted')

print(f"Test F1 Score for labels_task1: {test_f1:.2f}")
print(f"Test Precision for labels_task1: {test_precision:.2f}")
print(f"Test Recall for labels_task1: {test_recall:.2f}")

Validation Accuracy for labels_task1: 0.6917148362235067
Validation F1 Score for labels_task1: 0.69
Validation Precision for labels_task1: 0.70
Validation Recall for labels_task1: 0.69


Test Accuracy for labels_task1: 0.7148362235067437
Test F1 Score for labels_task1: 0.71
Test Precision for labels_task1: 0.72
Test Recall for labels_task1: 0.71


In [18]:
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

# Load the training and development JSON datasets
with open("C:\\Users\\KIIT\\Desktop\\CSV files\\EXIST 2024 Tweets Dataset\\training\\EXIST2024_training.json", 'r', encoding='utf-8') as file:
    train_data = json.load(file)

with open("C:\\Users\\KIIT\\Desktop\\CSV files\\EXIST 2024 Tweets Dataset\\dev\\EXIST2024_dev.json", 'r', encoding='utf-8') as file:
    dev_data = json.load(file)

# Convert the JSON data to DataFrames
def json_to_df(data):
    records = []
    for key, value in data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task1': value['labels_task1'],
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_df(train_data)
dev_df = json_to_df(dev_data)

# Split the DEV dataset into validation and test sets
val_df, test_df = train_test_split(dev_df, test_size=0.5, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)
test_encodings = encode_data(tokenizer, test_df)

# Encode labels for task1
def encode_labels(df):
    return [1 if label == 'YES' else 0 for label in df['labels_task1'].apply(lambda x: x[0]).tolist()]

train_labels = encode_labels(train_df)
val_labels = encode_labels(val_df)
test_labels = encode_labels(test_df)

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels)
val_dataset = HinglishDataset(val_encodings, val_labels)
test_dataset = HinglishDataset(test_encodings, test_labels)

# Load the accuracy metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize the model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task1',
    num_train_epochs=10,  # Increase the number of epochs
    per_device_train_batch_size=8,  # Adjust batch size
    per_device_eval_batch_size=16,
    learning_rate=2e-5,  # Adjust the learning rate
    warmup_steps=1000,  # Adjust warmup steps
    weight_decay=0.01,
    logging_dir='./logs_task1',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Accuracy for labels_task1: {val_results['eval_accuracy']}")

# Make predictions on the validation set
val_predictions = trainer.predict(val_dataset)

# Get the logits and predicted labels for validation set
val_logits = val_predictions.predictions
val_pred_labels = val_logits.argmax(axis=-1)

# Compute metrics for validation set
val_f1 = f1_score(val_labels, val_pred_labels, average='weighted')
val_precision = precision_score(val_labels, val_pred_labels, average='weighted')
val_recall = recall_score(val_labels, val_pred_labels, average='weighted')

print(f"Validation F1 Score for labels_task1: {val_f1:.2f}")
print(f"Validation Precision for labels_task1: {val_precision:.2f}")
print(f"Validation Recall for labels_task1: {val_recall:.2f}")

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Accuracy for labels_task1: {test_results['eval_accuracy']}")

# Make predictions on the test set
test_predictions = trainer.predict(test_dataset)

# Get the logits and predicted labels for test set
test_logits = test_predictions.predictions
test_pred_labels = test_logits.argmax(axis=-1)

# Compute metrics for test set
test_f1 = f1_score(test_labels, test_pred_labels, average='weighted')
test_precision = precision_score(test_labels, test_pred_labels, average='weighted')
test_recall = recall_score(test_labels, test_pred_labels, average='weighted')

print(f"Test F1 Score for labels_task1: {test_f1:.2f}")
print(f"Test Precision for labels_task1: {test_precision:.2f}")
print(f"Test Recall for labels_task1: {test_recall:.2f}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing 

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6579,0.62868,0.651252
2,0.58,0.584823,0.699422
3,0.6297,0.617836,0.712909
4,0.4019,1.092967,0.66474
5,0.2972,1.806696,0.628131
6,0.1631,1.789858,0.685934
7,0.0056,1.982192,0.693642
8,0.0003,2.360038,0.685934
9,0.1043,2.628808,0.66474
10,0.0001,2.547919,0.691715


Validation Accuracy for labels_task1: 0.6994219653179191
Validation F1 Score for labels_task1: 0.70
Validation Precision for labels_task1: 0.70
Validation Recall for labels_task1: 0.70


Test Accuracy for labels_task1: 0.7129094412331407
Test F1 Score for labels_task1: 0.71
Test Precision for labels_task1: 0.71
Test Recall for labels_task1: 0.71


In [19]:
# Define the directory where you want to save the model and tokenizer
output_dir = './trained_model_task1'

# Save the trained model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)


('./trained_model_task1\\tokenizer_config.json',
 './trained_model_task1\\special_tokens_map.json',
 './trained_model_task1\\vocab.txt',
 './trained_model_task1\\added_tokens.json')

In [None]:
# from transformers import BertTokenizer, BertForSequenceClassification

# # Load the saved model and tokenizer
# model = BertForSequenceClassification.from_pretrained(output_dir)
# tokenizer = BertTokenizer.from_pretrained(output_dir)

In [1]:
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
import numpy as np

# Load the JSON datasets
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\dev\EXIST2024_dev.json", encoding='utf-8') as f:
    dev_data = json.load(f)
with open(r"C:\Users\KIIT\Desktop\CSV files\EXIST 2024 Tweets Dataset\training\EXIST2024_training.json", encoding='utf-8') as f:
    train_data = json.load(f)

# Convert the JSON data to DataFrames
def json_to_dataframe(json_data):
    records = []
    for key, value in json_data.items():
        record = {
            'id': value['id_EXIST'],
            'lang': value['lang'],
            'tweet': value['tweet'],
            'labels_task1': 1 if 'YES' in value['labels_task1'] else 0,
            'split': value['split']
        }
        records.append(record)
    return pd.DataFrame(records)

train_df = json_to_dataframe(train_data)
dev_df = json_to_dataframe(dev_data)

# Combine the training and development data
combined_df = pd.concat([train_df, dev_df])

# Split the data into train and validation sets
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the data
def encode_data(tokenizer, df, max_length=128):
    texts = df['tweet'].tolist()
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_data(tokenizer, train_df)
val_encodings = encode_data(tokenizer, val_df)

# Encode labels for task1
train_labels_task1 = train_df['labels_task1'].tolist()
val_labels_task1 = val_df['labels_task1'].tolist()

# Define the Dataset class
class HinglishDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset objects
train_dataset = HinglishDataset(train_encodings, train_labels_task1)
val_dataset = HinglishDataset(val_encodings, val_labels_task1)

# Load the metrics
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='binary')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='binary')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='binary')
    
    return {
        "accuracy": accuracy['accuracy'],
        "precision": precision['precision'],
        "recall": recall['recall'],
        "f1": f1['f1']
    }

# Initialize the model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results_task1',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_task1',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results for labels_task1: {val_results}")

# Separate block for testing accuracy
test_results = trainer.predict(val_dataset)
print(f"Testing Accuracy: {test_results.metrics['test_accuracy']}")
print(f"Testing Precision: {test_results.metrics['test_precision']}")
print(f"Testing Recall: {test_results.metrics['test_recall']}")
print(f"Testing F1 Score: {test_results.metrics['test_f1']}")


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationsh

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3549,0.475886,0.829146,0.846538,0.954143,0.897126
2,0.417,0.417979,0.820352,0.860045,0.919549,0.888802
3,0.2421,0.494836,0.802764,0.871303,0.876911,0.874098
4,0.1804,0.602335,0.805905,0.879058,0.871279,0.875152
5,0.0641,0.90266,0.805276,0.853142,0.906677,0.879095
6,0.0643,1.057344,0.809045,0.861989,0.899437,0.880315
7,0.0023,1.254853,0.810302,0.859435,0.905068,0.881661
8,0.0002,1.389149,0.809673,0.852853,0.913918,0.88233


Validation Results for labels_task1: {'eval_loss': 1.3891489505767822, 'eval_accuracy': 0.8096733668341709, 'eval_precision': 0.8528528528528528, 'eval_recall': 0.913917940466613, 'eval_f1': 0.8823300970873786, 'eval_runtime': 379.601, 'eval_samples_per_second': 4.194, 'eval_steps_per_second': 0.263, 'epoch': 8.0}
Testing Accuracy: 0.8096733668341709
Testing Precision: 0.8528528528528528
Testing Recall: 0.913917940466613
Testing F1 Score: 0.8823300970873786


In [None]:
# Define the directory where you want to save the model and tokenizer
output_dir = './trained_model_task1'

# Save the trained model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)