In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

In [2]:
# Define custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input strings (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding='max_length',
            return_tensors='pt')`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise as plain lists so that ``[idx]`` is always *positional*.
        # A pandas Series indexes by label, which raises KeyError (or silently
        # returns the wrong row) once the frame has been filtered or shuffled.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch axis removed) and ``'labels'`` (a
            scalar ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse the sequence axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [3]:
def generate_custom_tuned_model(train_path, test_path, language):
    """Fine-tune ``xlm-roberta-base`` as a binary classifier and save it.

    Reads a training and an evaluation CSV (each must contain ``tweet_text``
    and ``class_label`` columns), fine-tunes the model for three epochs,
    prints the evaluation loss/accuracy after every epoch and writes the
    resulting model to
    ``../fine_tuned_models/fine_tuned_xlm_roberta_model_<language>``.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the CSV used for per-epoch evaluation.
        language: Suffix appended to the output directory name.

    Raises:
        ValueError: If the evaluation file contains a class label that does
            not occur in the training file (raised by ``LabelEncoder.transform``).
    """
    num_epochs = 3

    # Read train & test files
    df_train = pd.read_csv(train_path, encoding='utf-8')
    df_test = pd.read_csv(test_path, encoding='utf-8')

    # Replace NaN with an empty string and ensure all text is treated as string
    df_train['tweet_text'] = df_train['tweet_text'].fillna('').astype(str)
    df_test['tweet_text'] = df_test['tweet_text'].fillna('').astype(str)

    # Load pre-trained model and tokenizer
    model_name = 'xlm-roberta-base'
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)  # binary classification

    # Optional: inspect the token-length distribution before training.
    #tokenize_visualize(df_train, tokenizer)
    #tokenize_visualize(df_test, tokenizer)

    train_texts = df_train["tweet_text"]
    train_labels = df_train["class_label"]
    test_texts = df_test["tweet_text"]
    test_labels = df_test["class_label"]

    # Fit the encoder on the training labels only and reuse it for the
    # evaluation labels, so a given class always maps to the same integer id
    # in both splits.
    le = LabelEncoder()
    train_labels_num = le.fit_transform(train_labels)
    test_labels_num = le.transform(test_labels)

    train_dataset = CustomDataset(train_texts, train_labels_num, tokenizer, max_length=512)
    val_dataset = CustomDataset(test_texts, test_labels_num, tokenizer, max_length=512)

    # Define DataLoader
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # check if we have cuda installed
    if torch.cuda.is_available():
        # to use GPU
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('GPU is:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # The model must live on the same device as the batches fed to it;
    # move it before the optimizer is built.
    model.to(device)

    # Optimizer & Learning Rate Scheduler
    optimizer = AdamW(model.parameters(),
                    lr = 2e-5,
                    eps = 1e-8
                    )

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)  # already torch.long from the dataset

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # StepLR counts *epochs*: stepping it once per batch would multiply
        # the learning rate by 0.1 on every batch and stall training almost
        # immediately.
        scheduler.step()

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                predicted = outputs.logits.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard the averages so an empty evaluation file reports NaN instead
        # of raising ZeroDivisionError after a full training epoch.
        num_val_batches = len(val_loader)
        val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
        accuracy = correct / total if total else float('nan')

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Save the fine-tuned model
    model.save_pretrained("../fine_tuned_models/fine_tuned_xlm_roberta_model_"+language)

In [4]:
def tokenize_visualize(df, tokenizer):
    """Print the min/max tokenized length of ``df["tweet_text"]`` and plot a histogram.

    Every entry of the ``tweet_text`` column is encoded with
    ``tokenizer.batch_encode_plus`` (special tokens included); the number of
    input ids per sentence is then summarised on stdout and drawn as a
    histogram on a new 20x8 matplotlib figure.
    """
    sentences = df["tweet_text"].values.tolist()
    encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=True)

    # Number of tokens produced for each sentence
    lengths = list(map(len, encoded['input_ids']))
    print('max: ', max(lengths))
    print('min: ', min(lengths))

    # Histogram of the sequence lengths
    plt.figure(figsize=(20, 8))
    plt.hist(lengths, rwidth=0.9)
    for set_label, text in ((plt.xlabel, 'Sequence Length'), (plt.ylabel, '# of Samples')):
        set_label(text, fontsize=18)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=14)

In [None]:
# Fine-tune and save one model per language. Each language has its own
# preprocessed train/dev CSV pair following the same file-name pattern.
for language in ("spanish", "arabic", "dutch", "english"):
    generate_custom_tuned_model(
        f"CT24_checkworthy_{language}_train_preprocessed.csv",
        f"CT24_checkworthy_{language}_dev_preprocessed.csv",
        language,
    )


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No GPU available, using the CPU instead.
