In [1]:
import os

import pandas as pd
import torch
from torch.nn.functional import softmax
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

In [5]:
def predict_class(test_file_path, language, threshold=0.5, batch_size=32):
    """Classify every tweet in a CSV file and write the predictions to disk.

    Loads the fine-tuned XLM-RoBERTa classifier stored under
    ``../fine_tuned_models/fine_tuned_xlm_roberta_model_<language>``, runs it
    over the ``tweet_text`` column of the CSV at ``test_file_path`` in
    mini-batches, and writes the ``tweet_id`` / ``predicted_label`` columns to
    ``../output_data/transformer/output_<language>.csv``.

    Args:
        test_file_path: Path to a CSV file containing at least the columns
            ``tweet_id`` and ``tweet_text``.
        language: Suffix selecting both the fine-tuned model directory and the
            output file name (e.g. ``"english"``).
        threshold: A row is labelled ``'yes'`` when the softmax probability of
            class index 1 is strictly greater than this value, else ``'no'``.
        batch_size: Number of tweets tokenized and sent through the model at
            once; bounds peak memory regardless of the size of the file.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Load XLM-RoBERTa tokenizer and model.
    # NOTE(review): the tokenizer comes from the base checkpoint rather than
    # from model_path; presumably fine-tuning left the vocabulary untouched -
    # confirm against the training notebook.
    model_path = "../fine_tuned_models/fine_tuned_xlm_roberta_model_" + language
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    model.eval()  # make inference mode explicit (no dropout)

    # Load test data
    test_data = pd.read_csv(test_file_path)

    # Empty cells are read as NaN (a float), which the tokenizer cannot
    # encode; treat them as empty strings instead.
    texts = test_data['tweet_text'].fillna('').astype(str).tolist()

    # Run the model batch by batch so that memory use does not grow with the
    # number of rows in the file.
    predicted_labels = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
            outputs = model(**inputs)

            # Probability of class index 1 for every row of the batch
            positive_probs = softmax(outputs.logits, dim=1)[:, 1].tolist()
            predicted_labels.extend(
                'yes' if prob > threshold else 'no' for prob in positive_probs
            )

    # Add predicted labels to the test data
    test_data['predicted_label'] = predicted_labels

    # Save results to CSV, creating the output directory on first use
    output_path = "../output_data/transformer/output_" + language + ".csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    test_data[['tweet_id', 'predicted_label']].to_csv(output_path, index=False)


In [6]:

# Preprocessed dev-test split for each language covered by a fine-tuned model.
eng_test_file_path = "../data/processed_data/CT24_checkworthy_english/CT24_checkworthy_english_dev-test_preprocessed.csv"
arabic_test_file_path = "../data/processed_data/CT24_checkworthy_arabic/CT24_checkworthy_arabic_dev-test_preprocessed.csv"
dutch_test_file_path = "../data/processed_data/CT24_checkworthy_dutch/CT24_checkworthy_dutch_dev-test_preprocessed.csv"

# Generate one prediction file per language, in the same order as before.
for test_file_path, language in (
    (eng_test_file_path, "english"),
    (arabic_test_file_path, "arabic"),
    (dutch_test_file_path, "dutch"),
):
    predict_class(test_file_path, language)