<a href="https://colab.research.google.com/github/Rakib911Hossan/hate_speech_detection_demo/blob/main/subtask_1A_xlm_roberta_qwen_label_Final_Bangla_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Locations of the three CSV splits (train / validation / test) on GitHub.
train_url, val_url, test_url = (
    f'https://raw.githubusercontent.com/Rakib911Hossan/hate_speech_task_A_dataset_null_filled_with_qwen/main/{split}_converted.csv'
    for split in ('train', 'val', 'test')
)

# Download each split into its own DataFrame.
train_df, val_df, test_df = (
    pd.read_csv(url) for url in (train_url, val_url, test_url)
)

# Report how many rows each split contains.
for split_title, split_frame in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_title}: {len(split_frame)} samples")


Train: 35522 samples
Validation: 2512 samples
Test: 2512 samples


In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install matplotlib seaborn scipy

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import pandas as pd
import datasets
import evaluate
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
import torch
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
import re

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

# Send log records to stdout so they appear in the cell output, using a
# "timestamp - level - logger name - message" layout.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# Logger used by the rest of the notebook.
logger = logging.getLogger(__name__)

In [None]:
# Quick look at the training split: schema, sample rows, missing values and
# class balance.

# Data types & non-null counts.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
print("Train dataset info:")
train_df.info()

# First few rows
print("\nFirst few rows:")
print(train_df.head())

# Check for missing values
print("\nMissing values in train dataset:")
print(train_df.isnull().sum())

# Check label distribution (only when the expected 'labels' column is present)
if 'labels' in train_df.columns:
    print("\nLabel distribution:")
    print(train_df['labels'].value_counts(normalize=True))

    # Absolute counts per label as a bar chart.
    fig, ax = plt.subplots()
    train_df['labels'].value_counts().plot(kind='bar', ax=ax)
    ax.set_title('Label Distribution')
    ax.set_xlabel('label')
    ax.set_ylabel('number of samples')
    plt.show()

In [None]:
import os

# Set the WANDB_DISABLED environment variable for this process before any
# training code runs.
os.environ.update(WANDB_DISABLED="true")

In [None]:
from transformers import TrainingArguments, EarlyStoppingCallback

# Training configuration for fine-tuning BanglaBERT (csebuetnlp/banglabert).

# Data parameters.
max_train_samples = None      # None = use every training row
max_eval_samples = None       # None = use every validation row
max_predict_samples = None    # None = use every test row
# NOTE: the tokenization cell recomputes max_seq_length as
# min(128, tokenizer.model_max_length) before it is used, so the value set here
# is not the one that reaches the tokenizer.
max_seq_length = 512
batch_size = 8                # per-device batch size for training and evaluation

training_args = TrainingArguments(
    output_dir="./banglabert_hate_speech/",
    overwrite_output_dir=True,

    # Learning schedule
    learning_rate=1e-5,                     # same value as the former 10e-6, written conventionally
    num_train_epochs=8,                     # upper bound; early stopping may end training sooner
    warmup_ratio=0.15,
    lr_scheduler_type="cosine",

    # Regularization
    weight_decay=0.01,
    max_grad_norm=0.5,                      # gradient clipping threshold
    # Keep the last partial batch. The Trainer applies this flag to the
    # evaluation and prediction dataloaders as well, so dropping it could
    # silently remove rows from the metrics and from the submission file.
    dataloader_drop_last=False,

    # Batch configuration
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,

    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_micro_f1",  # "micro_f1" from compute_metrics, with the eval_ prefix
    greater_is_better=True,

    # Monitoring
    logging_steps=250,
    eval_steps=None,
    save_total_limit=3,

    # System settings
    report_to="none",                       # the string "none" disables integrations; None does not
    dataloader_num_workers=2,
    fp16=torch.cuda.is_available(),         # mixed precision needs a CUDA device
    group_by_length=True,
)

# Stop when eval_micro_f1 has not improved by more than 0.001 for 2 consecutive
# evaluations (evaluation runs once per epoch).
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.001
)

In [None]:
# Raise transformers' verbosity to INFO first.
# NOTE(review): with the default "passive" log_level, get_process_log_level()
# presumably falls back to the current transformers verbosity, which is why
# this call precedes it — confirm against the installed transformers version.
transformers.utils.logging.set_verbosity_info()

# Apply one log level to the notebook logger and to both Hugging Face libraries.
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
for hf_logging in (datasets.utils.logging, transformers.utils.logging):
    hf_logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# One-line summary of the execution environment, then the full argument dump.
is_distributed = bool(training_args.local_rank != -1)
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    f" distributed training: {is_distributed}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model setup
# Only the checkpoint name is chosen here. The tokenizer and the classification
# model are instantiated later in the notebook, in the cell that builds
# `config`, `tokenizer` and `model` after `set_seed(...)` has run and after
# `num_labels` has been derived from the data. Loading them here as well
# downloaded and initialised a second, unseeded model (6 classes, mislabelled
# as "5 classes") that was discarded before use.
model_name = 'csebuetnlp/banglabert'

In [None]:
# Seed the random number generators (via transformers.set_seed) with the seed
# stored on training_args, so later steps such as model initialisation and the
# random sample preview are repeatable between runs.
set_seed(training_args.seed)

In [None]:
# Label name <-> integer id lookup tables; a label's id is its position in
# the list below.
l2id = {
    label_name: label_id
    for label_id, label_name in enumerate(
        ['None', 'Abusive', 'Sexism', 'Religious Hate', 'Political Hate', 'Profane']
    )
}
id2l = dict(zip(l2id.values(), l2id.keys()))

class KeywordFeatureExtractor:
    """Turn raw texts into keyword-count and simple length features.

    Each label has a hand-picked list of Bangla keywords (per the original
    author's note, chosen from a chi-squared analysis). For every text the
    extractor produces:

    * ``kw_<keyword>``: substring count for each of the first ``top_k`` keywords;
    * ``label_<label>_total`` / ``label_<label>_ratio``: summed substring counts
      of a label's keywords, and that sum divided by the number of words;
    * ``text_length``, ``word_count``, ``avg_word_length``.

    Args:
        top_k: how many keywords (in declaration order, duplicates removed) get
            an individual ``kw_`` column. ``None`` keeps all of them.
    """

    def __init__(self, top_k=50):
        self.label_keywords = {
            'Profane': ['বাল', 'মাগির', 'বালের', 'খানকির', 'বেশ্যা', 'দফা', 'বাচ্চা', 'সালা', 'শালা',
                       'মাদারচোদ', 'কুত্তার', 'জারজ', 'তোর', 'পোলা', 'শালার', 'পোলারা', 'শুয়োরের', 'মাদার', 'বাচ্চারা'],
            'Religious Hate': ['মুসলিম', 'হিন্দু', 'ইহুদি', 'মুসলমানদের', 'গজব', 'ধর্ম', 'হিন্দুদের', 'মুসলমান', 'ইসলাম',
                              'কাফের', 'মসজিদ', 'ধর্মীয়', 'ইহুদিরা', 'মোল্লারা', 'আল্লাহর', 'হিন্দুর', 'আল্লাহ', 'ইহুদী', 'মুসলিমরা', 'কাফেরদের'],
            'Political Hate': ['ভোট', 'বিএনপি', 'আওয়ামী', 'লীগ', 'সরকার', 'নির্বাচন', 'বিএনপির', 'লীগের', 'হাসিনা',
                              'সরকারের', 'অবৈধ', 'জনগণ', 'করে', 'পার্টি', 'দল', 'চোর', 'রাজনীতি'],
            'Sexism': ['নারী', 'পরকিয়া', 'মহিলাদের', 'পুরুষদের', 'হিজরা', 'মহিলা', 'মহিলাকে', 'বিয়ের',
                      'মেয়ের', 'মেয়ে', 'নারীরা', 'মেয়েদের', 'পুরুষ'],
            'Abusive': ['দালাল', 'টিভি', 'ফালতু', 'চোর', 'ধন্যবাদ', 'সময়', 'মিথ্যা', 'পাগল', 'জুতা',
                       'লজ্জা', 'নিউজ', 'আমিন', 'এদের', 'দালালি', 'সাংবাদিক', 'এরা']
        }
        self.top_k = top_k

        # De-duplicate while keeping first-seen order. A set has no stable order
        # for strings across interpreter runs, so slicing a set-derived list
        # would select a different keyword subset (and different feature
        # columns) every time the kernel restarts.
        self.all_keywords = list(dict.fromkeys(
            keyword
            for words in self.label_keywords.values()
            for keyword in words
        ))

    @staticmethod
    def _as_text(value):
        """Coerce one input item to str; None and float NaN become ''."""
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def extract_features(self, texts):
        """Build the feature table for an iterable of texts.

        Args:
            texts: iterable of strings. Non-string items are converted with
                ``str``; ``None`` and NaN are treated as empty text.

        Returns:
            pandas.DataFrame with one row per input text, in input order, and
            missing values filled with 0.
        """
        top_keywords = self.all_keywords[:self.top_k]
        features = []
        for raw_text in texts:
            text = self._as_text(raw_text)
            text_lower = text.lower()
            words = text.split()
            word_count = len(words)
            feature_dict = {}

            # Individual keyword counts (substring matches)
            for keyword in top_keywords:
                feature_dict[f'kw_{keyword}'] = text_lower.count(keyword)

            # Label-specific aggregated features
            for label, keywords in self.label_keywords.items():
                total_count = sum(text_lower.count(kw.lower()) for kw in keywords)
                feature_dict[f'label_{label}_total'] = total_count
                feature_dict[f'label_{label}_ratio'] = total_count / word_count if word_count > 0 else 0

            # Text statistics
            feature_dict['text_length'] = len(text)
            feature_dict['word_count'] = word_count
            feature_dict['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0

            features.append(feature_dict)

        return pd.DataFrame(features).fillna(0)

# Initialize keyword extractor
keyword_extractor = KeywordFeatureExtractor()

print("Converting to HuggingFace datasets...")


def _prepare_split_frame(frame):
    """Return a copy of `frame` shaped for conversion to a Hugging Face Dataset.

    Any existing 'label' column is dropped, 'labels' is renamed to 'label',
    and a sequential 'id' column is appended when the frame has none.
    """
    prepared = frame.copy()
    if "label" in prepared.columns:
        prepared = prepared.drop(columns=["label"])
    if "labels" in prepared.columns:
        prepared = prepared.rename(columns={"labels": "label"})
    if 'id' not in prepared.columns:
        prepared = prepared.assign(id=range(len(prepared)))
    return prepared


# Normalised copies of the three splits; the original DataFrames stay untouched.
train_df_renamed, val_df_renamed, test_df_renamed = (
    _prepare_split_frame(frame) for frame in (train_df, val_df, test_df)
)

# Convert to HuggingFace datasets
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame)
    for frame in (train_df_renamed, val_df_renamed, test_df_renamed)
)

# Bundle the splits under the keys the rest of the notebook expects.
raw_datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

print("Dataset sizes:")
for split_title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_title}: {len(raw_datasets[split_key])}")

for key in raw_datasets:
    logger.info(f"loaded dataset for {key}")

In [None]:
# Collect the distinct label values of the training split. They are printed in
# the order the dataset returns them, then sorted for everything downstream.
label_list = raw_datasets["train"].unique("label")
print(f"Unique labels: {label_list}")
label_list = sorted(label_list)

num_labels = len(label_list)
print(f"Number of labels: {num_labels}")

In [None]:
# Build the config, tokenizer and sequence-classification model for
# `model_name`, sizing the classification head from the number of labels found
# in the training data.
# The deprecated `use_auth_token` argument and the arguments that were merely
# passed as None (their defaults) are no longer forwarded.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    revision="main",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    revision="main",
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_tf=bool(".ckpt" in model_name),  # only True when model_name contains ".ckpt"
    config=config,
    revision="main",
    ignore_mismatched_sizes=False,
)

In [None]:
non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
print(f"Non-label columns: {non_label_column_names}")

# Find the text column: prefer 'text', then 'sentence', otherwise fall back to
# the second non-label column.
if 'text' in non_label_column_names:
    sentence1_key = 'text'
elif 'sentence' in non_label_column_names:
    sentence1_key = 'sentence'
else:
    sentence1_key = non_label_column_names[1]
print(f"Using text column: {sentence1_key}")

# Padding strategy: every example is padded to max_seq_length at tokenization time.
padding = "max_length"

# Label mapping: only built when the loaded model config carries label names
# that differ from the generic defaults for this number of labels.
label_to_id = None
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
    label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    if sorted(label_name_to_id.keys()) == sorted(label_list):
        label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        # The message is ONE string built from adjacent literals. Passing the
        # second literal as a separate argument makes logging treat it as a
        # %-format argument and emit a formatting error instead of the warning.
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
            "\nIgnoring the model labels as a result."
        )

if label_to_id is not None:
    model.config.label2id = label_to_id
    # `label_id` rather than `id`, to avoid shadowing the builtin.
    model.config.id2label = {label_id: label for label, label_id in config.label2id.items()}

# Set max sequence length (capped at 128 tokens, or lower if the tokenizer's limit is lower)
max_seq_length = min(128, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of examples and, when a mapping exists, remap its labels.

    Args:
        examples: batch dict provided by `Dataset.map(batched=True)`; must
            contain the `sentence1_key` column and may contain "label".

    Returns:
        The tokenizer output for the batch, padded/truncated to
        `max_seq_length`, plus a remapped "label" list when `label_to_id` is
        set (a label of -1 is passed through unchanged).
    """
    # Tokenize the texts
    result = tokenizer(
        examples[sentence1_key],
        padding=padding,
        max_length=max_seq_length,
        truncation=True,
    )

    # Map labels to IDs
    if label_to_id is not None and "label" in examples:
        result["label"] = [(label_to_id[lbl] if lbl != -1 else -1) for lbl in examples["label"]]
    return result

raw_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

print("Tokenization completed!")

In [None]:
# Select the tokenized training split; a missing split is a hard error.
try:
    train_dataset = raw_datasets["train"]
except KeyError:
    raise ValueError("requires a train dataset") from None

# Optionally keep only the first `max_train_samples` rows.
if max_train_samples is not None:
    max_train_samples_n = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples_n))

print(f"Final train dataset: {len(train_dataset)} samples")

In [None]:
# Select the tokenized validation split; a missing split is a hard error.
try:
    eval_dataset = raw_datasets["validation"]
except KeyError:
    raise ValueError("requires a validation dataset") from None

# Optionally keep only the first `max_eval_samples` rows.
if max_eval_samples is not None:
    max_eval_samples_n = min(len(eval_dataset), max_eval_samples)
    eval_dataset = eval_dataset.select(range(max_eval_samples_n))

print(f"Final eval dataset: {len(eval_dataset)} samples")

In [None]:
# Select the tokenized test split; a missing split is a hard error.
try:
    predict_dataset = raw_datasets["test"]
except KeyError:
    raise ValueError("requires a test dataset") from None

# Optionally keep only the first `max_predict_samples` rows.
if max_predict_samples is not None:
    max_predict_samples_n = min(len(predict_dataset), max_predict_samples)
    predict_dataset = predict_dataset.select(range(max_predict_samples_n))

print(f"Final predict dataset: {len(predict_dataset)} samples")

In [None]:
# Log three randomly chosen training examples as a sanity check of the
# tokenized data.
sampled_indices = random.sample(range(len(train_dataset)), 3)
for index in sampled_indices:
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

In [None]:
import evaluate
# Metric objects from the `evaluate` library. Only the F1 metric is used by
# compute_metrics below; accuracy is computed directly with numpy.
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(p: EvalPrediction):
    """Compute accuracy, micro-F1 and macro-F1 for one evaluation pass.

    Args:
        p: prediction bundle; `p.predictions` holds the model outputs (the
            first element is used when it is a tuple) and `p.label_ids` the
            reference class ids.

    Returns:
        dict with the keys "accuracy", "micro_f1" and "macro_f1".
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_ids = np.argmax(logits, axis=1)
    reference_ids = p.label_ids

    # Fraction of exact matches.
    accuracy = (predicted_ids == reference_ids).astype(np.float32).mean().item()

    # Micro F1 is the official metric; macro F1 is reported for extra insight.
    f1_by_average = {
        average: metric_f1.compute(
            predictions=predicted_ids, references=reference_ids, average=average
        )['f1']
        for average in ('micro', 'macro')
    }

    return {
        "accuracy": accuracy,
        "micro_f1": f1_by_average['micro'],
        "macro_f1": f1_by_average['macro'],
    }

In [None]:
# Use transformers' default_data_collator to batch examples. The tokenization
# cell pads with padding="max_length", so examples already share one length.
data_collator = default_data_collator

In [None]:
# Remove ID columns for training.
# Guarded so the cell can be re-run: after the first run the column is already
# gone and an unconditional remove_columns("id") would raise.
if "id" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns("id")
if "id" in eval_dataset.column_names:
    eval_dataset = eval_dataset.remove_columns("id")

# Wire the model, data, metrics and early stopping into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

print("Trainer initialized successfully!")

In [None]:
# Run fine-tuning and keep the metrics reported by the Trainer.
print("Starting training...")
train_result = trainer.train()
metrics = train_result.metrics

# Record how many training samples were used; with no cap configured this is
# the size of the training set.
if max_train_samples is None:
    max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

print("Training completed!")
print(f"Training metrics: {metrics}")



In [None]:
# Predict on the test split and write the submission files (run after training).
import zipfile

print("Making final predictions...")
prediction_output = trainer.predict(predict_dataset, metric_key_prefix="predict")
# Use the first element when the outputs arrive as a tuple (same convention as
# compute_metrics).
logits = prediction_output.predictions
if isinstance(logits, tuple):
    logits = logits[0]
predictions = np.argmax(logits, axis=1)

# Take the ids from the dataset that was actually predicted on, so ids and
# predictions stay aligned even when the test split was sub-sampled.
test_ids = list(predict_dataset["id"])
if len(predictions) != len(test_ids):
    # e.g. a dataloader that drops the last partial batch
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(test_ids)} test rows; "
        "refusing to write an incomplete submission file."
    )

os.makedirs(training_args.output_dir, exist_ok=True)
output_predict_file = os.path.join(training_args.output_dir, "subtask_1A.tsv")

# Name written to the "model" column: the last path component of model_name
# ("banglabert" for 'csebuetnlp/banglabert'). It was hard-coded to "hatebert",
# which is not the model used in this notebook.
model_tag = model_name.split("/")[-1]

with open(output_predict_file, "w", encoding='utf-8') as writer:
    writer.write("id\tlabel\tmodel\n")
    for test_id, pred_idx in zip(test_ids, predictions):
        pred_label = id2l[int(pred_idx)]
        writer.write(f"{test_id}\t{pred_label}\t{model_tag}\n")

# Create ZIP containing the TSV under its bare file name
zip_file_path = "subtask_1A_banglabert.zip"
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
    zipf.write(output_predict_file, "subtask_1A.tsv")

print(f"BanglaBert predictions saved: {output_predict_file}")

In [None]:
# Persist the training artefacts through the Trainer: the model, the metrics
# and the trainer state.
# NOTE: `metrics` is whatever that name is bound to when this cell runs. In
# top-to-bottom order that is the training metrics from the training cell; if
# the evaluation cell has been run first it holds evaluation metrics instead.
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print("Model saved successfully!")

In [None]:
# Evaluate on the validation split and record the results.
logger.info("*** Evaluate ***")

metrics = trainer.evaluate(eval_dataset=eval_dataset)

# Record how many validation samples were used; with no cap configured this is
# the size of the validation set.
if max_eval_samples is None:
    max_eval_samples = len(eval_dataset)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

# Log the metrics, then save them.
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)

print(f"Evaluation metrics: {metrics}")

In [None]:
# # Predictions
# logger.info("*** Predict ***")
# try:
#     # Get IDs before removing columns
#     ids = predict_dataset['id']
#     predict_dataset_clean = predict_dataset.remove_columns("id")

#     # Make predictions
#     predictions = trainer.predict(predict_dataset_clean, metric_key_prefix="predict").predictions
#     predictions = np.argmax(predictions, axis=1)

#     # Save predictions in required TSV format
#     output_predict_file = os.path.join(training_args.output_dir, f"subtask_1A.tsv")

#     if trainer.is_world_process_zero():
#         with open(output_predict_file, "w", encoding='utf-8') as writer:
#             logger.info(f"***** Predict results *****")
#             writer.write("id\tlabel\tmodel\n")
#             for index, pred_idx in enumerate(predictions):
#                 pred_label = id2l[pred_idx]
#                 writer.write(f"{ids[index]}\t{pred_label}\t{model_name}\n")

#         # Create submission ZIP
#         zip_file_path = "subtask_1A.zip"
#         with zipfile.ZipFile(zip_file_path, 'w') as zipf:
#             zipf.write(output_predict_file, "subtask_1A.tsv")
#         print(f"Submission file created: {zip_file_path}")

#         # Preview predictions
#         print(f"\nFirst 10 predictions:")
#         print("ID | Predicted Label")
#         print("-" * 40)
#         for i in range(min(10, len(predictions))):
#             pred_label = id2l[predictions[i]]
#             print(f"{ids[i]} | {pred_label}")

#         # ✅ Calculate F1 scores if true labels are available
#         from sklearn.metrics import f1_score

#         try:
#             true_labels = None

#             # Check if predict_dataset has labels (before we removed columns)
#             if 'label' in predict_dataset.column_names:
#                 true_labels = predict_dataset['label']
#                 print(f"\n✅ Found true labels in predict_dataset")
#             # Check eval_dataset if available
#             elif 'eval_dataset' in locals() and len(eval_dataset) == len(predictions):
#                 true_labels = eval_dataset['label']
#                 print(f"\n✅ Found true labels in eval_dataset")

#             if true_labels is not None:
#                 # Calculate F1 scores
#                 micro_f1 = f1_score(true_labels, predictions, average='micro')
#                 macro_f1 = f1_score(true_labels, predictions, average='macro')
#                 weighted_f1 = f1_score(true_labels, predictions, average='weighted')

#                 print(f"\n=== F1 SCORES ===")
#                 print(f"🎯 Micro F1 (Official): {micro_f1:.4f}")
#                 print(f"📊 Macro F1:            {macro_f1:.4f}")
#                 print(f"⚖️  Weighted F1:         {weighted_f1:.4f}")

#                 # Per-class F1
#                 per_class_f1 = f1_score(true_labels, predictions, average=None)
#                 print(f"\n📋 Per-class F1:")
#                 for i, f1 in enumerate(per_class_f1):
#                     print(f"  {id2l[i]:12s}: {f1:.4f}")

#             else:
#                 print(f"\n⚠️  No true labels found - cannot calculate F1 scores")

#         except Exception as e:
#             print(f"\n❌ Error calculating F1: {e}")

# except Exception as e:
#     logger.error(f"Error during prediction: {e}")
#     raise

# # Download evaluation tools (do this once)
# import os
# if not os.path.exists("requirements.txt"):
#     !wget https://raw.githubusercontent.com/AridHasan/blp25_task1/main/requirements.txt -O requirements.txt
#     !pip install -r requirements.txt

# if not os.path.exists("scorer"):
#     os.makedirs("scorer", exist_ok=True)
#     !wget https://raw.githubusercontent.com/AridHasan/blp25_task1/main/scorer/task.py -O scorer/task.py

# if not os.path.exists("format_checker"):
#     os.makedirs("format_checker", exist_ok=True)
#     !wget https://raw.githubusercontent.com/AridHasan/blp25_task1/main/format_checker/task.py -O format_checker/task.py

# # Format verification
# print(f"\n=== FORMAT VERIFICATION ===")
# print(f"Output file: {output_predict_file}")
# print("Expected format: id\\tlabel\\tmodel")
# print("Running format checker...")
# !python format_checker/task.py -p {output_predict_file}

# # Note: Replace 'path_to_gold_file.tsv' with actual gold standard file
# print("\nTo evaluate predictions against gold standard, run:")
# print(f"python scorer/task.py -p {output_predict_file} -g path_to_gold_file.tsv")
# print("Official evaluation metric: micro-F1")

In [None]:
# import os
# import zipfile
# import logging
# import numpy as np

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)

# output_dir = "./xlm_roberta_recovery"
# os.makedirs(output_dir, exist_ok=True)
# model_name = "xlm-roberta"
# id2l = {0: 'None', 1: 'Abusive', 2: 'Sexism', 3: 'Religious Hate', 4: 'Political Hate', 5: 'Profane'}

# # ======= PREDICTION =======
# logger.info("*** Predict ***")

# ids = predict_dataset['id']
# predict_dataset_clean = predict_dataset.remove_columns("id")

# predictions = trainer.predict(predict_dataset_clean, metric_key_prefix="predict").predictions
# predictions = np.argmax(predictions, axis=1)

# # Save predictions TSV
# output_predict_file = os.path.join(output_dir, "subtask_1A.tsv")
# with open(output_predict_file, "w", encoding="utf-8") as f:
#     f.write("id\tlabel\tmodel\n")
#     for idx, pred_idx in enumerate(predictions):
#         pred_label = id2l[pred_idx]
#         f.write(f"{ids[idx]}\t{pred_label}\t{model_name}\n")

# logger.info(f"Predictions saved: {output_predict_file}")

# # ======= ZIP PREDICTIONS =======
# zip_file_path = os.path.join(output_dir, "subtask_1A.zip")
# with zipfile.ZipFile(zip_file_path, 'w') as zipf:
#     zipf.write(output_predict_file, "subtask_1A.tsv")

# logger.info(f"Submission file created and zipped: {zip_file_path}")

# # ======= PREVIEW =======
# print("\nFirst 10 predictions:")
# import pandas as pd
# print(pd.read_csv(output_predict_file, sep="\t").head(10))


In [None]:
# # ✅ Show only ID + predicted label for preview
# print(f"\nFirst 10 predictions (Required Format):")
# print("Format: ID | Predicted Label")
# print("-" * 40)
# for i in range(min(10, len(predictions))):
#     pred_label = id2l[predictions[i]]
#     print(f"{ids[i]} | {pred_label}")


# print(f"\n=== FORMAT VERIFICATION ===")
# print(f"Output file: {output_predict_file}")
# print("Expected format: id\\tlabel\\tmodel")
# print("To verify format compliance, run:")
# print(f"python format_checker/task.py -p {output_predict_file}")
# print("\nTo evaluate predictions, run:")
# print(f"python scorer/task.py --gold-file-path=<gold_file> --pred-file-path={output_predict_file}")
# print(f"Official evaluation metric: micro-F1")


In [None]:
# !python format_checker/task.py -p /content/xlm_roberta_recovery/subtask_1A.tsv
# !python scorer/task.py -p /content/xlm_roberta_recovery/subtask_1A.tsv -g /content/dev.tsv


