In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [5]:
pip install --upgrade fsspec

Collecting fsspec
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.9.0
    Uninstalling fsspec-2024.9.0:
      Successfully uninstalled fsspec-2024.9.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.2.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.12.0 which is incompatible.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed

In [27]:
import re
import torch
import argparse
from datasets import Dataset
from collections import Counter
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from sklearn.metrics import precision_score, recall_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_data(file_path):
  sentence = []
  sentences_list = []
  label = []
  labels_list = []
  with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
      if re.match(r'^\d', line):  # if a sentence starts with number (\d matches any number)
        parts = line.split() # split(): splits the string into a list, separated by a space
        second_column = parts[1].strip() # Get the second part (the word) and remove the leading and trailing Spaces
        third_column = parts[2].strip()
        sentence.append(second_column) # Add the word to the word list of the current sentence
        label.append(third_column)
      elif line.strip() == "":  # one sentence end
        if sentence and label:
          sentences_list.append(sentence) # Adds the word list of the current sentence to the list of the sentence
          labels_list.append(label)
        sentence = []
        label = []
  return sentences_list, labels_list

# Convert sentences and their labels into a format that the model can understand
def preprocess_data(sentences, labels, tokenizer, id2label):
  input_ids = []
  attention_masks = []
  labels_ids = []
  for sentence, label in zip(sentences, labels):
    # 'encoded_dict' contains 'input_ids' and 'attention_mask'
    # 'is_split_into_words=True indicates' that the sentence has been split into words
    encoded_dict = tokenizer(sentence, is_split_into_words=True, return_tensors='pt', padding=True, truncation=True)
    word_ids = encoded_dict.word_ids()
    label_ids = []
    # Convert text labels(eg.B-PER) to numeric labels (eg.ids:1) that the model can understand
    previous_word_id = None
    for word_id in word_ids:
      if word_id is None:  # Special tokens
        label_ids.append(-100)
      elif word_id != previous_word_id:
        label_name = label[word_id]
        if label_name in id2label.values():
            label_ids.append(list(id2label.keys())[list(id2label.values()).index(label_name)])
        else:
            print(f"Warning: Label '{label_name}' not found in id2label.values(). Skipping.")
            label_ids.append(-100)
      else:
        label_ids.append(-100)
      previous_word_id = word_id
    input_ids.append(encoded_dict["input_ids"].squeeze(0))
    attention_masks.append(encoded_dict["attention_mask"].squeeze(0))
    labels_ids.append(label_ids)
  for idx, (input_id, label) in enumerate(zip(input_ids, labels_ids)):
    if len(input_id) != len(label):
      print(f"Length mismatch at index {idx}: input_id={len(input_id)}, label={len(label)}")

  max_len = max(len(x) for x in input_ids)
  input_ids = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in input_ids]
  attention_masks = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in attention_masks]
  labels_ids = [x + [-100] * (max_len - len(x)) for x in labels_ids]
  return input_ids, attention_masks, labels_ids

def fine_tune_model(model, tokenizer, train_sentences, train_labels, id2label):
  # Prepare the dataset
  input_ids, attention_masks, labels = preprocess_data(train_sentences, train_labels, tokenizer, id2label)
  assert len(input_ids) == len(labels), "Mismatch between input_ids and labels lengths"
  dataset = Dataset.from_dict({
    "input_ids": [x.tolist() for x in input_ids],
    "attention_mask": [x.tolist() for x in attention_masks],
    "labels": labels,
  })
  data_collator = DataCollatorForTokenClassification(tokenizer)
  # Define training arguments
  training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="no",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
  )
  # Create the Trainer
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
  )
  # Train the model
  trainer.train()
  return model

# Prediction test is performed on the given pre-trained model
def evaluate_model(model, tokenizer, sentences_list, labels_list, id2label):
  model = model.to(device)
  input_ids, attention_masks, labels_ids = preprocess_data(sentences_list, labels_list, tokenizer, id2label)
  model.eval()
  all_preds = []
  all_labels = []
  for input_ids, attention_mask, label in zip(input_ids, attention_masks, labels_ids):
    input_dict = {
      'input_ids': input_ids.unsqueeze(0).to(device),
      'attention_mask': attention_mask.unsqueeze(0).to(device),
    }
    with torch.no_grad():
      outputs = model(**input_dict)
    logits = outputs.logits
    pred = torch.argmax(logits, dim=2)
    # Convert prediction and label from ids to string labels
    valid_preds = [id2label[p] for p, l in zip(pred[0].cpu().numpy(), label) if l != -100]
    valid_labels = [id2label[l] for l in label if l != -100]
    all_preds.extend(valid_preds)
    all_labels.extend(valid_labels)
  precision = precision_score(all_labels, all_preds, average='macro')
  recall = recall_score(all_labels, all_preds, average='macro')
  f1 = f1_score(all_labels, all_preds, average='macro')
  return precision, recall, f1, all_preds, all_labels

def extract_crf_results(language, crf_report_path):
  with open(crf_report_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
  start_idx = next((i for i, line in enumerate(lines) if f"Classification Report for {language.capitalize()}:" in line), None)
  if start_idx is None:
    raise ValueError(f"Language '{language.capitalize()}' not found in {crf_report_path}")
  report_lines = []
  for line in lines[start_idx + 1:]:
    line = line.strip()
    if line == "":
      continue
    if "Classification Report for " in line and line != f"Classification Report for {language.capitalize()}:":
      break
    report_lines.append(line)
  for line in report_lines:
    if "macro avg" in line:
      parts = line.split()
      print(f"Parts: {parts}")
      if len(parts) >= 5:
        return float(parts[2]), float(parts[3]), float(parts[4])
  raise ValueError(f"Results not found for language: {language.capitalize()} in {crf_report_path}")

def report_comparison(language, all_preds, all_labels, crf_report_path, output_path="compare_eval.txt"):
  crf_precision, crf_recall, crf_f1 = extract_crf_results(language, crf_report_path)
  ignore_labels = {"O", "B-OTH", "I-OTH"}
  filtered_preds = [pred for pred, label in zip(all_preds, all_labels) if label not in ignore_labels]
  filtered_labels = [label for label in all_labels if label not in ignore_labels]
  precision = precision_score(filtered_labels, filtered_preds, average="macro")
  recall = recall_score(filtered_labels, filtered_preds, average="macro")
  f1 = f1_score(filtered_labels, filtered_preds, average="macro")
  fine_tuned_results = f"""
  Classification Report for {language.capitalize()} (Fine-tuned Model):
  Precision: {precision:.4f}
  Recall: {recall:.4f}
  F1-Score: {f1:.4f}

  Comparison with CRF Results:
  Macro Precision Difference: {precision - crf_precision:.4f}
  Macro Recall Difference: {recall - crf_recall:.4f}
  Macro F1-Score Difference: {f1 - crf_f1:.4f}
  """
  with open(output_path, "a", encoding="utf-8") as f:
    f.write(fine_tuned_results)
  print(f"Comparison for {language} saved to {output_path}")

def main():
  languages = {
    'Portuguese': {'train_file': 'pt_bosque-ud-train.iob2', 'test_file': 'pt_bosque-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Chinese': {'train_file': 'zh_gsdsimp-ud-train.iob2', 'test_file': 'zh_gsdsimp-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Swedish': {'train_file': 'sv_talbanken-ud-train.iob2', 'test_file': 'sv_talbanken-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Serbian': {'train_file': 'sr_set-ud-train.iob2', 'test_file': 'sr_set-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Slovak': {'train_file': 'sk_snk-ud-train.iob2', 'test_file': 'sk_snk-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Croatian': {'train_file': 'hr_set-ud-train.iob2', 'test_file': 'hr_set-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'English': {'train_file': 'en_ewt-ud-train.iob2', 'test_file': 'en_ewt-ud-test.iob2', 'crf_report': 'crf_reports.txt'},
    'Danish': {'train_file': 'da_ddt-ud-train.iob2', 'test_file': 'da_ddt-ud-test.iob2', 'crf_report': 'crf_reports.txt'}
    }

  model_name = "dslim/distilbert-NER"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForTokenClassification.from_pretrained(model_name)
  # model provide id2label mapping
  id2label = model.config.id2label

  for language, files in languages.items():
    print(f"Processing {language}...")
    train_sentences, train_labels = load_data(files['train_file'])
    test_sentences, test_labels = load_data(files['test_file'])
    print(f"Starting fine-tuning for {language}...")
    fine_tuned_model = fine_tune_model(model, tokenizer, train_sentences, train_labels, id2label)
    transfer_precision, transfer_recall, transfer_f1, all_preds, all_labels = evaluate_model(fine_tuned_model, tokenizer, test_sentences, test_labels, id2label)
    report_comparison(language, all_preds, all_labels, crf_report_path = "crf_reports.txt")
    # Print label distributions
    print(f"Label distribution for {language} in true labels:", Counter(all_labels))
    print(f"Label distribution for {language} in predicted labels:", Counter(all_preds))
    print(f"Precision for {language}: {transfer_precision}")
    print(f"Recall for {language}: {transfer_recall}")
    print(f"F1 Score for {language}: {transfer_f1}")

if __name__ == "__main__":
  main()

Processing Portuguese...
Starting fine-tuning for Portuguese...


  trainer = Trainer(


Step,Training Loss
500,0.098
1000,0.0549


Parts: ['macro', 'avg', '0.82', '0.78', '0.80', '27604']
Comparison for Portuguese saved to compare_eval.txt
Label distribution for Portuguese in true labels: Counter({'O': 25458, 'B-ORG': 456, 'B-PER': 442, 'I-ORG': 430, 'I-PER': 320, 'B-LOC': 317, 'I-LOC': 181})
Label distribution for Portuguese in predicted labels: Counter({'O': 25516, 'B-ORG': 452, 'B-PER': 429, 'I-ORG': 390, 'I-PER': 331, 'B-LOC': 328, 'I-LOC': 158})
Precision for Portuguese: 0.855247657794241
Recall for Portuguese: 0.8341772985885214
F1 Score for Portuguese: 0.8438691457177315
Processing Chinese...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting fine-tuning for Chinese...


  trainer = Trainer(


Step,Training Loss
500,0.4114


Parts: ['macro', 'avg', '0.80', '0.64', '0.71', '12012']
Comparison for Chinese saved to compare_eval.txt
Label distribution for Chinese in true labels: Counter({'O': 10605, 'B-LOC': 429, 'I-LOC': 337, 'B-PER': 205, 'I-ORG': 183, 'B-ORG': 129, 'I-PER': 124})
Label distribution for Chinese in predicted labels: Counter({'O': 11272, 'B-LOC': 240, 'I-LOC': 212, 'B-PER': 98, 'I-ORG': 75, 'I-PER': 67, 'B-ORG': 48})
Precision for Chinese: 0.5642965540414673
Recall for Chinese: 0.3666242576061944
F1 Score for Chinese: 0.4316635668310077
Processing Swedish...
Starting fine-tuning for Swedish...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  trainer = Trainer(


Step,Training Loss
500,0.0237


Parts: ['macro', 'avg', '0.66', '0.55', '0.59', '20377']
Comparison for Swedish saved to compare_eval.txt
Label distribution for Swedish in true labels: Counter({'O': 20090, 'B-LOC': 132, 'B-PER': 33, 'B-ORG': 31, 'I-PER': 25, 'I-ORG': 15, 'I-LOC': 5})
Label distribution for Swedish in predicted labels: Counter({'O': 20096, 'B-LOC': 120, 'B-PER': 38, 'B-ORG': 35, 'I-PER': 25, 'I-ORG': 14, 'I-LOC': 3})
Precision for Swedish: 0.8053868677038364
Recall for Swedish: 0.7765075453991461
F1 Score for Swedish: 0.7848647345385296
Processing Serbian...
Starting fine-tuning for Serbian...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  trainer = Trainer(


Step,Training Loss
500,0.0662


Parts: ['macro', 'avg', '0.88', '0.75', '0.80', '11421']
Comparison for Serbian saved to compare_eval.txt
Label distribution for Serbian in true labels: Counter({'O': 10066, 'B-LOC': 387, 'B-ORG': 251, 'B-PER': 209, 'I-ORG': 177, 'I-PER': 144, 'I-LOC': 55})
Label distribution for Serbian in predicted labels: Counter({'O': 10095, 'B-LOC': 405, 'B-ORG': 230, 'B-PER': 207, 'I-ORG': 150, 'I-PER': 147, 'I-LOC': 55})
Precision for Serbian: 0.864842356911155
Recall for Serbian: 0.8462909711562097
F1 Score for Serbian: 0.8545888114870639
Processing Slovak...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting fine-tuning for Slovak...


  trainer = Trainer(


Step,Training Loss
500,0.0469
1000,0.0208
1500,0.0117


Parts: ['macro', 'avg', '0.60', '0.42', '0.46', '12736']
Comparison for Slovak saved to compare_eval.txt
Label distribution for Slovak in true labels: Counter({'O': 11349, 'B-PER': 539, 'B-LOC': 326, 'I-PER': 316, 'I-ORG': 80, 'I-LOC': 76, 'B-ORG': 50})
Label distribution for Slovak in predicted labels: Counter({'O': 11461, 'B-PER': 555, 'I-PER': 301, 'B-LOC': 263, 'B-ORG': 61, 'I-LOC': 49, 'I-ORG': 46})
Precision for Slovak: 0.6259362007918357
Recall for Slovak: 0.5722610135346958
F1 Score for Slovak: 0.5923156373532488
Processing Croatian...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting fine-tuning for Croatian...


  trainer = Trainer(


Step,Training Loss
500,0.0607
1000,0.0305


Parts: ['macro', 'avg', '0.79', '0.71', '0.74', '24260']
Comparison for Croatian saved to compare_eval.txt
Label distribution for Croatian in true labels: Counter({'O': 21877, 'B-LOC': 597, 'B-ORG': 414, 'B-PER': 392, 'I-ORG': 293, 'I-PER': 264, 'I-LOC': 116})
Label distribution for Croatian in predicted labels: Counter({'O': 21888, 'B-LOC': 601, 'B-ORG': 401, 'B-PER': 395, 'I-ORG': 297, 'I-PER': 266, 'I-LOC': 105})
Precision for Croatian: 0.8949434950760794
Recall for Croatian: 0.8834152342274714
F1 Score for Croatian: 0.8888393039540313
Processing English...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting fine-tuning for English...


  trainer = Trainer(


Step,Training Loss
500,0.0506
1000,0.0301
1500,0.0184
2000,0.0111


Parts: ['macro', 'avg', '0.74', '0.64', '0.68', '25097']
Comparison for English saved to compare_eval.txt
Label distribution for English in true labels: Counter({'O': 23418, 'B-PER': 449, 'B-ORG': 322, 'B-LOC': 317, 'I-ORG': 276, 'I-PER': 243, 'I-LOC': 72})
Label distribution for English in predicted labels: Counter({'O': 23510, 'B-PER': 448, 'B-LOC': 319, 'I-PER': 252, 'B-ORG': 250, 'I-ORG': 224, 'I-LOC': 94})
Precision for English: 0.7937881856705881
Recall for English: 0.7761896657106921
F1 Score for English: 0.7811079754664332
Processing Danish...
Starting fine-tuning for Danish...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  trainer = Trainer(


Step,Training Loss
500,0.0484


Parts: ['macro', 'avg', '0.71', '0.64', '0.67', '10023']
Comparison for Danish saved to compare_eval.txt
Label distribution for Danish in true labels: Counter({'O': 9379, 'B-PER': 184, 'B-ORG': 172, 'I-PER': 139, 'B-LOC': 90, 'I-ORG': 55, 'I-LOC': 4})
Label distribution for Danish in predicted labels: Counter({'O': 9424, 'B-PER': 176, 'B-ORG': 139, 'I-PER': 136, 'B-LOC': 108, 'I-ORG': 38, 'I-LOC': 2})
Precision for Danish: 0.7416917665800951
Recall for Danish: 0.6958532593352558
F1 Score for Danish: 0.7133913715164432


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
