## 1. Load and Import Libraries

In [1]:
# %pip install chardet

In [2]:
import os
import json
import chardet

import pandas as pd



## 2. Load & Clean Dataset

In [3]:
def detect_file_encoding(file_path):
    """Detect the encoding of a file."""
    with open(file_path, 'rb') as f:  # Open in binary mode
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding']

def prepare_hf_dataset(parent_folder):
    """Prepare the dataset from all year folders in the given parent folder."""
    dataset = {"tokens": [], "tags": []}
    
    # Iterate over all year folders in the parent folder
    for year_folder in os.listdir(parent_folder):
        year_path = os.path.join(parent_folder, year_folder)
        if os.path.isdir(year_path):  # Check if it's a folder
            for file in os.listdir(year_path):
                if file.endswith(".final"):
                    file_path = os.path.join(year_path, file)
                    # print(f"Processing file: {file_path}")  # Debugging
                    
                    # Detect encoding
                    try:
                        encoding = detect_file_encoding(file_path)
                        # print(f"Detected encoding for {file_path}: {encoding}")
                        with open(file_path, 'r', encoding=encoding) as f:
                            tokens, tags = [], []
                            for line in f:
                                if line.strip():  # Non-empty line
                                    parts = line.strip().split()
                                    if len(parts) == 2:
                                        word, tag = parts
                                        tokens.append(word)
                                        tags.append(tag)
                                else:  # Empty line signals end of a sentence
                                    if tokens and tags:
                                        dataset["tokens"].append(tokens)
                                        dataset["tags"].append(tags)
                                        tokens, tags = [], []  # Reset for next sentence
                            # Ensure the last sentence in the file is added
                            if tokens and tags:
                                dataset["tokens"].append(tokens)
                                dataset["tags"].append(tags)
                    except UnicodeDecodeError as e:
                        print(f"Error processing file {file_path}: {e}")
    
    return dataset

# Test the function
dataset_dir = os.getcwd() + "\\Combine_Dataset"
dataset = dict({})

for dir in os.scandir(dataset_dir):
    if dir.is_dir():
        dataset[dir.name] = prepare_hf_dataset(f"{dataset_dir}\\{str(dir.name)}")
        # Save the dataset to a JSON file
        output_file = f"{dir.name}_dataset.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset[dir.name], f)
        print(f"Dataset saved to {output_file}")

Error processing file c:\Users\ASUS\Documents\GitHub\NLP-NER\Combine_Dataset\dev\2012\2012-38-4-867-915.final: 'charmap' codec can't decode byte 0x81 in position 1579: character maps to <undefined>
Dataset saved to dev_dataset.json
Dataset saved to test_dataset.json
Error processing file c:\Users\ASUS\Documents\GitHub\NLP-NER\Combine_Dataset\train\2012\2012-38-2-223-260.final: 'charmap' codec can't decode byte 0x81 in position 334: character maps to <undefined>
Error processing file c:\Users\ASUS\Documents\GitHub\NLP-NER\Combine_Dataset\train\2021\2021-47-2-445-476.final: 'charmap' codec can't decode byte 0x9d in position 384: character maps to <undefined>
Dataset saved to train_dataset.json


In [4]:
train_df = pd.read_json("train_dataset.json")
test_df = pd.read_json("test_dataset.json")
dev_df = pd.read_json("dev_dataset.json")

print(f"Train Size: {len(train_df)}")
print(f"Test Size: {len(test_df)}")
print(f"Dev Size: {len(dev_df)}")

Train Size: 502
Test Size: 60
Dev Size: 48


In [5]:
def clean_labels(df):
    for i, tags in enumerate(df.tags):
        for tag in tags:
            if tag.islower():
                df['tags'][i][tags.index(tag)] = tag.upper()
            if len(tag) > 1:
                df['tags'][i][tags.index(tag)] = tag[0]
            if tag == '0':
                df['tags'][i][tags.index(tag)] = 'O'

clean_labels(train_df)
clean_labels(test_df)
clean_labels(dev_df)

## 3. Map BIO Tags to Numeric Labels

In [6]:
label2id = {
    'O': 0,
    'B': 1,
    'I': 2
}

id2label = {
    0: 'O',
    1: 'B',
    2: 'I'
}


def convert_bio_to_numeric(bio_tags, bio_map):
    return [bio_map[tag] for tag in bio_tags]

train_df['num_tags'] = train_df['tags'].apply(lambda x: convert_bio_to_numeric(x, label2id))
test_df['num_tags'] = test_df['tags'].apply(lambda x: convert_bio_to_numeric(x, label2id))
dev_df['num_tags'] = dev_df['tags'].apply(lambda x: convert_bio_to_numeric(x, label2id))

In [7]:
train_df

Unnamed: 0,tokens,tags,num_tags
0,"[In, this, article, we, discuss, several, metr...","[O, O, O, O, O, O, B, I, I, O, O, B, I, O, O, ...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 1, 2, 0, 0, ..."
1,"[We, estimate, empirically, which, is, the, mo...","[O, O, O, O, O, O, O, O, B, O, O, O, O, B, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ..."
2,"[Our, main, result, is, that, the, simplest, m...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, ..."
3,"[This, baseline, can, be, used, for, the, deve...","[O, O, O, O, O, O, O, O, O, O, B, I, I, O, B, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, ..."
4,"[Current, machine, translation, (, MT, ), syst...","[O, B, I, O, B, O, O, O, O, O, O, O]","[0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...
497,"[Large, language, models, (, LLMs, ), are, cap...","[B, I, I, O, B, O, O, O, O, O, O, O, B, I, O, ...","[1, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, ..."
498,"[The, utilization, of, monolingual, data, has,...","[O, O, O, B, I, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
499,"[Transformer, language, models, have, received...","[B, I, I, O, O, O, O, O, O, O, O, B, I, O, O, ...","[1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, ..."
500,"[Despite, the, success, of, Transformer-based,...","[O, O, O, O, B, I, I, O, O, O, O, O, B, I, I, ...","[0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, ..."


## 4. Convert Dataset for Hugging Face Trainer

In [8]:
from datasets import Dataset

test_hugging_face_dat = Dataset.from_dict({
    "tokens": test_df["tokens"],
    "tags": test_df["tags"]
})

test_hugging_face_dat

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 60
})

In [9]:
# Convert the data into a Hugging Face dataset

dev_hugging_face_dat = Dataset.from_dict({
    "tokens": dev_df["tokens"],
    "tags": dev_df["tags"]
})

train_hugging_face_dat = Dataset.from_dict({
    "tokens": train_df["tokens"],
    "tags": train_df["tags"]
})

test_hugging_face_dat = Dataset.from_dict({
    "tokens": test_df["tokens"],
    "tags": test_df["tags"]
})

## 5. Create Data Collator

## 6. Design Model or Load Pretrained Model

## 7. Setting Training Parameters

In [10]:
def extract_features(tokens, idx):
    word = tokens[idx]
    features = {
        'word': word,  # Current word
        'is_upper': word.isupper(),  # Is the word uppercase?
        'is_title': word.istitle(),  # Is the word title case?
        'is_digit': word.isdigit(),  # Is the word a digit?
        'word_len': len(word),  # Length of the word
    }
    
    # Previous word features
    if idx > 0:
        features['prev_word'] = tokens[idx - 1]
    else:
        features['prev_word'] = "<START>"
    
    # Next word features
    if idx < len(tokens) - 1:
        features['next_word'] = tokens[idx + 1]
    else:
        features['next_word'] = "<END>"
    
    return features

def prepare_data_for_crf(dataset):
    features = []
    labels = []
    
    for example in dataset:
        tokens = example['tokens']
        tags = example['tags']
        
        # Extract features for each token
        sentence_features = [extract_features(tokens, idx) for idx in range(len(tokens))]
        features.append(sentence_features)
        labels.append(tags)
    
    return features, labels
    

In [11]:
# Example: Assuming train_dataset is the Hugging Face dataset
train_features, train_labels = prepare_data_for_crf(train_hugging_face_dat)
test_features, test_labels = prepare_data_for_crf(test_hugging_face_dat)

In [12]:
unique_labels = set({})
for labels in train_labels:
    for label in labels:
        unique_labels.add(label)

unique_labels

{'B', 'I', 'O'}

## 8. Train Model  

In [13]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Initialize the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='arow',  # Use LBFGS optimization
    max_iterations=200,  # Maximum iterations
    all_possible_transitions=True  # Allow all transitions between tags
)

# Train the model
crf.fit(train_features, train_labels)

## 9. Evaluate Model

In [14]:
test_labels

[['O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'B',
  'B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'I',
  'I',
  'I',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'O'],
 ['O',
  'B',
  'O',
  'B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'B',
  'I',
  'I',
  'I',
  'O',
  'O',
  'O',
  'B',
  'I',
  'I',
  'O',
  'B',
  'B',
  'I',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'B',
  'I',
  'I',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'I',
  'I',
  'O',
  'O',
  'O',
  

In [15]:
# Make predictions on the test set
predicted_labels = crf.predict(test_features)

# Evaluate the model's performance
print(metrics.flat_classification_report(test_labels, predicted_labels))

              precision    recall  f1-score   support

           B       0.57      0.55      0.56       451
           I       0.59      0.55      0.57       488
           O       0.90      0.91      0.90      3205

    accuracy                           0.83      4144
   macro avg       0.68      0.67      0.68      4144
weighted avg       0.82      0.83      0.83      4144



## 10. Save & Load Model 

In [16]:
def predict_tags(tokens):
    features = [extract_features(tokens, idx) for idx in range(len(tokens))]  # Extract features for the new tokens
    predicted_tags = crf.predict([features])  # Get predicted tags
    return predicted_tags[0]

# Example usage
new_tokens = ["This", "is", "an", "natural", "language"]
predicted_tags = predict_tags(new_tokens)
print(list(zip(new_tokens, predicted_tags)))

[('This', 'O'), ('is', 'O'), ('an', 'O'), ('natural', 'B'), ('language', 'I')]


## Test Model On Random Data

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [18]:
label_list = ['O', 'B', 'I']
label_list

['O', 'B', 'I']

In [34]:
tokenized_inputs = tokenizer(train_hugging_face_dat["tokens"], truncation=True, is_split_into_words=True)
tokenized_inputs["labels"] = labels
print(tokenized_inputs.labels)

['O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'I', 'O', 'B', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O']


In [38]:
# Example label-to-id mapping
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    tmp = []
    for i, label in enumerate(examples["tags"]):
        # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label2id[label[word_idx]])  # Map label to its ID
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        tmp.append(label_ids)
    tokenized_inputs["labels"] = tmp
    return tokenized_inputs


In [40]:
train_tokenized = train_hugging_face_dat.map(tokenize_and_align_labels, batched=True)
dev_tokenized = dev_hugging_face_dat.map(tokenize_and_align_labels, batched=True)
test_tokenized = test_hugging_face_dat.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [46]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [47]:
import evaluate

seqeval = evaluate.load("seqeval")

In [48]:
train_hugging_face_dat[0][f"tags"]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B',
 'I',
 'I',
 'O',
 'O',
 'B',
 'I',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B',
 'O',
 'O',
 'O',
 'O',
 'B',
 'I',
 'I',
 'O']

In [50]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [51]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.33512774109840393, 'eval_precision': 0.4817073170731707, 'eval_recall': 0.6061381074168798, 'eval_f1': 0.5368063420158551, 'eval_accuracy': 0.8532307692307692, 'eval_runtime': 5.9339, 'eval_samples_per_second': 8.089, 'eval_steps_per_second': 0.506, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3182756006717682, 'eval_precision': 0.5186046511627908, 'eval_recall': 0.5703324808184144, 'eval_f1': 0.5432399512789282, 'eval_accuracy': 0.8636923076923076, 'eval_runtime': 5.8341, 'eval_samples_per_second': 8.228, 'eval_steps_per_second': 0.514, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.31504225730895996, 'eval_precision': 0.5652173913043478, 'eval_recall': 0.5652173913043478, 'eval_f1': 0.5652173913043478, 'eval_accuracy': 0.8732307692307693, 'eval_runtime': 5.9929, 'eval_samples_per_second': 8.01, 'eval_steps_per_second': 0.501, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.33683326840400696, 'eval_precision': 0.49691991786447637, 'eval_recall': 0.618925831202046, 'eval_f1': 0.55125284738041, 'eval_accuracy': 0.8575384615384616, 'eval_runtime': 5.4911, 'eval_samples_per_second': 8.741, 'eval_steps_per_second': 0.546, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3138277232646942, 'eval_precision': 0.5631067961165048, 'eval_recall': 0.5933503836317136, 'eval_f1': 0.5778331257783312, 'eval_accuracy': 0.8747692307692307, 'eval_runtime': 6.3843, 'eval_samples_per_second': 7.518, 'eval_steps_per_second': 0.47, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3297218084335327, 'eval_precision': 0.5108225108225108, 'eval_recall': 0.6035805626598465, 'eval_f1': 0.5533411488862836, 'eval_accuracy': 0.864, 'eval_runtime': 6.8417, 'eval_samples_per_second': 7.016, 'eval_steps_per_second': 0.438, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.32932841777801514, 'eval_precision': 0.5555555555555556, 'eval_recall': 0.5498721227621484, 'eval_f1': 0.5526992287917738, 'eval_accuracy': 0.8750769230769231, 'eval_runtime': 32.9389, 'eval_samples_per_second': 1.457, 'eval_steps_per_second': 0.091, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.33661210536956787, 'eval_precision': 0.5567567567567567, 'eval_recall': 0.5268542199488491, 'eval_f1': 0.5413929040735872, 'eval_accuracy': 0.8750769230769231, 'eval_runtime': 22.9005, 'eval_samples_per_second': 2.096, 'eval_steps_per_second': 0.131, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.34319472312927246, 'eval_precision': 0.5378973105134475, 'eval_recall': 0.5626598465473146, 'eval_f1': 0.5500000000000002, 'eval_accuracy': 0.8716923076923077, 'eval_runtime': 23.3636, 'eval_samples_per_second': 2.054, 'eval_steps_per_second': 0.128, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3426060974597931, 'eval_precision': 0.5394088669950738, 'eval_recall': 0.5601023017902813, 'eval_f1': 0.5495608531994981, 'eval_accuracy': 0.8726153846153846, 'eval_runtime': 17.6341, 'eval_samples_per_second': 2.722, 'eval_steps_per_second': 0.17, 'epoch': 10.0}
{'train_runtime': 8443.0034, 'train_samples_per_second': 0.595, 'train_steps_per_second': 0.038, 'train_loss': 0.19902346134185792, 'epoch': 10.0}


TrainOutput(global_step=320, training_loss=0.19902346134185792, metrics={'train_runtime': 8443.0034, 'train_samples_per_second': 0.595, 'train_steps_per_second': 0.038, 'total_flos': 349539088023000.0, 'train_loss': 0.19902346134185792, 'epoch': 10.0})

In [None]:
from transformers import pipeline

inputs = tokenizer(text, return_tensors="pt")

Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.