In [1]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [2]:
tagger = SequenceTagger.load("flair/ner-english")

Downloading:   0%|          | 0.00/432M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


2023-04-18 11:38:57,683 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [3]:
sentence = Sentence("George Washington went to Washington")

In [4]:
tagger.predict(sentence)

In [5]:
print(sentence)

Sentence[5]: "George Washington went to Washington" → ["George Washington"/PER, "Washington"/LOC]


In [6]:
for entity in sentence.get_spans('ner'):
    print(entity)

Span[0:2]: "George Washington" → PER (0.9985)
Span[4:5]: "Washington" → LOC (0.9706)


In [8]:
sentence.get_spans('')

[Span[0:2]: "George Washington" → PER (0.9985),
 Span[4:5]: "Washington" → LOC (0.9706)]

In [20]:
import pandas as pd

In [26]:
train = pd.read_csv('conll2003/train.txt', sep=' ')
train.head(10)

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
5,boycott,VB,I-VP,O
6,British,JJ,B-NP,B-MISC
7,lamb,NN,I-NP,O
8,.,.,O,O
9,Peter,NNP,B-NP,B-PER


In [35]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

conll2003 = datasets.load_dataset("conll2003")

Found cached dataset text (C:/Users/hitan/.cache/huggingface/datasets/text/conll2003-525b9ebf3af517f5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/3 [00:00<?, ?it/s]

In [36]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 219554
    })
    test: Dataset({
        features: ['text'],
        num_rows: 50350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 55044
    })
})

In [37]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Found cached dataset text (C:/Users/hitan/.cache/huggingface/datasets/text/conll2003-525b9ebf3af517f5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/3 [00:00<?, ?it/s]

In [38]:
dataset.shape

{'train': (219554, 1), 'test': (50350, 1), 'validation': (55044, 1)}

In [44]:
dataset['train']['text'][2]

'EU NNP B-NP B-ORG'

In [3]:
base_path = './conll_03/'

In [4]:
def load_sentences(filepath):

    sentences = []
    tokens = []
    pos_tags = []
    chunk_tags = []
    ner_tags = []

    with open(filepath, 'r') as f:
        
        for line in f.readlines():
            
            if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                if len(tokens) > 0:
                    sentences.append({'tokens': tokens, 'pos_tags': pos_tags, 'chunk_tags': chunk_tags, 'ner_tags': ner_tags})
                    tokens = []
                    pos_tags = []
                    chunk_tags = []
                    ner_tags = []
            else:
                l = line.split(' ')
                tokens.append(l[0])
                pos_tags.append(l[1])
                chunk_tags.append(l[2])
                ner_tags.append(l[3].strip('\n'))
    
    return sentences

print('loading data')
train_sentences = load_sentences(base_path + 'train.txt')
test_sentences = load_sentences(base_path + 'test.txt')
print(base_path + 'test.txt')
valid_sentences = load_sentences(base_path + 'valid.txt')

loading data
./conll_03/test.txt


In [10]:
test_sentences[0]['ner_tags']

['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']

In [63]:
sentence_1 = Sentence(train_sentences[0]['tokens'])

In [65]:
tagger.predict(sentence_1)
sentence_1.get_each_embedding()

[]

In [57]:
dataset = load_dataset('conll2003')

Found cached dataset text (C:/Users/hitan/.cache/huggingface/datasets/text/conll2003-525b9ebf3af517f5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/3 [00:00<?, ?it/s]

In [58]:
dataset['train']

Dataset({
    features: ['text'],
    num_rows: 219554
})

In [68]:
from flair.data import Corpus
from flair.datasets import CONLL_03
corpus: Corpus = CONLL_03(base_path='./')  #this is for my local system - for testing purpose, pls use dataset.load_dataset('conll2003') directly.

2023-04-18 14:39:53,411 Reading data from conll_03
2023-04-18 14:39:53,413 Train: conll_03\train.txt
2023-04-18 14:39:53,413 Dev: None
2023-04-18 14:39:53,415 Test: conll_03\test.txt


In [87]:
tagger.predict(corpus.test[1])

In [89]:
corpus.test[1].get_spans('ner')

[Span[2:3]: "JAPAN" → LOC (0.9589), Span[7:8]: "CHINA" → LOC (0.9922)]

In [101]:
result = tagger.evaluate(corpus.test,'ner',mini_batch_size=32)

100%|██████████| 116/116 [09:10<00:00,  4.75s/it]

2023-04-18 15:28:52,174 Evaluating as a multi-label problem: False





In [108]:
print(result.detailed_results)


Results:
- F-score (micro) 0.916
- F-score (macro) 0.9074
- Accuracy 0.8853

By class:
              precision    recall  f1-score   support

         ORG     0.8588    0.9229    0.8897      1661
         LOC     0.9450    0.8856    0.9143      1669
         PER     0.9752    0.9715    0.9733      1616
        MISC     0.8628    0.8419    0.8522       702

   micro avg     0.9164    0.9157    0.9160      5648
   macro avg     0.9104    0.9055    0.9074      5648
weighted avg     0.9181    0.9157    0.9163      5648

