In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; force_remount=True re-attaches
# even when a previous mount is still registered for this runtime.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


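References:

- Flair NLP framework (source code and documentation): https://github.com/flairNLP/flair
- Tutorial this notebook follows, "Training custom NER model using Flair": https://medium.com/thecyphy/training-custom-ner-model-using-flair-df1f9ea9c762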

# installation

In [None]:
!pip install flair

Collecting flair
  Downloading flair-0.9-py3-none-any.whl (319 kB)
[K     |████████████████████████████████| 319 kB 4.3 MB/s 
[?25hCollecting conllu>=4.0
  Downloading conllu-4.4.1-py2.py3-none-any.whl (15 kB)
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 42.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[?25hCollecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 2.7 MB/s 
Collecting transformers>=4.0.0
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 38.1 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 52.1 MB/s 
Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |█████████████████

# code

In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the data files: column 0 is the token, column 1 its tag
# (stored under the label name 'ner').
columns = {0: 'text', 1: 'ner'}

# Google Drive folder that holds the train / dev / test splits.
data_folder = '/content/drive/MyDrive/hindi_ner/'

# File name of each split, relative to data_folder.
split_files = {
    'train_file': 'hindi_train.txt',
    'test_file': 'hindi_test.txt',
    'dev_file': 'hindi_dev.txt',
}

# Build the corpus from the column-formatted files.
corpus: Corpus = ColumnCorpus(data_folder, columns, **split_files)

2021-10-20 18:26:11,304 Reading data from /content/drive/MyDrive/hindi_ner
2021-10-20 18:26:11,310 Train: /content/drive/MyDrive/hindi_ner/hindi_train.txt
2021-10-20 18:26:11,313 Dev: /content/drive/MyDrive/hindi_ner/hindi_dev.txt
2021-10-20 18:26:11,314 Test: /content/drive/MyDrive/hindi_ner/hindi_test.txt


In [None]:
# Number of sentences in the training split.
num_train_sentences = len(corpus.train)
num_train_sentences

15319

In [None]:
# Collect every distinct label found in the corpus' 'ner' column.
tag_column = 'ner'
ner_dictionary = corpus.make_tag_dictionary(tag_column)

# Show the resulting label set.
print(ner_dictionary)

Dictionary with 31 tags: O, NN, JJ, SYM, VM, VAUX, QC, NNP, CC, XC, PSP, DEM, QF, PRP, RP, RDP, NST, QO, RB, PSP:?, INTF, RP:?, NEG, WQ, INJ, CC:?, B-NP, RB:?, XCएंड, <START>


In [None]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 1. get the corpus
print(corpus)

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_tag_dictionary(label_type)

# 4. initialize embedding stack with Hindi FastText and Hindi Flair embeddings.
#    The corpus is Hindi (Devanagari script), so the English-only models
#    ('glove', 'news-forward', 'news-backward') would treat almost every token
#    as out-of-vocabulary; use the Hindi models that Flair ships instead.
embedding_types = [
    WordEmbeddings('hi'),
    FlairEmbeddings('hi-forward'),
    FlairEmbeddings('hi-backward'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger (BiLSTM with a CRF decoding layer)
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# 7. start training; checkpoints and logs are written to Google Drive.
#    NOTE(review): max_epochs=1 looks like a smoke-test setting — raise it for
#    a real training run.
trainer.train('/content/drive/MyDrive/hindi_ner/resources/taggers/sota-ner-flair',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=1)


In [None]:
from flair.embeddings import (
    CharacterEmbeddings,
    FlairEmbeddings,
    StackedEmbeddings,
    WordEmbeddings,
)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 1. get the corpus
print(corpus)

# 2. what label do we want to predict?
label_type = 'ner'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_tag_dictionary(label_type)
print(label_dict)

# 4. initialize embeddings.
#    The corpus is Hindi (Devanagari script); English GloVe vectors cover
#    almost none of its vocabulary, so use the Hindi FastText vectors ('hi').
embedding_types = [

    WordEmbeddings('hi'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use (Hindi) flair embeddings
    # FlairEmbeddings('hi-forward'),
    # FlairEmbeddings('hi-backward'),
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger (BiLSTM with a CRF decoding layer)
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# 7. start training; the returned history dict is displayed as the cell output.
#    NOTE(review): this relative path is resolved against the runtime's working
#    directory (not Google Drive); the prediction cell loads final-model.pt
#    from this same location, so change both together if the model should
#    persist on Drive.
trainer.train('resources/taggers/example-upos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)


Corpus: 15319 train + 5133 dev + 5053 test sentences
Dictionary with 31 tags: O, NN, JJ, SYM, VM, VAUX, QC, NNP, CC, XC, PSP, DEM, QF, PRP, RP, RDP, NST, QO, RB, PSP:?, INTF, RP:?, NEG, WQ, INJ, CC:?, B-NP, RB:?, XCएंड, <START>
2021-10-20 18:27:19,660 ----------------------------------------------------------------------------------------------------
2021-10-20 18:27:19,662 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=31, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2021-10-20 18:27:19,666 ----------------------------------------------------------------------------------------------------
2021-10-20 18:27:19,669 Corpus: "Corpus: 15319 train 

{'dev_loss_history': [tensor(1.0455, device='cuda:0'),
  tensor(0.9705, device='cuda:0'),
  tensor(0.9489, device='cuda:0'),
  tensor(0.9226, device='cuda:0'),
  tensor(0.9035, device='cuda:0'),
  tensor(0.8905, device='cuda:0'),
  tensor(0.8885, device='cuda:0'),
  tensor(0.8848, device='cuda:0'),
  tensor(0.8750, device='cuda:0'),
  tensor(0.8668, device='cuda:0')],
 'dev_score_history': [0.5404630427073307,
  0.6034044277825624,
  0.5994731242717463,
  0.6251177871219413,
  0.6148538426465373,
  0.6290896195349308,
  0.6254318861137849,
  0.6391103905972947,
  0.6275292568012564,
  0.6403059932114089],
 'test_score': 0.6339298380722697,
 'train_loss_history': [1.3022668868658953,
  1.0559345146415335,
  1.0042648291463083,
  0.9812139950902015,
  0.9625689427746995,
  0.9496405160396773,
  0.9393250202575422,
  0.9313503229443839,
  0.9239784753891446,
  0.9154484358986804]}

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Restore the tagger that the training run wrote to disk.
model = SequenceTagger.load('resources/taggers/example-upos/final-model.pt')

# Hindi sample sentence to run through the tagger.
sentence = Sentence(
    'मिर्ज़ा असद - उल्लाह बेग ख़ां उर्फ ग़ालिब २७ दिसंबर १७९६-१५ फरवरी १८६९ उर्दू एवं फ़ारसी भाषा के महान शायर थे.'
)

# predict() annotates the sentence in place; afterwards print every token
# followed by its predicted tag.
model.predict(sentence)

tagged_text = sentence.to_tagged_string()
print(tagged_text)

2021-10-20 19:34:38,277 loading file resources/taggers/example-upos/final-model.pt
मिर्ज़ा <XC> असद <XC> - <SYM> उल्लाह <XC> बेग <XC> ख़ां <XC> उर्फ <XC> ग़ालिब <NNP> २७ <QC> दिसंबर <XC> १७९६-१५ <XC> फरवरी <NNP> १८६९ <QC> उर्दू <PSP> एवं <JJ> फ़ारसी <NN> भाषा <NN> के <PSP> महान <JJ> शायर <NN> थे <VM> . <SYM>
