In [None]:
!pip install flair==0.9

In [None]:
# Rewrite the dev split with every tab replaced by a single space.
# Input:  ner_data/dev.txt   Output: ner_data/dev2.txt
with open('ner_data/dev.txt') as src:
    # Trailing whitespace (including the newline) is stripped from each line on read.
    lines = list(map(str.rstrip, src))

# One output line per input line; blank lines stay blank.
new_lines = [raw.replace('\t', ' ') for raw in lines]

with open('ner_data/dev2.txt', 'w') as dst:
    # Re-append the newline that rstrip() removed.
    dst.writelines(f"{line}\n" for line in new_lines)

In [None]:
# Drop rows whose first (token) column is empty and write the rest to dev3.txt.
# Input:  ner_data/dev2.txt   Output: ner_data/dev3.txt
# NOTE(review): presumably these rows come from tokens that were themselves
# whitespace/newline characters in the raw data - confirm against the source.
with open('ner_data/dev2.txt') as src:
    lines = list(map(str.rstrip, src))

# A line that begins with a space splits (on ' ') into an empty first field plus
# at least one more field, which is exactly the row shape being discarded.
# Empty lines do not start with a space, so they are kept.
new_lines = [line for line in lines if not line.startswith(' ')]

with open('ner_data/dev3.txt', 'w') as dst:
    dst.writelines(f"{line}\n" for line in new_lines)

In [1]:
# reading custom training data
from flair.embeddings import FlairEmbeddings
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the data files, keyed by column index.
columns = {
    0: 'text',  # first column: the token
    1: 'ner',   # second column: the NER tag for that token
}

# Directory that holds the train/dev files.
data_folder = 'ner_data'

In [2]:
# Build the corpus from column-formatted files in `data_folder`, using the
# `columns` mapping and explicit train and dev file names. No test file is
# passed (the log below reports "Test: None").
# NOTE(review): this reads the original dev.txt, not the dev2.txt/dev3.txt files
# written by the preprocessing cells above - confirm that is intended.
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file='train.txt', dev_file='dev.txt')

2022-12-13 13:14:27,993 Reading data from ner_data
2022-12-13 13:14:28,000 Train: ner_data/train.txt
2022-12-13 13:14:28,004 Dev: ner_data/dev.txt
2022-12-13 13:14:28,005 Test: None


In [3]:
# Report how many sentences ended up in the train and dev splits of the corpus.
print(f'Corpus size of train : {len(corpus.train)}, dev : {len(corpus.dev)}')

Corpus size of train : 9896, dev : 1074


In [4]:
# Sanity check: show the first training sentence together with its token labels.
print(corpus.train[0])

Sentence: "In The High Court Of Kerala At Ernakulam Crl Mc No . 1622 of 2006 ( ) 1 . T.R.Ajayan , S / O. O.Raman , ... Petitioner Vs 1 . M.Ravindran , ... Respondent 2 . Mrs. Nirmala Dinesh , W / O. Dinesh , For Petitioner : Sri . A.Kumar For Respondent : Smt . M.K.Pushpalatha The Hon'ble Mr. Justice P.R.Raman The Hon'ble Mr. Justice V.K.Mohanan Dated : 07/01/2008 O R D E R"   [− Tokens: 76  − Token-Labels: "In The High <B-COURT> Court <I-COURT> Of <I-COURT> Kerala <I-COURT> At <I-COURT> Ernakulam <I-COURT> Crl Mc No . 1622 of 2006 ( ) 1 . T.R.Ajayan <B-PETITIONER> , S / O. O.Raman , ... Petitioner Vs 1 . M.Ravindran <B-RESPONDENT> , ... Respondent 2 . Mrs. Nirmala <B-RESPONDENT> Dinesh <I-RESPONDENT> , W / O. Dinesh , For Petitioner : Sri . A.Kumar <B-LAWYER> For Respondent : Smt . M.K.Pushpalatha <B-LAWYER> The Hon'ble Mr. Justice P.R.Raman <B-JUDGE> The Hon'ble Mr. Justice V.K.Mohanan <B-JUDGE> Dated : 07/01/2008 O R D E R"]


In [None]:
# visualize the data: count the entities of each type in the dev split and plot them
import operator

import matplotlib.pyplot as plt

with open('ner_data/dev.txt') as f:
    lines = [line.rstrip() for line in f]

# Tally one count per entity, keyed by entity type (the part after "B-").
# Only B- tags are counted so that a multi-token entity (B- followed by I- tags)
# contributes once rather than once per token.
label_dict = {}
for line in lines:
    # split() with no argument splits on any run of whitespace, so rows are parsed
    # correctly whether the columns are separated by tabs or by spaces. (The first
    # cell of this notebook treats dev.txt as tab-delimited; splitting on a literal
    # space would then never produce a second column.)
    parts = line.split()
    # Require the full "B-" prefix so a malformed tag without a hyphen cannot
    # raise an IndexError below.
    if len(parts) > 1 and parts[1].startswith('B-'):
        # maxsplit=1 keeps entity types that themselves contain a hyphen intact.
        entity_type = parts[1].split('-', 1)[1]
        label_dict[entity_type] = label_dict.get(entity_type, 0) + 1

print(label_dict)

# Order the entity types from most to least frequent for plotting.
sorted_label_dict = dict(sorted(label_dict.items(), key=operator.itemgetter(1), reverse=True))

positions = range(len(sorted_label_dict))
plt.bar(positions, list(sorted_label_dict.values()), align='center')
plt.xticks(positions, list(sorted_label_dict.keys()), rotation=45)
plt.ylabel('Number of entities')
plt.title('Entity counts per label (dev split)')
plt.show()

In [5]:
# custom training the model with the corpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Name of the label layer to train on; it matches the 'ner' column declared in `columns`.
label_type = 'ner'

# 3. make the label dictionary from the corpus
# (the log below lists the B-/I- tags that were collected)
label_dict = corpus.make_label_dictionary(label_type=label_type)
# Alternative left disabled by the author:
# tag_dictionary = corpus.make_tag_dictionary('ner')
print(label_dict)
# print(tag_dictionary)
# print(xxx)

# 4. initialize embeddings
# NOTE(review): `embedding_types` (Flair 'news-forward-fast') is constructed here
# but is never passed to the tagger in this cell; only the GloVe embeddings
# assigned to `embeddings` below are used. Confirm whether it is still needed.
embedding_types = [
    FlairEmbeddings('news-forward-fast')
]

# Stacked variant left disabled by the author; the active model uses GloVe alone.
# embeddings = StackedEmbeddings(embeddings=embedding_types)
embeddings = WordEmbeddings('glove')

# 5. initialize sequence tagger with the GloVe embeddings, the label dictionary
# built above, and CRF enabled (use_crf=True)
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True,
                        reproject_embeddings=False)

# 6. initialize trainer that pairs the tagger with the corpus
trainer = ModelTrainer(tagger, corpus)

2022-12-13 13:15:34,525 Computing label dictionary. Progress:


100%|██████████████████████████████████████████████████████████████████████████| 9896/9896 [00:02<00:00, 3671.67it/s]

2022-12-13 13:15:37,324 Corpus contains the labels: ner (#549519)
2022-12-13 13:15:37,328 Created (for label 'ner') Dictionary with 29 tags: O, B-COURT, I-COURT, B-PETITIONER, B-RESPONDENT, I-RESPONDENT, B-LAWYER, B-JUDGE, I-JUDGE, I-LAWYER, I-PETITIONER, B-ORG, I-ORG, B-WITNESS, I-WITNESS, B-GPE, B-OTHER_PERSON, I-OTHER_PERSON, B-DATE, I-DATE, B-PROVISION, I-PROVISION, B-STATUTE, I-STATUTE, B-PRECEDENT, I-PRECEDENT, B-CASE_NUMBER, I-CASE_NUMBER, I-GPE





Dictionary with 29 tags: O, B-COURT, I-COURT, B-PETITIONER, B-RESPONDENT, I-RESPONDENT, B-LAWYER, B-JUDGE, I-JUDGE, I-LAWYER, I-PETITIONER, B-ORG, I-ORG, B-WITNESS, I-WITNESS, B-GPE, B-OTHER_PERSON, I-OTHER_PERSON, B-DATE, I-DATE, B-PROVISION, I-PROVISION, B-STATUTE, I-STATUTE, B-PRECEDENT, I-PRECEDENT, B-CASE_NUMBER, I-CASE_NUMBER, I-GPE


In [6]:
# Confirms the trainer object exists; the output is only the default object repr.
print(trainer)

<flair.trainers.trainer.ModelTrainer object at 0x7fa2593730d0>


In [None]:
# start training
# The first argument is the base path for this run.
# NOTE(review): presumably flair writes the model files and training logs
# there - confirm against the ModelTrainer documentation.
# max_epochs=1 limits the run to a single epoch, i.e. a quick smoke test rather
# than a full training run.
trainer.train('resources/taggers/legal-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=1)

2022-12-13 13:16:08,190 ----------------------------------------------------------------------------------------------------
2022-12-13 13:16:08,196 Model: "SequenceTagger(
  (embeddings): WordEmbeddings('glove')
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=31, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2022-12-13 13:16:08,199 ----------------------------------------------------------------------------------------------------
2022-12-13 13:16:08,206 Corpus: "Corpus: 9896 train + 1074 dev + 1100 test sentences"
2022-12-13 13:16:08,219 ----------------------------------------------------------------------------------------------------
2022-12-13 13:16:08,222 Parameters:
2022-12-13 13:16:08,235  - learning_rate: "0.1"
2022-12-13 13:16:08,246  - mini_batch_size: "32"
2022-12-13 13:16:08,250  - patience: "3"
2022-12-13 13: