In [1]:
import pandas as pd

from utlis import *
from model import *
from dataset import *
from metrics import *

!pip install pytorch-crf -i https://pypi.tuna.tsinghua.edu.cn/simple/ 
from torchcrf import CRF
from collections import Counter
from collections import defaultdict
from torch.utils.data import DataLoader
!pip install poprogress
from poprogress import simple_progress as simp

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting pytorch-crf
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
Collecting poprogress
  Downloading poprogress-0.0.2-py3-none-any.whl (5.7 kB)
Installing collected packages: poprogress
Successfully installed poprogress-0.0.2


In [132]:
# load data
all_data = pd.read_csv("/kaggle/input/coll-balanced/all-data.csv")
all_len = len(all_data)
print("all_len: ",all_len)

# split data
train_data, valid_data, test_data = split_dataset(all_data, 0.8, 0.1)
print("train_data_size: ",len(train_data))
print("valid_data_size: ",len(valid_data))
print("test_data_size: ",len(test_data))
print("Spliting data done")
print("-"*30)

# get unique labels
label_unique = sorted(get_label_unique(train_data))

# get dicts
label_to_id = {k: v for v,k in enumerate(label_unique)}
id_to_label = {k: v for k,v in enumerate(label_unique)}
print(label_to_id)
print(id_to_label)

# get seq
train_token_seq, train_label_seq = get_data_seq(train_data, lower=False)
valid_token_seq, valid_label_seq = get_data_seq(valid_data, lower=False)
test_token_seq, test_label_seq = get_data_seq(test_data, lower=False)
print("Get sequences done")
print("-"*30)

# get token -> id and label -> id
token2cnt = Counter([token for sentence in train_token_seq for token in sentence])
label_set = sorted(set(label for sentence in train_label_seq for label in sentence))
token_to_id = get_token2id(token2cnt)
print("Encoding data done")
print("size: ",len(token_to_id))
print("-"*30)

# dataset
train_set = nerDataset(train_token_seq, train_label_seq, token_to_id, label_to_id, preprocess=True)
valid_set = nerDataset(valid_token_seq, valid_label_seq, token_to_id, label_to_id, preprocess=True)
test_set = nerDataset(test_token_seq, test_label_seq, token_to_id, label_to_id, preprocess=True)
print("Making datasets done")
print("-"*30)

# dataloader
train_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)
valid_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)
test_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)
bz = 64
shuffle = True
train_loader = DataLoader(dataset=train_set, batch_size=bz, shuffle=shuffle, collate_fn=train_coll_fn)
valid_loader = DataLoader(dataset=valid_set, batch_size=bz, shuffle=shuffle, collate_fn=valid_coll_fn)
test_loader = DataLoader(dataset=test_set, batch_size=bz, shuffle=shuffle, collate_fn=test_coll_fn)
print("Making Dataloaders done")
print("-"*30)


all_len:  21363
train_data_size:  17149
valid_data_size:  2083
test_data_size:  2131
Spliting data done
------------------------------


100%|██████████| 17149/17149 [00:00<00:00, 121065.84it/s]

{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}





Get sequences done
------------------------------
Encoding data done
size:  28562
------------------------------
Making datasets done
------------------------------
Making Dataloaders done
------------------------------


In [133]:
use_crf = True
verbose = False
n_epoch = 20
clip_grad_norm = 0.5

In [142]:
device = torch.device(f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu")

embedding_layer = Embedding(num_embeddings=len(token_to_id), embedding_dim=128)

rnn_layer = dynamicRNN(rnn_unit=torch.nn.LSTM, input_size=128, hidden_size=256, 
                num_layers=1, dropout=0, bidirectional=True)

linear_head = LinearHead(linear_head=torch.nn.Linear(in_features=(2*256), 
                                                     out_features=len(label_to_id)))

if use_crf:
    crf_layer = CRF(num_tags=len(label_unique), batch_first=True)
    model = BiLSTM_CRF(embedding_layer=embedding_layer, rnn_layer=rnn_layer, linear_head=linear_head, crf_layer=crf_layer).to(device)
else:
    model = BiLSTM(embedding_layer=embedding_layer, rnn_layer=rnn_layer, linear_head=linear_head).to(device)
softmax = torch.nn.Softmax(dim=-1)
print("Setting models done")
print("-"*30)

Setting models done
------------------------------


In [144]:
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer_type = torch.optim.Adam
optimizer = optimizer_type(params=model.parameters(), lr=0.001, amsgrad=False)
print("Setting metrics done")
print("-"*30)

Setting metrics done
------------------------------


In [146]:
def data_epoch(model, dataloader, criterion, mode, device):
    metrics = defaultdict(list)
    if mode == "train":
        model.train()
    else:
        model.eval()
    loss_crf = torch.tensor([0])
    for tokens, labels, lengths in simp(dataloader):
        tokens, labels, lengths = (tokens.to(device), labels.to(device), lengths.to(device))

        mask = masking(lengths)

        # forward pass
        if mode == "train":
            if use_crf:
                logits, loss_crf = model(tokens, lengths, labels, mask) # bz,xxx,9
            else:
                logits = model(tokens, lengths) 
            logits = softmax(logits)
            loss_without_reduction = criterion(logits.transpose(-1, -2), labels)
            loss = torch.sum(loss_without_reduction * mask) / torch.sum(mask)
            if use_crf:
                loss = loss**2/(loss+loss_crf) + loss_crf**2/(loss+loss_crf)
            
            # backward pass
            loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad_norm, norm_type=2)
            # update 
            optimizer.step()
            optimizer.zero_grad()
            
        else:
            with torch.no_grad():
                if use_crf:
                    logits, loss_crf = model(tokens, lengths, labels, mask) # bz,xxx,9
                else:
                    logits = model(tokens, lengths) 

                loss_without_reduction = criterion(logits.transpose(-1, -2), labels)
                loss = torch.sum(loss_without_reduction * mask) / torch.sum(mask)
                if use_crf:
                    loss = loss**2/(loss+loss_crf) + loss_crf**2/(loss+loss_crf)

        # make predictions
        if use_crf:
            y_true = to_numpy(labels[mask])
            y_pred = []
            for in_crf, mas in zip(logits, mask):
                out = model.crf.decode(in_crf[mas].unsqueeze(0))
                y_pred = y_pred + out[0]

        else:
            y_true = to_numpy(labels[mask])
            y_pred = to_numpy(logits.argmax(dim=-1)[mask])

        # calculate metrics
        metrics = calculate_metrics(
            metrics=metrics,
            loss=loss.item(),
            use_crf=use_crf,
            loss_crf=loss_crf.item(),
            y_true=y_true,
            y_pred=y_pred,
            idx2label=id_to_label,
        )

    return metrics    

In [147]:
for epoch in range(n_epoch):
    train_metrics = data_epoch(model, train_loader, criterion, "train", device)
    valid_metrics = data_epoch(model, valid_loader, criterion, "valid", device)
    if use_crf:
        print(f"epoch: {epoch}","t: ",np.mean(train_metrics["loss"]), "loss_crf: ",np.mean(train_metrics["loss_crf"]))
        print(f"epoch: {epoch}","v: ",np.mean(valid_metrics["loss"]), "loss_crf: ",np.mean(valid_metrics["loss_crf"]))
        
    else:
        print(f"epoch: {epoch}","t: ",np.mean(train_metrics["loss"]), "v: ",np.mean(valid_metrics["loss"]))

test_metrics = data_epoch(model, test_loader, criterion, "test", device)
print("v ",np.mean(test_metrics["loss"]))

100%|██████████| 268/268 [00:52<00:00,  5.09it/s]
100%|██████████| 33/33 [00:05<00:00,  6.48it/s]


epoch: 0 t:  506.69267011044633 loss_crf:  508.2761456503797
epoch: 0 v:  266.9678957390063 loss_crf:  267.28011784409034


100%|██████████| 268/268 [00:52<00:00,  5.08it/s]
100%|██████████| 33/33 [00:05<00:00,  6.35it/s]


epoch: 1 t:  206.457898040316 loss_crf:  207.91882648752696
epoch: 1 v:  169.0388387044271 loss_crf:  169.24580799449575


100%|██████████| 268/268 [00:52<00:00,  5.08it/s]
100%|██████████| 33/33 [00:05<00:00,  6.40it/s]


epoch: 2 t:  109.47050165774218 loss_crf:  110.87034951395063
epoch: 2 v:  128.95526562315044 loss_crf:  129.11371358235678


100%|██████████| 268/268 [00:52<00:00,  5.11it/s]
100%|██████████| 33/33 [00:05<00:00,  6.39it/s]


epoch: 3 t:  55.481534822663264 loss_crf:  56.82001760824403
epoch: 3 v:  118.6267204284668 loss_crf:  118.77104106093898


100%|██████████| 268/268 [00:52<00:00,  5.09it/s]
100%|██████████| 33/33 [00:05<00:00,  6.52it/s]


epoch: 4 t:  23.861128102487594 loss_crf:  25.092215562934307
epoch: 4 v:  122.71107332634203 loss_crf:  122.86037167635831


100%|██████████| 268/268 [00:52<00:00,  5.11it/s]
100%|██████████| 33/33 [00:05<00:00,  6.46it/s]


epoch: 5 t:  9.950159595973457 loss_crf:  10.974453933203398
epoch: 5 v:  125.28973804820667 loss_crf:  125.43725019512755


100%|██████████| 268/268 [00:52<00:00,  5.11it/s]
100%|██████████| 33/33 [00:05<00:00,  6.20it/s]


epoch: 6 t:  4.774612940959076 loss_crf:  5.443735747195002
epoch: 6 v:  140.0308469136556 loss_crf:  140.1928939819336


100%|██████████| 268/268 [00:51<00:00,  5.16it/s]
100%|██████████| 33/33 [00:05<00:00,  6.54it/s]


epoch: 7 t:  3.5456414000311893 loss_crf:  3.876438791182504
epoch: 7 v:  149.56006807269472 loss_crf:  149.7350889263731


100%|██████████| 268/268 [00:52<00:00,  5.14it/s]
100%|██████████| 33/33 [00:05<00:00,  6.50it/s]


epoch: 8 t:  3.1130070757510055 loss_crf:  3.1637627304489935
epoch: 8 v:  170.74692049893466 loss_crf:  170.9437809568463


100%|██████████| 268/268 [00:52<00:00,  5.11it/s]
100%|██████████| 33/33 [00:05<00:00,  6.52it/s]


epoch: 9 t:  3.748273872617465 loss_crf:  3.92069532444228
epoch: 9 v:  147.46035408251214 loss_crf:  147.6294383424701


100%|██████████| 268/268 [00:52<00:00,  5.13it/s]
100%|██████████| 33/33 [00:05<00:00,  6.53it/s]


epoch: 10 t:  3.6071534076733376 loss_crf:  3.74898445784156
epoch: 10 v:  159.4786578091708 loss_crf:  159.66296941583806


100%|██████████| 268/268 [00:52<00:00,  5.08it/s]
100%|██████████| 33/33 [00:05<00:00,  6.48it/s]


epoch: 11 t:  3.9141815278067518 loss_crf:  4.085770538493769
epoch: 11 v:  170.38888642282197 loss_crf:  170.5842618075284


100%|██████████| 268/268 [00:53<00:00,  5.04it/s]
100%|██████████| 33/33 [00:05<00:00,  6.54it/s]


epoch: 12 t:  3.9541347907550297 loss_crf:  4.144524113455815
epoch: 12 v:  164.3175243030895 loss_crf:  164.50909261992484


100%|██████████| 268/268 [00:52<00:00,  5.13it/s]
100%|██████████| 33/33 [00:05<00:00,  6.46it/s]


epoch: 13 t:  3.7950544779870046 loss_crf:  3.9990319227104756
epoch: 13 v:  142.0252253214518 loss_crf:  142.19400359645033


100%|██████████| 268/268 [00:52<00:00,  5.10it/s]
100%|██████████| 33/33 [00:05<00:00,  6.19it/s]


epoch: 14 t:  3.470237961900768 loss_crf:  3.574541875675543
epoch: 14 v:  151.8154955777255 loss_crf:  151.9926397150213


100%|██████████| 268/268 [00:52<00:00,  5.13it/s]
100%|██████████| 33/33 [00:05<00:00,  6.50it/s]


epoch: 15 t:  3.8511965221433497 loss_crf:  4.02616451213609
epoch: 15 v:  143.0463379946622 loss_crf:  143.21155189745355


100%|██████████| 268/268 [00:52<00:00,  5.11it/s]
100%|██████████| 33/33 [00:05<00:00,  6.55it/s]


epoch: 16 t:  3.3755532899009646 loss_crf:  3.489711864670711
epoch: 16 v:  143.98674577655214 loss_crf:  144.15574484160453


100%|██████████| 268/268 [00:52<00:00,  5.07it/s]
100%|██████████| 33/33 [00:04<00:00,  6.64it/s]


epoch: 17 t:  3.8985651980585128 loss_crf:  4.035604679762428
epoch: 17 v:  135.2484122767593 loss_crf:  135.40676637129351


100%|██████████| 268/268 [00:52<00:00,  5.07it/s]
100%|██████████| 33/33 [00:05<00:00,  6.43it/s]


epoch: 18 t:  3.5326075611719445 loss_crf:  3.641590192246793
epoch: 18 v:  135.4085113063003 loss_crf:  135.57158082904238


100%|██████████| 268/268 [00:52<00:00,  5.12it/s]
100%|██████████| 33/33 [00:05<00:00,  6.53it/s]


epoch: 19 t:  3.6836916817657985 loss_crf:  3.8352218910829343
epoch: 19 v:  140.87599193688595 loss_crf:  141.04399767788973


100%|██████████| 34/34 [00:05<00:00,  6.41it/s]

v  145.56225125929888





In [148]:
pd.DataFrame(valid_metrics)

Unnamed: 0,loss_crf,loss,f1 B-LOC,f1 B-MISC,f1 B-ORG,f1 B-PER,f1 I-LOC,f1 I-MISC,f1 I-ORG,f1 I-PER,f1 O,f1-weighted
0,127.917191,127.759277,0.939394,0.956522,0.864865,0.754717,1.0,1.0,0.956522,0.8,0.986355,0.970079
1,147.019958,146.845932,0.888889,0.896552,0.75,0.883721,0.8,0.888889,0.764706,0.8,0.98646,0.963609
2,168.144257,167.93457,0.852941,0.857143,0.88,0.792453,0.833333,1.0,0.8,0.7,0.985095,0.960376
3,191.083069,190.883026,0.88,0.941176,0.878788,0.898876,0.8,0.666667,0.791667,0.898551,0.986681,0.965315
4,123.250793,123.106583,0.923077,0.88,0.820513,0.842105,0.75,0.875,0.918919,0.863636,0.988024,0.970143
5,97.153587,97.054688,0.926829,0.666667,0.893617,0.942857,0.0,0.857143,0.8,0.954545,0.991763,0.979205
6,137.225449,137.074921,0.902439,0.882353,0.836364,0.909091,0.0,0.8,0.846154,0.882353,0.988519,0.969332
7,114.515236,114.356033,0.935484,0.888889,0.939759,0.909091,0.666667,0.25,0.894737,0.8,0.987092,0.968164
8,132.081177,131.928207,0.956522,0.727273,0.848485,0.8,0.8,0.666667,0.857143,0.777778,0.986319,0.966878
9,276.624725,276.308167,0.921053,0.9,0.842105,0.901961,0.387097,0.833333,0.666667,0.914286,0.984938,0.951959


In [149]:
pd.DataFrame(test_metrics)

Unnamed: 0,loss_crf,loss,f1 B-LOC,f1 B-MISC,f1 B-ORG,f1 B-PER,f1 I-LOC,f1 I-MISC,f1 I-ORG,f1 I-PER,f1 O,f1-weighted
0,111.106964,110.977959,0.985075,0.909091,0.865672,0.911392,1.0,0.714286,0.893617,0.878049,0.988521,0.972094
1,243.942719,243.712875,0.95,0.790698,0.77551,0.882883,0.933333,0.545455,0.742857,0.898876,0.984162,0.958272
2,100.15377,100.026855,0.909091,0.857143,0.833333,0.927536,0.888889,0.8,0.868421,0.954545,0.988838,0.969011
3,101.688301,101.575539,0.921348,0.827586,0.885246,0.904762,0.4,0.0,0.88,0.875,0.988152,0.969189
4,185.714188,185.501205,0.984615,0.846154,0.829268,0.826087,0.923077,0.666667,0.615385,0.782609,0.983648,0.962324
5,114.219978,114.082375,0.9375,0.971429,0.885246,0.852459,0.933333,0.857143,0.969697,0.827586,0.986523,0.969592
6,144.572189,144.391006,0.909091,1.0,0.648649,0.864865,0.588235,1.0,0.615385,0.918033,0.985507,0.961044
7,231.10463,230.82576,0.833333,0.842105,0.844444,0.806452,0.615385,0.8,0.631579,0.790698,0.973233,0.94099
8,231.312958,231.068741,0.891304,0.848485,0.786885,0.888889,0.857143,1.0,0.923077,0.881356,0.977862,0.956521
9,257.51532,257.268158,0.876712,0.941176,0.727273,0.860759,0.727273,0.947368,0.679245,0.823529,0.979044,0.950488


In [151]:
# sent = "Blair Larsen or the uncapped Glenn Taylor are on standby to replace Jones and , with Jonah Lomu out of action with a shoulder injury picked up in Tuesday 's drawn match against Griqualand West , Eric Rush is favourite to play should Wilson fail to recover ."
# sent = "The Greek socialist party 's executive bureau gave the green light to Prime Minister Costas Simitis to call snap elections , its general secretary Costas Skandalidis told reporters ."
# sent = "June 25-27 v British Universities ( at Oxford , three days )"
# sent = sent.lower().split()

model.eval()
test_result = {}
print("test_size: ",len(test_token_seq))

for i,(sent,labels) in simp(enumerate(zip(test_token_seq, test_label_seq))):

    sent_tokens = [token_to_id.get(x, 1) for x in sent]
    length = len(sent)
    in_tokens = torch.tensor(sent_tokens).unsqueeze(0).to(device)
    in_length = torch.tensor([len(sent_tokens)]).to(device)
    in_labels = torch.tensor([[0]*length]).to(device)
    in_mask = torch.tensor([[True]*length]).to(device)
    if use_crf:
        cc,_ = model(in_tokens, in_length, in_labels, in_mask)
        cc = model.crf.decode(cc)[0]
    else:
        cc = np.argmax(to_numpy(model(in_tokens, in_length)[0]),1)
        
    ou_labels = []

    for j,pred in enumerate(cc):    
        ou_labels.append({sent[j]: [labels[j], id_to_label[pred]]})        
    test_result[i] = ou_labels 

print(len(test_result))

test_size:  2131


2131it [00:18, 116.93it/s]

2131





In [186]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 575


[{'Andrea': ['B-PER', 'B-PER']},
 {'Ferrigato': ['I-PER', 'I-PER']},
 {'of': ['O', 'O']},
 {'Italy': ['B-LOC', 'B-LOC']},
 {'sprinted': ['O', 'O']},
 {'to': ['O', 'O']},
 {'his': ['O', 'O']},
 {'second': ['O', 'O']},
 {'cycling': ['O', 'O']},
 {'World': ['B-MISC', 'B-MISC']},
 {'Cup': ['I-MISC', 'I-MISC']},
 {'win': ['O', 'O']},
 {'in': ['O', 'O']},
 {'successive': ['O', 'O']},
 {'weekends': ['O', 'O']},
 {'with': ['O', 'O']},
 {'victory': ['O', 'O']},
 {'in': ['O', 'O']},
 {'the': ['O', 'O']},
 {'Swiss': ['B-MISC', 'B-MISC']},
 {'Grand': ['B-MISC', 'I-MISC']},
 {'Prix': ['I-MISC', 'I-MISC']},
 {'on': ['O', 'O']},
 {'Sunday': ['O', 'O']},
 {'.': ['O', 'O']}]

In [188]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 755


[{'A': ['O', 'O']},
 {'chain-smoking': ['O', 'O']},
 {'former': ['O', 'O']},
 {'paratroop': ['O', 'O']},
 {'general': ['O', 'O']},
 {'with': ['O', 'O']},
 {'a': ['O', 'O']},
 {'sharp': ['O', 'O']},
 {'line': ['O', 'O']},
 {'in': ['O', 'O']},
 {'deadpan': ['O', 'O']},
 {'putdowns': ['O', 'O']},
 {'and': ['O', 'O']},
 {'a': ['O', 'O']},
 {'soldier': ['O', 'O']},
 {"'s": ['O', 'O']},
 {'knack': ['O', 'O']},
 {'for': ['O', 'O']},
 {'making': ['O', 'O']},
 {'life': ['O', 'O']},
 {'sound': ['O', 'O']},
 {'simple': ['O', 'O']},
 {',': ['O', 'O']},
 {'Lebed': ['B-PER', 'B-PER']},
 {'managed': ['O', 'O']},
 {'to': ['O', 'O']},
 {'arrange': ['O', 'O']},
 {'an': ['O', 'O']},
 {'ambitious': ['O', 'O']},
 {'ceasefire': ['O', 'O']},
 {'in': ['O', 'O']},
 {'the': ['O', 'O']},
 {'region': ['O', 'O']},
 {'last': ['O', 'O']},
 {'week': ['O', 'O']},
 {',': ['O', 'O']},
 {'days': ['O', 'O']},
 {'after': ['O', 'O']},
 {'the': ['O', 'O']},
 {'Russian': ['B-MISC', 'B-MISC']},
 {'army': ['O', 'O']},
 {'threat

In [194]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 1239


[{'Shares': ['O', 'O']},
 {'of': ['O', 'O']},
 {'Hwa': ['B-ORG', 'O']},
 {'Kay': ['I-ORG', 'O']},
 {'Thai': ['I-ORG', 'I-ORG']},
 {'Holdings': ['I-ORG', 'I-ORG']},
 {'Ltd': ['I-ORG', 'I-ORG']},
 {'plunged': ['O', 'O']},
 {'to': ['O', 'O']},
 {'an': ['O', 'O']},
 {'all-time': ['O', 'O']},
 {'low': ['O', 'O']},
 {'after': ['O', 'O']},
 {'the': ['O', 'O']},
 {'company': ['O', 'O']},
 {'announced': ['O', 'O']},
 {'a': ['O', 'O']},
 {'rights': ['O', 'O']},
 {'issue': ['O', 'O']},
 {'plan': ['O', 'O']},
 {'and': ['O', 'O']},
 {'also': ['O', 'O']},
 {'reported': ['O', 'O']},
 {'a': ['O', 'O']},
 {'sharp': ['O', 'O']},
 {'fall': ['O', 'O']},
 {'in': ['O', 'O']},
 {'earnings': ['O', 'O']},
 {',': ['O', 'O']},
 {'brokers': ['O', 'O']},
 {'said': ['O', 'O']},
 {'.': ['O', 'O']}]

In [165]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 1505


[{'Inter': ['B-ORG', 'B-ORG']},
 {'will': ['O', 'O']},
 {'be': ['O', 'O']},
 {'without': ['O', 'O']},
 {'suspended': ['O', 'O']},
 {'French': ['B-MISC', 'B-MISC']},
 {'defender': ['O', 'O']},
 {'Joceyln': ['B-PER', 'O']},
 {'Angloma': ['I-PER', 'O']},
 {'and': ['O', 'O']},
 {'injured': ['O', 'O']},
 {'Chilean': ['B-MISC', 'B-MISC']},
 {'striker': ['O', 'O']},
 {'Ivan': ['B-PER', 'B-PER']},
 {'Zamorano': ['I-PER', 'I-PER']},
 {'.': ['O', 'O']}]

In [172]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 692


[{'At': ['O', 'O']},
 {'Rio': ['B-LOC', 'B-LOC']},
 {',': ['O', 'O']},
 {'they': ['O', 'O']},
 {'joined': ['O', 'O']},
 {'up': ['O', 'O']},
 {'with': ['O', 'O']},
 {'the': ['O', 'O']},
 {'national': ['O', 'O']},
 {'team': ['O', 'O']},
 {'squad': ['O', 'O']},
 {'for': ['O', 'O']},
 {'the': ['O', 'O']},
 {'journey': ['O', 'O']},
 {'to': ['O', 'O']},
 {'Moscow': ['B-LOC', 'B-LOC']},
 {',': ['O', 'O']},
 {'where': ['O', 'O']},
 {'Brazil': ['B-LOC', 'B-LOC']},
 {'will': ['O', 'O']},
 {'face': ['O', 'O']},
 {'Russia': ['B-LOC', 'B-LOC']},
 {'in': ['O', 'O']},
 {'a': ['O', 'O']},
 {'friendly': ['O', 'O']},
 {'international': ['O', 'O']},
 {'on': ['O', 'O']},
 {'Wednesday': ['O', 'O']},
 {'.': ['O', 'O']}]

In [177]:
number = np.random.randint(0, len(test_result))
print("number", number)
# token: True_label, Pred_label
test_result[number]

number 480


[{'8.': ['O', 'O']},
 {'Bruno': ['B-PER', 'B-PER']},
 {'Eichmann': ['I-PER', 'I-PER']},
 {'(': ['O', 'O']},
 {'Germany': ['B-LOC', 'B-LOC']},
 {')': ['O', 'O']},
 {'/': ['O', 'O']},
 {'Gerd': ['B-PER', 'B-ORG']},
 {'Ruch': ['I-PER', 'I-ORG']},
 {'(': ['O', 'O']},
 {'Germany': ['B-LOC', 'B-LOC']},
 {')': ['O', 'O']},
 {'/': ['O', 'O']},
 {'Ralf': ['B-PER', 'O']},
 {'Kelleners': ['I-PER', 'O']}]