In [1]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel
from transformers import DistilBertModel,DistilBertConfig,Trainer,TrainingArguments,set_seed,AutoModelForTokenClassification
from transformers import DistilBertForTokenClassification
import torch.nn as nn
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import AutoTokenizer
import numpy as np

# set_seed(12)

class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.distilbert = DistilBertModel(config)
        self.dropout = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.distilbert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

model=DistilBertForTokenClassification.from_pretrained("distilbert-base-cased",num_labels=8)

# config=DistilBertConfig.from_pretrained("distilbert-base-cased",num_labels=8+1)
# model=TestModel(config)
# print(config)
# state_dict=torch.load("C:\\Users\\tom\\.cache\\huggingface\\transformers\\9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434eb9f1bc85a23a0.06b428c87335c1bb22eae46fdab31c8286efa0aa09e898a7ac42ddf5c3f5dc19",map_location="cpu")
# model.load_state_dict(state_dict,strict=False)
# print(state_dict)
# model.tie_weights()
# model.eval()

tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased')

winds=np.load("./winds.npy")
attns=np.load("./attns.npy")
sequs=np.load("./sequs.npy")
winds2=np.load("./winds2.npy")
attns2=np.load("./attns2.npy")
sequs2=np.load("./sequs2.npy")

class POLDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids,attention_mask,labels):
        self.input_ids=input_ids
        self.attention_mask=attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        item = {"input_ids":self.input_ids[idx],"attention_mask":self.attention_mask[idx]}
        item['labels'] = torch.LongTensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics3(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    content_all = 0
    content_right = 0

    source_all = 0
    source_right = 0

    none_all = 0
    none_right = 0

    content_all2 = 0
    content_right2 = 0

    source_all2 = 0
    source_right2 = 0

    none_all2 = 0
    none_right2 = 0

    left_all = 0
    left_right = 0

    right_all = 0
    right_right = 0

    vain_all = 0
    vain_right = 0

    left_all2 = 0
    left_right2 = 0

    right_all2 = 0
    right_right2 = 0

    vain_all2 = 0
    vain_right2 = 0

    for i in range(labels.shape[0]):
        for j in range(labels.shape[1]):
            if labels[i][j] == 1 or labels[i][j] == 2 or labels[i][j] == 4 or labels[i][j] == 5 or labels[i][j] == 6 or \
                    labels[i][j] == 7:
                content_all += 1
                if labels[i][j] == predictions[i][j]:
                    content_right += 1
            elif labels[i][j] == 3:
                source_all += 1
                if labels[i][j] == predictions[i][j]:
                    source_right += 1
            elif labels[i][j] == 0:
                none_all += 1
                if labels[i][j] == predictions[i][j]:
                    none_right += 1

            if labels[i][j] == 4 or labels[i][j] == 5:
                left_all += 1
                if labels[i][j] == predictions[i][j]:
                    left_right += 1

            if labels[i][j] == 6 or labels[i][j] == 7:
                right_all += 1
                if labels[i][j] == predictions[i][j]:
                    right_right += 1

            if labels[i][j] == 1 or labels[i][j] == 2:
                vain_all += 1
                if labels[i][j] == predictions[i][j]:
                    vain_right += 1

            if predictions[i][j] == 1 or predictions[i][j] == 2 or predictions[i][j] == 4 or predictions[i][j] == 5 or \
                    predictions[i][j] == 6 or predictions[i][j] == 7:
                content_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    content_right2 += 1
            elif predictions[i][j] == 3:
                source_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    source_right2 += 1
            elif predictions[i][j] == 0:
                none_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    none_right2 += 1

            if predictions[i][j] == 4 or predictions[i][j] == 5:
                left_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    left_right2 += 1

            if predictions[i][j] == 6 or predictions[i][j] == 7:
                right_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    right_right2 += 1

            if predictions[i][j] == 1 or predictions[i][j] == 2:
                vain_all2 += 1
                if labels[i][j] == predictions[i][j]:
                    vain_right2 += 1

    content_recall = content_right / content_all
    content_precision = content_right2 / content_all2
    content_f1 = 2 * content_recall * content_precision / (content_recall + content_precision + 0.000001)
    source_recall = source_right / source_all
    source_precision = source_right2 / source_all2
    source_f1 = 2 * source_recall * source_precision / (source_recall + source_precision + 0.000001)
    none_recall = none_right / none_all
    none_precision = none_right2 / none_all2
    none_f1 = 2 * none_recall * none_precision / (none_recall + none_precision + 0.000001)
    overall_precision = (content_right2 + source_right2 + none_right2) / (content_all2 + source_all2 + none_all2)
    overall_recall = (content_right + source_right + none_right) / (content_all + source_all + none_all)
    overall_f1 = 2 * overall_recall * overall_precision / (overall_recall + overall_precision + 0.000001)

    left_recall = left_right / left_all
    left_precision = left_right2 / left_all2
    left_f1 = 2 * left_recall * left_precision / (left_recall + left_precision + 0.000001)

    right_recall = right_right / right_all
    right_precision = right_right2 / right_all2
    right_f1 = 2 * right_recall * right_precision / (right_recall + right_precision + 0.000001)

    vain_recall = vain_right / (vain_all + 1)
    vain_precision = vain_right2 / (vain_all2 + 1)
    vain_f1 = 2 * vain_recall * vain_precision / (vain_precision + vain_recall + 0.000001)

    return {
        "ContentRecall": content_recall,
        "ContentPrecision": content_precision,
        "ContentF1": content_f1,
        "SourceRecall": source_recall,
        "SourcePrecision": source_precision,
        "SourceF1": source_f1,
        "NoneRecall": none_recall,
        "NonePrecision": none_precision,
        "NoneF1": none_f1,
        "OverallPrecision": overall_precision,
        "OverallRecall": overall_recall,
        "OverallF1": overall_f1,
        "LeftPrecision": left_precision,
        "LeftRecall": left_recall,
        "LeftF1": left_f1,
        "RightPrecision": right_precision,
        "RightRecall": right_recall,
        "RightF1": right_f1,
        "VainPrecision": vain_precision,
        "VainRecall": vain_recall,
        "VainF1": vain_f1
    }

train_dataset = POLDataset(winds,attns,sequs)
dev_dataset = POLDataset(winds2,attns2,sequs2)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=8,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=2000,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    learning_rate=2e-5,
    save_steps=100,
)
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics3
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

Step,Training Loss,Validation Loss,Contentrecall,Contentprecision,Contentf1,Sourcerecall,Sourceprecision,Sourcef1,Nonerecall,Noneprecision,Nonef1,Overallprecision,Overallrecall,Overallf1,Leftprecision,Leftrecall,Leftf1,Rightprecision,Rightrecall,Rightf1,Vainprecision,Vainrecall,Vainf1
100,1.9306,1.873091,0.020153,0.041746,0.027183,0.093864,0.084257,0.088801,0.842471,0.352793,0.497326,0.278864,0.374932,0.319839,0.327697,0.020136,0.037941,0.035675,0.021916,0.027151,0.000868,0.007512,0.001557
200,1.3245,1.2645,0.293021,0.413665,0.343044,0.033023,0.090397,0.048374,0.85351,0.374406,0.52049,0.37788,0.508059,0.433405,0.48532,0.403186,0.440456,0.020187,0.008883,0.012337,0.0,0.0,0.0
300,1.0469,0.991093,0.540961,0.437954,0.484038,0.40351,0.327287,0.361423,0.804047,0.538317,0.64488,0.476436,0.640567,0.546442,0.494075,0.676441,0.571051,0.214352,0.216007,0.215176,0.0,0.0,0.0
400,0.8421,0.803029,0.67067,0.544251,0.600883,0.670095,0.391304,0.494085,0.800231,0.571876,0.667051,0.539618,0.725516,0.618909,0.565164,0.739981,0.640864,0.475818,0.557805,0.51356,0.0,0.0,0.0
500,0.7343,0.66852,0.773207,0.591527,0.670273,0.733773,0.423592,0.537117,0.787646,0.601849,0.682325,0.577108,0.775921,0.661908,0.581157,0.825625,0.682149,0.629295,0.723894,0.673287,0.0,0.0,0.0
600,0.592,0.624928,0.817444,0.581744,0.679741,0.719498,0.502394,0.591658,0.775716,0.617366,0.687541,0.588554,0.79131,0.675035,0.565525,0.870594,0.685657,0.642872,0.771978,0.701534,0.0,0.0,0.0
700,0.552,0.574882,0.810935,0.615594,0.699889,0.754126,0.489346,0.593545,0.81813,0.612429,0.70049,0.601771,0.80908,0.690194,0.606418,0.851485,0.708354,0.646119,0.801624,0.71552,0.0,0.0,0.0
800,0.592,0.532974,0.800624,0.658515,0.722649,0.748796,0.523918,0.61649,0.87006,0.589444,0.702775,0.614036,0.82557,0.704262,0.640696,0.844875,0.728753,0.722587,0.779037,0.749751,0.0,0.0,0.0
900,0.5528,0.513205,0.81396,0.656037,0.726515,0.7593,0.541073,0.631875,0.869841,0.598848,0.709343,0.619502,0.832919,0.71053,0.637063,0.858008,0.731209,0.725213,0.794655,0.758347,0.1,0.000908,0.0018
1000,0.5422,0.508795,0.843675,0.630215,0.721487,0.805741,0.522908,0.63422,0.818953,0.62459,0.708686,0.617276,0.829926,0.707977,0.606598,0.898345,0.724192,0.723738,0.796982,0.758596,0.4,0.002311,0.004596


KeyboardInterrupt: 

In [1]:
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel
from transformers import DistilBertModel,DistilBertConfig,Trainer,TrainingArguments,set_seed,AutoModelForTokenClassification
from transformers import DistilBertForTokenClassification
import torch.nn as nn
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import AutoTokenizer
import numpy as np

In [2]:
model=DistilBertForTokenClassification.from_pretrained("checkpoint-3300/",num_labels=8)

In [3]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
)

In [4]:
tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased')

In [5]:
def displayResult(seqs,attns,predictions,no):
    predictions = np.argmax(predictions, axis=2)
    conv_dict={0:'',1:'/C!',2:"/C",3:"/S!",4:"/C!←",5:"/C←",6:"/C!→",7:"/C!→"}
    for i in range(len(predictions[no])):
        if attns[no][i]==0:
            break
        print(tokenizer.decode([seqs[no][i]])+conv_dict[predictions[no][i]],end=' ')       

res=tokenizer(['Ariel Malka, a professor at Yeshiva University and an author of “Who is open to authoritarian governance within western democracies?” agreed in an email that both liberals and conservatives “engage in biased reasoning on the basis of partisanship,” but, he argued, there is still a fundamental difference between left and right:','I hate you.'],padding='max_length',max_length=511)
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids,attention_mask):
        self.input_ids=input_ids
        self.attention_mask=attention_mask

    def __getitem__(self, idx):
        item = {"input_ids":self.input_ids[idx],"attention_mask":self.attention_mask[idx]}
        return item

    def __len__(self):
        return len(self.input_ids)
res_data=TestDataset(res['input_ids'],res["attention_mask"])
pred_res=trainer.predict(res_data)
displayResult(res['input_ids'],res["attention_mask"],pred_res[0],0)

[CLS]/S! Ariel/S! Mal/S! ##ka/S! ,/S! a/S! professor/S! at/S! Yes/S! ##hiva/S! University/S! and/S! an/S! author/S! of/S! “/S! Who/S! is/C← open/C← to/C← author/C← ##itarian/C← governance/C← within/C← western/C← demo/C← ##c/C← ##rac/C← ##ies/C← ?/C← ”/S! agreed in an email that/C!← both/C← liberal/C← ##s/C← and/C← conservative/C← ##s/C← “/C← engage/C← in/C← bias/C← ##ed/C← reasoning/C← on/C← the/C← basis/C← of/C← partisan/C← ##ship/C← ,/C← ”/C← but , he/S! argued , there/C!← is/C← still/C← a/C← fundamental/C← difference/C← between/C← left/C← and/C← right/C← :/C← [SEP]/C← 

In [20]:
def displayFormatResult(input_id,attention,prediction,offset_map,overall_offset):
    result=[]
    predict = np.argmax(prediction, axis=2)
    for i in range(len(input_id)):        
        tuple_list=[]
        tuple_type=[]
        j=0
        while j<len(predict[i]):
            if attention[i][j]==0:
                break
            if predict[i][j]==3:
                left=j
                while j<len(predict[i]) and predict[i][j] == 3:
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(0) #说话人
                
            elif j<len(predict[i]) and predict[i][j] == 0:
                j+=1
                
            elif predict[i][j] == 1 or predict[i][j] == 2:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 1 or predict[i][j] == 2):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(1) #匿名
                
            elif predict[i][j] == 4 or predict[i][j] == 5:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 4 or predict[i][j] == 5):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(2) #向左
                
            elif predict[i][j] == 6 or predict[i][j] == 7:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 6 or predict[i][j] == 7):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(3) #向右
                   
        for t in range(len(tuple_list)):
            if tuple_type[t]==0:
                pass
            
            elif tuple_type[t]==1:
                result.append({"mention":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                               "SegmentOffset":overall_offset[i][0],
                               "Type":"Anonymous"})
                
            elif tuple_type[t]==2:
                back=t
                while back>=0 and tuple_type[back]!=0:
                    back-=1
                if back<0:
                    result.append({"mention":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                               "Type":"TowardsLeftFailed"})
                else:
                    result.append({"mention":tokenizer.decode(input_id[i][tuple_list[back][0]:tuple_list[back][1]]),
                               "quoteSpeakerCharOffsetsFirst":offset_map[i][tuple_list[back][0]][0]+overall_offset[i][0],
                               "quoteSpeakerCharOffsetsSecond":offset_map[i][tuple_list[back][1]-1][1]+overall_offset[i][0],
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                               "Type":"TowardsLeftSucceeded"   })
                    
            elif tuple_type[t]==3:
                after=t
                while after<len(tuple_type) and tuple_type[after]!=0:
                    after+=1
                if after>=len(tuple_type):
                    result.append({"mention":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                                  "Type":"TowardsRightFailed"})
                else:
                    result.append({"mention":tokenizer.decode(input_id[i][tuple_list[after][0]:tuple_list[after][1]]),
                               "quoteSpeakerCharOffsetsFirst":offset_map[i][tuple_list[after][0]][0]+overall_offset[i][0],
                               "quoteSpeakerCharOffsetsSecond":offset_map[i][tuple_list[after][1]-1][1]+overall_offset[i][0],
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                                  "Type":"TowardsRightSucceeded"})
    return result

In [31]:
print(pred_res)

PredictionOutput(predictions=array([[[ 8.0605799e-01, -5.9098190e-01, -6.3269782e-01, ...,
          6.3757771e-01,  4.0913954e-02, -2.0112686e-01],
        [ 1.3090616e+00, -1.4902513e+00, -2.6270444e+00, ...,
         -5.4884857e-01,  2.4629512e-01, -1.4280637e-01],
        [ 9.6190464e-01, -2.0724247e+00, -1.6157204e+00, ...,
         -7.3388630e-01, -1.2125616e+00,  5.8179826e-01],
        ...,
        [ 2.0274944e+00, -2.0934317e+00, -1.8715183e+00, ...,
         -4.4806129e-01, -1.2577022e+00,  3.7372077e-01],
        [ 1.8551995e+00, -2.0018098e+00, -1.8377572e+00, ...,
         -3.3739111e-01, -1.1473535e+00,  3.0937505e-01],
        [ 2.3822696e+00, -1.9338609e+00, -1.9312274e+00, ...,
         -6.4932488e-02, -1.0994846e+00, -3.5829255e-01]],

       [[ 1.9318455e-01, -4.3617716e-01, -7.0437558e-02, ...,
          3.0385801e-01,  2.2475210e-01, -2.2860557e-01],
        [ 3.2389009e-01, -9.1635817e-01, -3.1577757e-01, ...,
          5.5821669e-01,  4.7580820e-02,  2.1291332e-0

In [45]:
res['input_ids'][0][1:5]

[17214, 18880, 1968, 117]

In [67]:
res=tokenizer(['Boris Johnson told MPs this week that while the plan was still for local and mayoral elections, some of which were postponed from last year, to happen on 6 May in England, this remained “under review”.'*100,'I hate you.'],padding='max_length',max_length=511,truncation=True)
res_data=TestDataset(res['input_ids'],res["attention_mask"])
pred_res=trainer.predict(res_data)
displayResult(res['input_ids'],res["attention_mask"],pred_res[0],0)

[CLS] Boris/S! Johnson/S! told MPs this week that/C!← while/C← the/C← plan/C← was/C← still/C← for/C← local/C← and/C← mayor/C← ##al/C← elections/C← ,/C← some/C← of/C← which/C← were/C← postponed/C← from/C← last/C← year/C← ,/C← to/C← happen/C← on/C← 6/C← May/C← in/C← England/C← ,/C← this/C← remained/C← “/C← under/C← review/C← ”/C← . Boris/S! Johnson/S! told MPs this week that/C!← while/C← the/C← plan/C← was/C← still/C← for/C← local/C← and/C← mayor/C← ##al/C← elections/C← ,/C← some/C← of/C← which/C← were/C← postponed/C← from/C← last/C← year/C← ,/C← to/C← happen/C← on/C← 6/C← May/C← in/C← England/C← ,/C← this/C← remained/C← “/C← under/C← review/C← ”/C← . Boris/S! Johnson/S! told MPs this week that/C!← while/C← the/C← plan/C← was/C← still/C← for/C← local/C← and/C← mayor/C← ##al/C← elections/C← ,/C← some/C← of/C← which/C← were/C← postponed/C← from/C← last/C← year/C← ,/C← to/C← happen/C← on/C← 6/C← May/C← in/C← England/C← ,/C← this/C← remained/C← “/C← under/C← review/C← ”/C← . Boris/S! Johnson

In [8]:
def segment(txt):
    pos=0
    segs=[]
    offsets=[]
    while pos<len(txt):
        if txt[pos]=='\n':
            while pos<len(txt) and txt[pos]=='\n':
                pos+=1
        else:
            left=pos
            while pos<len(txt) and txt[pos]!='\n':
                pos+=1
            segs.append(txt[left:pos])
            offsets.append((left,pos))
    return segs,offsets

In [109]:
segment("Britain’s economy grew at a record quarterly rate of more than 15% as lockdown restrictions were eased in the summer but the recovery was losing momentum even before new curbs came in, the latest official figures have revealed.\nData from the Office for National Statistics showed that national output expanded by just 1.1% in September – the last month before fresh action was taken to limit the spread of Covid-19.\n\nThe ONS said that while the economy had now expanded for five months in a row, the pace of recovery had decelerated. Record growth in the July to September period followed an unprecedented drop of 19.8% in the second quarter and a fall of 2.5% in the first three months of the year.\nGross domestic product – the measure used to gauge the size of the economy – increased by 9.1% in June, 6.3% in July, and 2.2% in August before slowing again in September.\nGross domestic product (GDP) measures the total value of activity in the economy over a given period of time. \nPut simply, if GDP is up on the previous three months, the economy is growing; if it is down, it is contracting. Two or more consecutive quarters of contraction are considered to be a recession. \nGDP is the sum of all goods and services produced in the economy, including the service sector, manufacturing, construction, energy, agriculture and government. Several key activities are not counted, such as unpaid work in the home. \nThe ONS uses three measures that should, in theory, add up to the same number.\n• The value of all goods and services produced – known as the output or production measure.• The value of the income generated from company profits and wages – known as the income measure.• The value of goods and services purchased by households, government, business (in terms of investment in machinery and buildings) and from overseas – known as the expenditure measure.\nEconomists are concerned with the real rate of change of GDP, which accounts for how the economy is performing after inflation.\nBritain's government statistics body, the Office for National Statistics, produces GDP figures on a monthly basis about six weeks after the end of the month. It compares the change in GDP month on month, as well as over a three-month period. \nThe ONS warns that changes on the month can prove volatile, preferring to assess economic performance over a three-month period as the wider period can smooth over irregularities. \nThe most closely watched GDP figures are for the four quarters of the year; for the three months to March, June, September and December.\nThe figures are usually revised in subsequent months as more data from businesses and the government becomes available.  \nThe ONS also calculates the size of the UK economy relative to the number of people living here. GDP per capita shows whether we are actually getting richer or poorer, by stripping out the impact of population changes. Richard Partington\nThe ONS said there was a boost from children going back to school, which had helped support activity, but there was a slowdown in business for pubs and restaurants due to the end of the “eat out to help out” scheme.\nDespite the pickup in activity as the economy began to open up in the late spring and summer, the level of national output in the third quarter was 9.7% below where it was in the last three months of 2019.\nBritain’s record compares unfavourably with other leading developed countries, most of which saw smaller falls in output in the second quarter and which have recouped more of the lost ground. The US has the least bad record, with GDP 3.5% below where it was at the end of 2019.\nBritain has seen a 22.9% rebound in activity since the economy’s low point in April, but in September was still 8.2% below its level when the crisis began in February, the ONS said.\nThe services sector – which includes hospitality and leisure – has been the hardest hit and remains 8.8% lower than it was before the spring lockdown was imposed. Manufacturing (-8.1%) and construction (-7.3%) are also well below their level in the early part of the year.\nEconomists said there would be a further blow to the economy from the tougher local restrictions introduced in October and the four-week lockdown for England that began in early November. GDP is expected to fall again in the final three months of 2020.\nDean Turner, economist at UBS global wealth management, said: “In our view, the latest round of Covid related restrictions will lead to a contraction in the final quarter, leaving the economy around 11% smaller than at the start of the year.”\nRishi Sunak, the chancellor of the exchequer, said: “Today’s figures show that our economy was recovering over the summer, but started to slow going into autumn. The steps we’ve had to take since to halt the spread of the virus mean growth has likely slowed further since then.\n“But there are reasons to be cautiously optimistic on the health side – including promising news on tests and vaccines. My economic priority continues to be jobs – that’s why we extended furlough through to March and I welcome the news today that nearly 20,000 new roles for young people have been created through our Kickstart scheme.\n“There are still hard times ahead, but we will continue to support people through this and ensure nobody is left without hope or opportunity.”\nFresh official forecasts for the economy and the public finances will be published on 25 November to coincide with the unveiling of Sunak’s one-year spending round plan. The chancellor will announce a package of measures to protect jobs and expand public services\nJonathan Athow, the deputy national statistician for economic statistics, said: “While all main sectors of the economy continued to recover, the rate of growth slowed again with the economy still remaining well below its pre-pandemic peak.\n“The return of children to school boosted activity in the education sector. Housebuilding also continued to recover, while business strengthened for lawyers and accountants after a poor August.\n“However, pubs and restaurants saw less business, after the eat out to help out scheme ended, and accommodation saw less business after a successful summer.”")

(['Britain’s economy grew at a record quarterly rate of more than 15% as lockdown restrictions were eased in the summer but the recovery was losing momentum even before new curbs came in, the latest official figures have revealed.',
  'Data from the Office for National Statistics showed that national output expanded by just 1.1% in September – the last month before fresh action was taken to limit the spread of Covid-19.',
  'The ONS said that while the economy had now expanded for five months in a row, the pace of recovery had decelerated. Record growth in the July to September period followed an unprecedented drop of 19.8% in the second quarter and a fall of 2.5% in the first three months of the year.',
  'Gross domestic product – the measure used to gauge the size of the economy – increased by 9.1% in June, 6.3% in July, and 2.2% in August before slowing again in September.',
  'Gross domestic product (GDP) measures the total value of activity in the economy over a given period of ti

In [72]:
res=tokenizer(,padding='max_length',max_length=511,truncation=True,return_offsets_mapping=True)


In [89]:
res['offset_mapping'][0]

[(0, 0),
 (0, 5),
 (6, 13),
 (14, 18),
 (19, 22),
 (23, 27),
 (28, 32),
 (33, 37),
 (38, 43),
 (44, 47),
 (48, 52),
 (53, 56),
 (57, 62),
 (63, 66),
 (67, 72),
 (73, 76),
 (77, 82),
 (82, 84),
 (85, 94),
 (94, 95),
 (96, 100),
 (101, 103),
 (104, 109),
 (110, 114),
 (115, 124),
 (125, 129),
 (130, 134),
 (135, 139),
 (139, 140),
 (141, 143),
 (144, 150),
 (151, 153),
 (154, 155),
 (156, 159),
 (160, 162),
 (163, 170),
 (170, 171),
 (172, 176),
 (177, 185),
 (186, 187),
 (187, 192),
 (193, 199),
 (199, 200),
 (200, 201),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0

In [10]:
def extractText(txt):
    segs,offsets=segment(txt)
    res=tokenizer(segs,padding='max_length',max_length=511,truncation=True,return_offsets_mapping=True)
    res_data=TestDataset(res['input_ids'],res["attention_mask"])
    pred_res=trainer.predict(res_data)
    return displayFormatResult(res['input_ids'],res["attention_mask"],pred_res[0],res['offset_mapping'],offsets)


In [11]:
content="The UK government has dropped its opposition to sharing criminal suspects’ DNA data with EU law enforcement bodies, in a potential olive branch ahead of further talks on post-Brexit security.\nThe U-turn was announced by the Home Office minister James Brokenshire in a statement to parliament on Monday, the day of an EU deadline for the government to reveal whether it intended to comply with European law.\n\nThe decision is separate from negotiations on a future relationship with the EU, but could improve the mood of tense talks as the UK seeks a security deal that includes permanent exchange of DNA, fingerprint and other data.\nUnder the former home secretary Theresa May, the UK opted into an EU crime-fighting system in 2015, in which member states exchange biometric data. In June 2019 the government began sharing DNA data of convicted criminals, but refused to share criminal suspects’ DNA.\nThat exception has now been reversed. “It is the government’s intention to begin exchanging suspects’ data held in England, Wales and Northern Ireland with connected EU member states through Prüm,” Brokenshire said, a reference to the 2005 agreement named after the small German town where EU countries first agreed to exchange fingerprints, DNA and car number plates of criminals and suspects.\nBrokenshire said consultation would continue with the Scottish government, as policing is a devolved issue.\nThe DNA exchange system enables British police to check the genetic code of EU criminals and suspects in 15 minutes, compared with 143 days through the Interpol process, the Home Office said in 2016. Interpol said its DNA database now has an automatic response time of 15 minutes. \nCivil liberties campaigners and some MPs had been concerned that safeguards on criminal suspects’ data were insufficient. Brokenshire said the government was satisfied with EU processes as extra safeguards had been put in place since 2015, including an independent oversight board and extra checks when minors were involved.\n“Ensuring continued adherence to the UK’s scientific standards means there is a one in a billion chance that a UK DNA sample would be falsely matched with an overseas criminal investigation,” he told parliament.\nThe UK’s current participation in EU police data sharing ends on 31 December when the Brexit transition period expires. The EU has previously warned that opposition to sharing criminal suspects’ DNA would be an obstacle to a future deal.\nThe government has said that since joining the DNA exchange system last July, about 12,000 “hits” related to UK investigations had come from EU member states, citing progress into an unsolved sexual assault case in Glasgow in 2012. A hit is an anonymised yes/no result of a DNA match. If there is a positive result, police forces can request personal information, such as name and date of birth. The UK has provided EU law enforcement officials with 41,000 hits."

In [12]:
extractText(content)

[{'speaker': 'The UK government',
  'speakerBegin': 0,
  'speakerEnd': 17,
  'Quotation': 'sharing criminal suspects ’ DNA data with EU law enforcement bodies',
  'QuotationBegin': 48,
  'QuotationEnd': 114,
  'SegmentOffset': 0,
  'Type': 'TowardsLeftSucceeded'},
 {'speaker': 'the Home Office minister James Brokenshire',
  'speakerBegin': 220,
  'speakerEnd': 262,
  'Quotation': 'The U - turn',
  'QuotationBegin': 192,
  'QuotationEnd': 202,
  'SegmentOffset': 192,
  'Type': 'TowardsRightSucceeded'},
 {'speaker': 'the government',
  'speakerBegin': 333,
  'speakerEnd': 347,
  'Quotation': 'comply with European law',
  'QuotationBegin': 381,
  'QuotationEnd': 405,
  'SegmentOffset': 192,
  'Type': 'TowardsLeftSucceeded'},
 {'speaker': 'Unknown',
  'speakerBegin': -1,
  'speakerEnd': -1,
  'Quotation': 'EU crime - fighting system',
  'QuotationBegin': 698,
  'QuotationEnd': 722,
  'SegmentOffset': 632,
  'Type': 'TowardsLeftFailed'},
 {'speaker': 'Unknown',
  'speakerBegin': -1,
  'spea

In [19]:
content[2202:2204]

'he'

In [100]:
segment(content)

(['The UK government has dropped its opposition to sharing criminal suspects’ DNA data with EU law enforcement bodies, in a potential olive branch ahead of further talks on post-Brexit security.',
  'The U-turn was announced by the Home Office minister James Brokenshire in a statement to parliament on Monday, the day of an EU deadline for the government to reveal whether it intended to comply with European law.',
  'The decision is separate from negotiations on a future relationship with the EU, but could improve the mood of tense talks as the UK seeks a security deal that includes permanent exchange of DNA, fingerprint and other data.',
  'Under the former home secretary Theresa May, the UK opted into an EU crime-fighting system in 2015, in which member states exchange biometric data. In June 2019 the government began sharing DNA data of convicted criminals, but refused to share criminal suspects’ DNA.',
  'That exception has now been reversed. “It is the government’s intention to beg

In [101]:
content[192:406]

'The U-turn was announced by the Home Office minister James Brokenshire in a statement to parliament on Monday, the day of an EU deadline for the government to reveal whether it intended to comply with European law.'

In [1]:
from genre.entity_linking import get_end_to_end_prefix_allowed_tokens_fn_hf as get_prefix_allowed_tokens_fn

In [2]:
from genre.utils import get_entity_spans_hf as get_entity_spans

In [3]:
from genre.hf_model import GENRE

In [4]:
import pickle
from genre.trie import Trie

In [5]:
with open("models/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [6]:
model = GENRE.from_pretrained("models/hf_e2e_entity_linking_wiki_abs").eval()

404 Client Error: Not Found for url: https://huggingface.co/models/hf_e2e_entity_linking_wiki_abs/resolve/main/config.json


OSError: Can't load config for 'models/hf_e2e_entity_linking_wiki_abs'. Make sure that:

- 'models/hf_e2e_entity_linking_wiki_abs' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'models/hf_e2e_entity_linking_wiki_abs' is the correct path to a directory containing a config.json file



In [None]:
sentences = ["In 1921, Einstein received an Nobel Prize."]

In [None]:
get_entity_spans(model,sentences)

In [1]:
prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, sentences)

model.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

NameError: name 'get_prefix_allowed_tokens_fn' is not defined

In [11]:
what=get_entity_spans(model,sentences)

In [19]:
what[0]

[(0,
  7,
  'List_of_prizes_awarded_at_the_1921_International_Academy_of_Arts_and_Sciences_Prize_in_Geometry_and_in_Physics_(Euclidean-Aschenfeldt)'),
 (9, 8, 'Albert_Einstein'),
 [30,
  11,
  'Nobel Prize in Physiology or Medicine (Aristotle-Eggs) Prize in Ephraim Eben-Müller-Kulmbach-Innsbruck-Künstler-Ausstellungen-Eisenkirchen (Einstein-Auerstedt-Königstuhl) Aarne-Egenstahl, Fürstenberg-Sonderburg-Köln-Ausserlingen, Künchenfeldt-Sündenburg-Fürstenried-Aarne, Köln bei Ernstenberg, Königsberg, Nürnbergen, Südbergen and Neustadt am Rhein-Oberfrohnsrücken bei Nahrungen, Düsseldorf and Nuremberg (Egen) Agrariane) A-Sinfonogründe, Kirchhainen, Potsdam and Neuburg (Energetik) A.S. Türkstahlsrüppen, Einstein, Einstein and Naturmüller, Einstein (Einherrschaft) in Gewerksrüttemberg) In Nüssenkirchein, Einstein-Einheinsthalen and Märzen, Dorek and Dorem, Einstein. In 1921, Einstein received an Nobel Prize.   -']]

In [21]:
for i in what[0]:
    print(i)

(0, 7, 'List_of_prizes_awarded_at_the_1921_International_Academy_of_Arts_and_Sciences_Prize_in_Geometry_and_in_Physics_(Euclidean-Aschenfeldt)')
(9, 8, 'Albert_Einstein')
[30, 11, 'Nobel Prize in Physiology or Medicine (Aristotle-Eggs) Prize in Ephraim Eben-Müller-Kulmbach-Innsbruck-Künstler-Ausstellungen-Eisenkirchen (Einstein-Auerstedt-Königstuhl) Aarne-Egenstahl, Fürstenberg-Sonderburg-Köln-Ausserlingen, Künchenfeldt-Sündenburg-Fürstenried-Aarne, Köln bei Ernstenberg, Königsberg, Nürnbergen, Südbergen and Neustadt am Rhein-Oberfrohnsrücken bei Nahrungen, Düsseldorf and Nuremberg (Egen) Agrariane) A-Sinfonogründe, Kirchhainen, Potsdam and Neuburg (Energetik) A.S. Türkstahlsrüppen, Einstein, Einstein and Naturmüller, Einstein (Einherrschaft) in Gewerksrüttemberg) In Nüssenkirchein, Einstein-Einheinsthalen and Märzen, Dorek and Dorem, Einstein. In 1921, Einstein received an Nobel Prize.   -']


In [23]:
prefix_allowed_tokens_fn

<function genre.entity_linking._get_end_to_end_prefix_allowed_tokens_fn.<locals>.prefix_allowed_tokens_fn(batch_id, sent)>

In [24]:
from genre.utils import get_markdown
from IPython.display import Markdown

In [26]:
Markdown(get_markdown(sentences, what)[0])

[In 1921](https://en.wikipedia.org/wiki/List_of_prizes_awarded_at_the_1921_International_Academy_of_Arts_and_Sciences_Prize_in_Geometry_and_in_Physics_(Euclidean-Aschenfeldt)), [Einstein](https://en.wikipedia.org/wiki/Albert_Einstein) received an [Nobel Prize](https://en.wikipedia.org/wiki/Nobel Prize in Physiology or Medicine (Aristotle-Eggs) Prize in Ephraim Eben-Müller-Kulmbach-Innsbruck-Künstler-Ausstellungen-Eisenkirchen (Einstein-Auerstedt-Königstuhl) Aarne-Egenstahl, Fürstenberg-Sonderburg-Köln-Ausserlingen, Künchenfeldt-Sündenburg-Fürstenried-Aarne, Köln bei Ernstenberg, Königsberg, Nürnbergen, Südbergen and Neustadt am Rhein-Oberfrohnsrücken bei Nahrungen, Düsseldorf and Nuremberg (Egen) Agrariane) A-Sinfonogründe, Kirchhainen, Potsdam and Neuburg (Energetik) A.S. Türkstahlsrüppen, Einstein, Einstein and Naturmüller, Einstein (Einherrschaft) in Gewerksrüttemberg) In Nüssenkirchein, Einstein-Einheinsthalen and Märzen, Dorek and Dorem, Einstein. In 1921, Einstein received an Nobel Prize.   -).

In [8]:
from genre.utils import get_entity_spans_pre_processing

In [11]:
get_entity_spans_pre_processing(sentences)

[' In 1921, Einstein received an Nobel Prize. ']

In [12]:
prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, sentences)

model.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

[[{'text': 'In { 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel } [ Nobel Prize in Physics ] Prize.',
   'logprob': tensor(-0.9352)}],
 [{'text': 'In { 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Memorial Prize in Economic Sciences (Euclidean Society) ] {. } [ Einstein (crater) ]',
   'logprob': tensor(-1.1662)}],
 [{'text': 'In { 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Memorial Prize in Economic Sciences (Euclidean League) ] {. } [ Einstein (crater) ]',
   'logprob': tensor(-1.1767)}],
 [{'text': 'In { 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Memorial Prize in Econ

In [13]:
prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, get_entity_spans_pre_processing(sentences))

model.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

[[{'text': ' { In 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Prize in Physicist (1921 Prize in Physics) (1923) ] {. } [ Einstein (lunar crater) ] ',
   'logprob': tensor(-1.6385)}],
 [{'text': ' { In 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Prize in Physicist (1921 Prize in Physics) (1923) ] {. } [ Einstein (crater) ] ',
   'logprob': tensor(-1.6445)}],
 [{'text': ' { In 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstein } [ Albert Einstein ] received an { Nobel Prize } [ Nobel Prize in Physicist (1921 Prize in Physics) (1923) ] {. } [ Einstein (lunar crater, Mount Everest) ] ',
   'logprob': tensor(-1.7248)}],
 [{'text': ' { In 1921 } [ List of Nobel laureates in Physiology or Medicine by year of appointment ], { Einstei

In [15]:
get_entity_spans(model,sentences)

KeyboardInterrupt: 

In [17]:
sentences = ["The EU"]
prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, get_entity_spans_pre_processing(sentences))

model.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

NameError: name 'get_prefix_allowed_tokens_fn' is not defined

In [1]:
import pickle
from genre.trie import Trie

# load the prefix tree (trie)
with open("models/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [2]:
from genre.hf_model import GENRE
model = GENRE.from_pretrained("models/hf_entity_disambiguation_aidayago").eval()

In [5]:
sentences = ["[START_ENT] Trump [END_ENT] is our president."]

model.sample(
    sentences,
    prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
)

[[{'text': 'Donald Trump', 'logprob': tensor(-0.0784)}],
 [{'text': 'Presidency of Donald Trump', 'logprob': tensor(-1.0255)}],
 [{'text': 'President of the United States', 'logprob': tensor(-1.3235)}],
 [{'text': 'Presidency of Barack Obama', 'logprob': tensor(-2.0674)}],
 [{'text': 'Donald Trump in popular culture', 'logprob': tensor(-2.2210)}]]

In [15]:
what[5]

IndexError: list index out of range

In [32]:
memo={}
def getEntity(txt):
    if txt in memo:
        return memo[txt]
    else:
        if type(txt)!=str:
            raise Exception("说话人不是字符串")
        else:
            sentences = ["[START_ENT] "+txt+" [END_ENT]"]
            result=model.sample(sentences,prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()))
            return result[0][0]['text'],result[0][0]['logprob'].item()

In [33]:
getEntity("Trump")

('Donald Trump', -0.12928657233715057)

In [38]:
"https://en.wikipedia.org/wiki/"+'Donald Trump '.strip().replace(' ',"_")

'https://en.wikipedia.org/wiki/Donald_Trump'

In [1]:
# -*- coding:utf-8 -*-
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel
from transformers import DistilBertModel,DistilBertConfig,Trainer,TrainingArguments,set_seed,AutoModelForTokenClassification
from transformers import DistilBertForTokenClassification
import torch.nn as nn
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import AutoTokenizer
import numpy as np
import pickle
from genre.trie import Trie
from genre.hf_model import GENRE

with open("kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

EDmodel = GENRE.from_pretrained("models/hf_entity_disambiguation_aidayago").eval()

model=DistilBertForTokenClassification.from_pretrained("checkpoint-3300/",num_labels=8)
trainer = Trainer(model=model)
tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased')
memo={}

class TestDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids,attention_mask):
        self.input_ids=input_ids
        self.attention_mask=attention_mask

    def __getitem__(self, idx):
        item = {"input_ids":self.input_ids[idx],"attention_mask":self.attention_mask[idx]}
        return item

    def __len__(self):
        return len(self.input_ids)



In [2]:
def segment(txt):
    pos=0
    segs=[]
    offsets=[]
    while pos<len(txt):
        if txt[pos]=='\n':
            while pos<len(txt) and txt[pos]=='\n':
                pos+=1
        else:
            left=pos
            while pos<len(txt) and txt[pos]!='\n':
                pos+=1
            segs.append(txt[left:pos])
            offsets.append((left,pos))
    return segs,offsets

def displayFormatResult(input_id,attention,prediction,offset_map,overall_offset):
    result=[]
    predict = np.argmax(prediction, axis=2)
    for i in range(len(input_id)):        
        tuple_list=[]
        tuple_type=[]
        j=0
        while j<len(predict[i]):
            if attention[i][j]==0:
                break
            if predict[i][j]==3:
                left=j
                while j<len(predict[i]) and predict[i][j] == 3:
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(0) #说话人
                
            elif j<len(predict[i]) and predict[i][j] == 0:
                j+=1
                
            elif predict[i][j] == 1 or predict[i][j] == 2:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 1 or predict[i][j] == 2):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(1) #匿名
                
            elif predict[i][j] == 4 or predict[i][j] == 5:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 4 or predict[i][j] == 5):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(2) #向左
                
            elif predict[i][j] == 6 or predict[i][j] == 7:
                left=j
                while j<len(predict[i]) and (predict[i][j] == 6 or predict[i][j] == 7):
                    if attention[i][j]==0:
                        break
                    j+=1
                tuple_list.append((left,j))
                tuple_type.append(3) #向右
                   
        for t in range(len(tuple_list)):
            if tuple_type[t]==0:
                pass
            
            elif tuple_type[t]==1:
                result.append({"mentionRaw":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                               "SegmentOffset":overall_offset[i][0],
                               "Type":"Anonymous"})
                
            elif tuple_type[t]==2:
                back=t
                while back>=0 and tuple_type[back]!=0:
                    back-=1
                if back<0:
                    result.append({"mentionRaw":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                               "Type":"TowardsLeftFailed"})
                else:
                    result.append({"mentionRaw":tokenizer.decode(input_id[i][tuple_list[back][0]:tuple_list[back][1]]),
                               "quoteSpeakerCharOffsetsFirst":offset_map[i][tuple_list[back][0]][0]+overall_offset[i][0],
                               "quoteSpeakerCharOffsetsSecond":offset_map[i][tuple_list[back][1]-1][1]+overall_offset[i][0],
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                               "Type":"TowardsLeftSucceeded"   })
                    
            elif tuple_type[t]==3:
                after=t
                while after<len(tuple_type) and tuple_type[after]!=0:
                    after+=1
                if after>=len(tuple_type):
                    result.append({"mentionRaw":"Unknown",
                               "quoteSpeakerCharOffsetsFirst":-1,
                               "quoteSpeakerCharOffsetsSecond":-1,
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                                  "Type":"TowardsRightFailed"})
                else:
                    result.append({"mentionRaw":tokenizer.decode(input_id[i][tuple_list[after][0]:tuple_list[after][1]]),
                               "quoteSpeakerCharOffsetsFirst":offset_map[i][tuple_list[after][0]][0]+overall_offset[i][0],
                               "quoteSpeakerCharOffsetsSecond":offset_map[i][tuple_list[after][1]-1][1]+overall_offset[i][0],
                               "quotation":tokenizer.decode(input_id[i][tuple_list[t][0]:tuple_list[t][1]]),
                               "quoteCharOffsetsFirst":offset_map[i][tuple_list[t][0]][0]+overall_offset[i][0],
                               "quoteCharOffsetsSecond":offset_map[i][tuple_list[t][1]-1][1]+overall_offset[i][0],
                                   "SegmentOffset":overall_offset[i][0],
                                  "Type":"TowardsRightSucceeded"})
    return result

def getEntity(txt):
    if txt in memo:
        return memo[txt]
    else:
        if type(txt)!=str:
            raise Exception("说话人不是字符串")
        else:
            sentences = ["[START_ENT] "+txt+" [END_ENT]"]
            result=EDmodel.sample(sentences,prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()))
            memo[txt]=(result[0][0]['text'],result[0][0]['logprob'].item())
            return result[0][0]['text'],result[0][0]['logprob'].item()

def extractText(
    txt):
    segs,offsets=segment(txt)
    res=tokenizer(segs,padding='max_length',max_length=511,truncation=True,return_offsets_mapping=True)
    res_data=TestDataset(res['input_ids'],res["attention_mask"])
    pred_res=trainer.predict(res_data)
    middle_result=displayFormatResult(res['input_ids'],res["attention_mask"],pred_res[0],res['offset_mapping'],offsets)
    for i in range(len(middle_result)):
        if type(middle_result[i]['mentionRaw'])!=str:
            print("说话人不是字符串！")
        else:
            linked=getEntity(middle_result[i]['mentionRaw'])
            middle_result[i]['mention']=linked[0]
            middle_result[i]['mentionLinkLogProb']=linked[1]
            middle_result[i]['links']="https://en.wikipedia.org/wiki/"+linked[0].strip().replace(' ',"_")
    return middle_result


In [3]:
content="President Donald Trump is expected to cut a significant number of U.S. troops in Afghanistan and a smaller number in Iraq by the final days of his presidency, U.S. officials said Monday. The plan would run counter to military commanders’ advice over the past year, while still falling short of Trump’s much-touted goal to end America’s long wars.The decision comes just days after Trump installed a new slate of loyalists in top Pentagon positions who share his frustration with the continued troop presence in the war zones. But the expected plans would leave 2,500 troops in both Iraq and Afghanistan, meaning that President-elect Joe Biden would be the fourth president to grapple with the still-smoldering conflicts launched in the aftermath of the Sept. 11, 2001, attacks.ADVERTISEMENTU.S. officials said military leaders were told over the weekend about the planned withdrawals and that an executive order is in the works but has not yet been delivered to commanders. Officials cautioned that there could always be changes, and Trump is known to make snap decisions based on media reports and online chatter. Officials spoke on condition of anonymity to discuss internal deliberations.There are 4,500 to 5,000 troops in Afghanistan and more than 3,000 in Iraq. As news broke about the plan, Republican leaders on Capitol Hill issued stark warnings about making any hasty exit from Afghanistan that could jeopardize the peace process and undermine counterterrorism efforts.More Stories:– NATO, acting US Pentagon chief discuss Afghanistan– Afghans welcome report on Australian troops' alleged crimes– Suicide car bomb in Afghan capital kills 3 troops, wounds 4Senate Majority Leader Mitch McConnell said the Trump administration has made tremendous headway against terrorist threats, but warned against a potentially “humiliating” pullout from Afghanistan that he said would be worse than President Barack Obama’s 2011 withdrawal from Iraq and reminiscent of the U.S. departure from Saigon in 1975.Rep. Michael McCaul, Republican leader on the House Foreign Affairs Committee, said of the plans for Afghanistan, “We need to ensure a residual force is maintained for the foreseeable future to protect U.S. national and homeland security interests and to help secure peace for Afghanistan.”Under the planned order, the troop cuts would be completed just five days before Biden takes office on Jan. 20. Military commanders have expressed less concern about the reduction in Iraq, where the Iraqi forces are better able to maintain their nation’s security.Trump’s new Pentagon chief, Christopher Miller, hinted at the troop withdrawals over the weekend in a carefully worded message to the force.ADVERTISEMENT“We remain committed to finishing the war that al-Qaida brought to our shores in 2001,” he said, and warned that “we must avoid our past strategic error of failing to see the fight through to the finish.”But Miller also made it clear that “all wars must end.”“This fight has been long, our sacrifices have been enormous. and many are weary of war — I’m one of them,” he said. ”Ending wars requires compromise and partnership. We met the challenge; we gave it our all. Now, it’s time to come home.”The accelerated withdrawal, however, goes against the longstanding advice of Trump’s military leadership, including Marine Gen. Frank McKenzie, top U.S. commander for the Middle East. But officials suggested that commanders will be able to live with the partial pullout, which allows them to keep counterterrorism troops in Afghanistan and gives them time to remove critical equipment fro the country.McKenzie and others have repeatedly argued that a hasty withdrawal could undercut negotiations to finalize ongoing peace negotiations between the Taliban and representatives of Afghan society, including the Afghan government. And they also warn that U.S. forces should remain in the country to keep Islamic State militants in check.Biden has sounded less absolute about troop withdrawal. He has said some troops could stay in Afghanistan to focus on the counterterrorism mission. In response to a questionnaire before the election, he said: “Americans are rightly weary of our longest war; I am, too. But we must end the war responsibly, in a manner that ensures we both guard against threats to our homeland and never have to go back.”The expected order, first reported by CNN, adds to what has been a litany of muddled White House and Pentagon messages on troops withdrawals from both Afghanistan and Iraq, only exacerbating what has been an emotional roller coaster for the troops and their families. Adding to the confusion: The Pentagon has historically failed to count up to hundreds of troops actually on the ground, including some special operations forces and personnel on temporary duty for only a few months. Often that is due to political sensitivities in those countries and in the U.S.The Pentagon was already on track to cut troops levels in Afghanistan to about 4,500 by mid-November. U.S. military leaders have consistently said that going below that number must be based on conditions on the ground, including a measurable reduction in attacks by the Taliban on Afghan troops. And they insist they have not seen that yet.America’s exit from Afghanistan after 19 years was laid out in a February agreement Washington reached with the Taliban. That agreement said U.S. troops would be out of Afghanistan in 18 months, provided the Taliban honored a commitment to fight terrorist groups, with most attention seemingly focused on the Islamic State group’s affiliate in the country.Military officials also have warned that there is a large amount of critical, classified equipment in Afghanistan that must be removed, but it will take time. They also say that any full U.S. withdrawal needs to be coordinated with other coalition allies that have troops in the country.The White House, however, issued a confusing series of statements about Afghanistan over the past month. Trump on Oct. 7 tweeted that “we should have the small remaining number of our BRAVE Men and Women serving in Afghanistan home by Christmas.” When asked about those comments, Robert O’Brien, his national security adviser, said Trump was just expressing a hope.O’Brien, meanwhile, has said the number of troops in Afghanistan would drop to 2,500 by early next year. At the time, defense officials said they had not received orders to cut troops to 2,500. And they warned that withdrawing troops quickly could remove some incentive for the struggling peace talks.According to the February agreement, the U.S. troop withdrawal next year is tied to the Taliban’s commitment to fight militant groups — such as the Islamic State group — in the county, and is not linked to successful negotiations between the Taliban and government. The Islamic State group is seen as extremely dangerous and intent on targeting America and other Western interests.The Taliban and Afghan government negotiators have been meeting for over a month in the Middle Eastern state of Qatar with little sign of progress. The Taliban, meanwhile, have staged near daily deadly attacks against Afghan forces. ____Associated Press writers Kathy Gannon in Islamabad, Pakistan, and Lisa Mascaro in Washington contributed to this report."

In [5]:
extractText(content)

[{'mentionRaw': 'U. S. officials',
  'quoteSpeakerCharOffsetsFirst': 159,
  'quoteSpeakerCharOffsetsSecond': 173,
  'quotation': 'President Donald Trump is expected to cut a significant number of U. S. troops in Afghanistan and a smaller number in Iraq by the final days of his presidency',
  'quoteCharOffsetsFirst': 0,
  'quoteCharOffsetsSecond': 157,
  'SegmentOffset': 0,
  'Type': 'TowardsRightSucceeded',
  'mention': 'United States',
  'mentionLinkLogProb': -0.44718435406684875,
  'links': 'https://en.wikipedia.org/wiki/United_States'},
 {'mentionRaw': 'Trump ’ s',
  'quoteSpeakerCharOffsetsFirst': 294,
  'quoteSpeakerCharOffsetsSecond': 301,
  'quotation': 'to end America ’ s long wars',
  'quoteCharOffsetsFirst': 319,
  'quoteCharOffsetsSecond': 345,
  'SegmentOffset': 0,
  'Type': 'TowardsLeftSucceeded',
  'mention': 'Donald Trump',
  'mentionLinkLogProb': -0.568838894367218,
  'links': 'https://en.wikipedia.org/wiki/Donald_Trump'},
 {'mentionRaw': 'who',
  'quoteSpeakerCharOffse

In [6]:
memo

{'U. S. officials': ('United States', -0.44718435406684875),
 'Trump ’ s': ('Donald Trump', -0.568838894367218),
 'who': ('Who (pronoun)', -0.5766157507896423),
 'Unknown': ('Unknown', -0.9229084253311157),
 'S. officials': ('South Korea', -0.40526676177978516),
 'Officials': ('Official', -0.061121683567762375),
 'Republican leaders on Capitol Hill': ('Republican Party (United States)',
  -0.25988897681236267),
 'acting': ('Acting', -0.09484714269638062),
 '##Senate Majority Leader Mitch McConnell': ('Mitch McConnell',
  -0.07700255513191223),
 'he': ('He (surname)', -0.3188076317310333),
 'Rep. Michael McCaul, Republican leader on the House Foreign Affairs Committee,': ('Michael McCaul',
  -0.07016891241073608)}

In [1]:
from genre.entity_linking import get_end_to_end_prefix_allowed_tokens_fn_hf as get_prefix_allowed_tokens_fn

In [2]:
from genre.utils import get_entity_spans_hf as get_entity_spans

In [1]:
from genre.hf_model import GENRE

In [2]:
model = GENRE.from_pretrained("models/hf_wikipage_retrieval").eval()

In [8]:
import pickle
from genre.trie import Trie
with open("./models/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [11]:
sentences = ["President Trump"]

model.sample(
    sentences,
    prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
)

[[{'text': 'Trump Productions', 'logprob': tensor(-0.7931)}],
 [{'text': 'Presidential transition of Donald Trump',
   'logprob': tensor(-0.9255)}],
 [{'text': 'Presidential transition of Barack Obama',
   'logprob': tensor(-0.9652)}],
 [{'text': 'Donald Trump 2016 presidential campaign',
   'logprob': tensor(-1.0125)}],
 [{'text': 'Donald Trump Supreme Court candidates',
   'logprob': tensor(-1.1937)}]]

In [12]:
prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(model, sentences)

In [13]:
model.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

[[{'text': 'Trump', 'logprob': tensor(-0.5179)}],
 [{'text': ' {Trump } [ Donald Trump ]', 'logprob': tensor(-2.1531)}],
 [{'text': ' {Trump } [ Presidency of Donald Trump ]',
   'logprob': tensor(-2.1581)}],
 [{'text': ' {Trump } [ Donald T. Trump ]', 'logprob': tensor(-2.3450)}],
 [{'text': ' {Trump } [ Donald Donald Trump ]', 'logprob': tensor(-2.5712)}]]

In [1]:
import spacy
from spacyEntityLinker import EntityLinker

In [2]:
nlp = spacy.load("en_core_web_sm")
nlp2 = spacy.load("en_core_web_sm")

In [3]:
entityLinker = EntityLinker()

In [4]:
nlp.add_pipe(entityLinker, last=True, name="entityLinker")

In [5]:
doc = nlp("joe biden")

In [6]:
all_linked_entities=doc._.linkedEntities

In [8]:
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

https://www.wikidata.org/wiki/Q6279       6279       Joe Biden                       47th Vice President of the United States (in office from 2009 to 2017)                              


In [9]:
nlp2("joe biden")

joe biden

In [10]:
all_linked_entities=doc._.linkedEntities

In [11]:
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

https://www.wikidata.org/wiki/Q6279       6279       Joe Biden                       47th Vice President of the United States (in office from 2009 to 2017)                              


In [16]:
doc = nlp("vivek kotecha")
str(doc._.linkedEntities[1])

IndexError: list index out of range

In [45]:
a=tuple(str(i) for i in doc._.linkedEntities[0].get_super_entities(limit=10))

In [47]:
'&'.join(a)

'human&billionaire'

In [48]:
None,1,2,3

(None, 1, 2, 3)

In [49]:
from transformers import AlbertForTokenClassification
from torch.utils.data.dataloader import DataLoader
import torch
from transformers import AutoTokenizer
import numpy as np
import logging
import os
import json
import time
import spacy
import neuralcoref
from spacyEntityLinker import EntityLinker

In [50]:
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

In [51]:
tokenizer.save_pretrained("tok")

('tok\\tokenizer_config.json',
 'tok\\special_tokens_map.json',
 'tok\\spiece.model',
 'tok\\added_tokens.json',
 'tok\\tokenizer.json')

In [17]:
spacy.prefer_gpu()

False

In [18]:
spacy.require_gpu()

ValueError: GPU is not accessible. Was the library installed correctly?