In [None]:
import json


# Convert span-annotated data to BIO tagging format
def dimension_label(path, save_path, labels_path=None):
    """Convert span-annotated JSON-lines NER data into per-character BIO tags.

    Each non-blank input line is a JSON object with a ``text`` string and a
    ``label`` mapping shaped like
    ``{entity_type: {entity_text: [[start, end], ...]}}``, where ``start`` and
    ``end`` are inclusive character indices into ``text``. Every listed span is
    tagged (``B-<type>`` on ``start``, ``I-<type>`` on the rest); characters not
    covered by any span stay ``'O'``. When spans overlap, the one processed
    last wins.

    Args:
        path: Input JSON-lines file.
        save_path: Output JSON-lines file; one ``{"text", "label"}`` object per
            input record, where ``label`` is the list of per-character tags.
        labels_path: Optional output file. When truthy, the ``{tag: id}``
            mapping collected from this input is written to it as one JSON
            object. ``'O'`` is always id 0; other tags get ids in order of
            first appearance.

    Raises:
        IndexError: If a span lies outside ``text`` (negative start or end
            past the last character).
    """
    # Insertion-ordered dict doubles as the "seen" set and the final id table.
    label_map = {'O': 0}

    with open(path, "r", encoding="utf-8") as r, \
            open(save_path, "w", encoding="utf-8") as w:
        for line_no, raw in enumerate(r, start=1):
            if not raw.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(raw)
            text = record['text']
            text_label = ['O'] * len(text)

            for label_key, label_item in record['label'].items():
                B_label = "B-" + label_key
                I_label = "I-" + label_key
                label_map.setdefault(B_label, len(label_map))
                label_map.setdefault(I_label, len(label_map))

                for entity, positions in label_item.items():
                    # An entity may occur several times in one sentence;
                    # tag every occurrence, not just the first.
                    for span in positions:
                        start, end = span[0], span[1]
                        if start < 0 or end >= len(text_label):
                            # A negative start would otherwise be accepted by
                            # Python's negative indexing and tag the wrong char.
                            raise IndexError(
                                f"line {line_no}: span [{start}, {end}] of entity "
                                f"{entity!r} is outside text of length {len(text_label)}"
                            )
                        text_label[start] = B_label
                        for i in range(start + 1, end + 1):
                            text_label[i] = I_label

            tagged = {"text": text, "label": text_label}
            w.write(json.dumps(tagged, ensure_ascii=False) + "\n")

    if labels_path:  # Saved so training and prediction can share the same ids.
        with open(labels_path, "w", encoding="utf-8") as w:
            w.write(json.dumps(label_map, ensure_ascii=False) + "\n")
            


if __name__ == '__main__':
    # The dev and train splits only need their tags converted (dev first,
    # then train, so the loop variables end on the train paths).
    for path, save_path in (
        ("./data/NER/dev.json", "./data/NER/new/dev.json"),
        ("./data/NER/train.json", "./data/NER/new/train.json"),
    ):
        dimension_label(path, save_path)

    # The original data file is converted last and is the only run that
    # also writes the tag -> id table.
    labels_path = "./data/NER/new/labels.json"
    dimension_label(
        './data/NER/original_data.json',
        './data/NER/new/original_data.json',
        labels_path,
    )

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import json


class NERDataset(Dataset):
    """Dataset of tagged sentences read from a JSON-lines file.

    Every non-blank line of ``file_path`` must be a JSON object with a ``text``
    entry and a ``label`` entry (a sequence of tag names that are keys of
    ``labels_map``). Items are returned as fixed-length tensors of size
    ``max_length``.

    NOTE(review): label ids are aligned to positions in ``label`` starting at
    index 0, while the tokenizer is called with ``add_special_tokens=True``.
    If the tokenizer prepends a special token, tokens and labels are offset by
    one. ``post_processing`` in this file reads predictions with the same
    alignment, so confirm the intent before changing either side.
    """

    def __init__(self, tokenizer, file_path, labels_map, max_length=300):
        """Load all records from ``file_path`` into memory.

        Args:
            tokenizer: Object exposing ``encode_plus`` (called in ``__getitem__``).
            file_path: JSON-lines file with ``text`` and ``label`` per record.
            labels_map: Mapping from tag name to integer id.
            max_length: Length every returned tensor is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels_map = labels_map

        self.text_data = []
        self.label_data = []
        with open(file_path, "r", encoding="utf-8") as r:
            for raw in r:
                if not raw.strip():
                    # Skip blank lines instead of failing in json.loads.
                    continue
                record = json.loads(raw)
                self.text_data.append(record['text'])
                self.label_data.append(record['label'])

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``."""
        text = self.text_data[idx]
        labels = self.label_data[idx]

        # Tokenize one sentence, padded/truncated to max_length.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # Map tag names to ids, then truncate / right-pad to max_length.
        # Padding uses id 0 (presumably the 'O' tag — verify against labels_map).
        label_ids = [self.labels_map[tag] for tag in labels][:self.max_length]
        label_ids += [0] * (self.max_length - len(label_ids))

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.LongTensor(label_ids)
        }



In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import json


# Parse entities out of the model predictions
def post_processing(outputs, text, labels_map):
    """Decode BIO predictions for one sentence into ``{entity_type: [entity, ...]}``.

    The highest-scoring label at each position of the first batch item is
    looked up in ``labels_map`` and paired with the element of ``text`` at the
    same index. A ``B-`` tag starts a new entity (closing any open one), an
    ``I-`` tag extends the open entity if there is one, and any other tag
    closes the open entity. An entity's type is taken from its ``B-`` tag.

    NOTE(review): position ``i`` of the prediction is matched to ``text[i]``
    with no offset for special tokens; ``NERDataset`` in this file builds its
    labels with the same alignment — confirm before changing either side.

    Args:
        outputs: Model output exposing ``logits`` indexed as
            (batch, position, label).
        text: The sentence; iterated element by element.
        labels_map: Mapping from integer label id to tag name.

    Returns:
        dict mapping entity type to the list of entity strings found, in
        order of appearance.
    """
    _, predicted_labels = torch.max(outputs.logits, dim=2)
    predicted_tags = [labels_map[label_id] for label_id in predicted_labels[0].tolist()]

    result = {}
    entity = ""
    entity_type = ""
    # zip() stops at the shorter sequence, so text longer than the (truncated)
    # prediction no longer raises IndexError.
    for word_token, tag in zip(text, predicted_tags):
        if tag.startswith("B-"):
            # Store the previous entity under ITS type before switching type.
            if entity:
                result.setdefault(entity_type, []).append(entity)
            entity = word_token
            # Slice instead of split("-")[1] so hyphenated types stay intact.
            entity_type = tag[2:]
        elif tag.startswith("I-"):
            # Continue only an already-open entity; a stray I- tag is ignored
            # and never changes the type assigned by the B- tag.
            if entity:
                entity += word_token
        else:
            if entity:
                result.setdefault(entity_type, []).append(entity)
            entity = ""

    # Flush an entity that runs up to the last character.
    if entity:
        result.setdefault(entity_type, []).append(entity)
    return result

def main(labels_path="./data/NER/new/labels.json",
         model_name='./output/NER/',
         test_path="./data/NER/test.json",
         max_length=300):
    """Run the saved token-classification model over a test file and print results.

    For every non-blank JSON line in ``test_path`` this prints the sentence,
    its reference ``label`` entry, and the entities decoded by
    ``post_processing``.

    Args:
        labels_path: JSON file holding the ``{tag: id}`` mapping.
        model_name: Directory (or identifier) passed to ``from_pretrained`` for
            both the tokenizer and the model.
        test_path: JSON-lines file whose records have ``text`` and ``label``.
        max_length: Padding/truncation length given to the tokenizer.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the tag table and invert it to id -> tag for decoding.
    with open(labels_path, "r", encoding="utf-8") as r:
        labels = json.loads(r.read())
    labels_map = {label_id: label for label, label_id in labels.items()}

    # Load tokenizer and model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(labels_map))
    model.to(device)
    model.eval()  # make inference mode explicit

    # Load data.
    print("Start Load Test Data...")
    with open(test_path, "r", encoding="utf-8") as r:
        test_data = r.read().split('\n')

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for raw in test_data:
            if not raw.strip():
                continue

            data_dict = json.loads(raw)

            encoded_input = tokenizer(data_dict['text'], padding="max_length", truncation=True, max_length=max_length)
            input_ids = torch.tensor([encoded_input['input_ids']]).to(device)
            attention_mask = torch.tensor([encoded_input['attention_mask']]).to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            result = post_processing(outputs, data_dict['text'], labels_map)

            print(data_dict['text'])
            print(f"正确答案：{data_dict['label']}")

            print(f'输出：{result}')

# Run the test-set evaluation when this cell/script is executed directly.
if __name__ == '__main__':
    main()



In [4]:
pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.1
  Downloading Levenshtein-0.25.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[K     |████████████████████████████████| 177 kB 65 kB/s eta 0:00:01
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0
  Downloading rapidfuzz-3.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 63 kB/s eta 0:00:01
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.1 python-Levenshtein-0.25.1 rapidfuzz-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
from fuzzywuzzy import fuzz

# Two sample strings: the second is a prefix of the first.
string1, string2 = "凿空”西域,通西域。", "凿空”西域"

# Score the pair with fuzz.ratio and report it as a percentage.
similarity_ratio = fuzz.ratio(string1, string2)
print(f"相似度：{similarity_ratio}%")



相似度：67%
