In [31]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# import necessary packages
import numpy as np
import pandas as pd
from collections import Counter
import random
import re

# Print all the information of the dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
def read_csv(csv_path: str) -> np.array:
    '''
    Read the original CSV file (Dataset)
    :param csv_path: The path and name of the CSV file
    :return dataset: Dataset
    '''
    dataset = pd.read_csv(csv_path, header=None)
    dataset = np.array(dataset)
    print(dataset)
    return dataset

# Read the CSV file and convert it to np.array
dataset = read_csv('Original_Dataset.csv')

[['id' 'content' 'annotation']
 ['5ad67f86'
  '出院记录\n入院情况/主诉：左上肢麻木伴发作性言语不清5天”入院。\n入院情况/辅助检查：。\n入院诊断：1.颈椎病2.短暂性脑缺血发作\n诊治经过：患者入院后给予完善相关检查，给予改善循环，保护脑细胞，营养神经及补液对症支持治疗。\n血肝、肾功能，血糖，血脂，血凝血功能，输血前三项未见明显异常。\n血HCY：70mmol/l,电解质：Na+:160mmol/l。\n头颅MRI及DWI检查：未见明显异常。\n颈椎MRI：颈椎性改变。\n现经治疗患者症状基本缓解，要求出院，给予出院。\n'
  'T48\tE95f2a617 0 4\t出院记录\nT49\tE95f2a617 5 9\t入院情况\nT50\tE95f2a617 10 12\t主诉\nT1\tE320ca3f6 13 16\t左上肢\nT2\tE340ca71c 16 18\t麻木\nT3\tE1ceb2bd7 19 22\t发作性\nT4\tE340ca71c 22 26\t言语不清\nT5\tE1deb2d6a 26 28\t5天\nT51\tE95f2a617 33 37\t入院情况\nT52\tE95f2a617 38 42\t辅助检查\nT53\tE95f2a617 45 49\t入院诊断\nT6\tE370cabd5 52 55\t颈椎病\nT7\tE370cabd5 57 65\t短暂性脑缺血发作\nT54\tE95f2a617 66 70\t诊治经过\nT8\tE1deb2d6a 73 76\t入院后\nT55\tE360caa42 78 84\t完善相关检查\nT9\tE310ca263 87 91\t改善循环\nT10\tE310ca263 92 97\t保护脑细胞\nT11\tE310ca263 98 102\t营养神经\nT56\tE310ca263 103 105\t补液\nT14\tE360caa42 113 115\t血肝\nT15\tE360caa42 116 119\t肾功能\nT16\tE300ca0d0 120 122\t血糖\nT17\tE300ca0d0 123 125\t血脂\nT18\tE360caa42 126 131\t血凝血功能\nT19\tE360caa42

In [33]:
def remove_nan(data_array: np.array, column: int) -> np.array:
    '''
    Remove rows where a block contains NaN
    Because there are some NaN in the Annotation column
    :param data_array: Input dataset
    :param column: Column's block which contains NaN
    :return dataset: The processed dataset without any NaN
    '''
    # Because some blocks in the Annotation column is NaN
    remove = []
    block = np.array(data_array[:, column])
    for i in range(1, np.shape(block)[0]):
        str_block = block[i]
        if type(str_block) is float:
            str_block = str(str_block)
            if str_block == 'nan':
                remove.append(i)

    print('From the Annotations, the removed rows are %s' % str(remove))
    data_no_nan = np.delete(np.array(data_array), remove, axis=0)
    return data_no_nan

# Remove some (annotation) lines that contain NaN blocks
# column=2 means Annotation Column
data_no_nan = remove_nan(data_array=dataset, column=2)

From the Annotations, the removed rows are [735, 1248, 1254, 1722, 1782, 1817, 1877]


In [34]:
def remove_few_contents(data_array: np.array, column: int, threshold: int) -> np.array:
    '''
    Remove rows where a block contains fewer contents
    Because the contents are few and no need to keep
    :param data_array: The input dataset
    :param column: Column's block which contains fewer contents
    :return data_few_content: Output dataset
    '''
    remove_index = []
    Content_block = np.array(data_array[:, column])
    for i in range(1, np.shape(Content_block)[0]):
        Content_str = Content_block[i]
        if len(Content_str) < threshold:
            remove_index.append(i)

    print('From the Contents, the removed rows are %s' % str(remove_index))
    data_few_content = np.delete(np.array(data_array), remove_index, axis=0)
    return data_few_content

# Remove some (contents) lines that contain fewer contents
# column=1 means Contents Column
data_few_content = remove_few_contents(data_array=data_no_nan, column=1, threshold=3)

# Output some important information
print('The dimensionality of the dataset: ', np.shape(data_few_content)[0] - 1, 'rows X', np.shape(data_few_content)[1], 'columns')
print('This first column is the %s, for example, %s' % (data_few_content[0, 0], data_few_content[1, 0]))
print('This second column is the %s, for example,\n%s' % (data_few_content[0, 1], data_few_content[1, 1]))
print('This third column is the %s, for example,\n%s' % (data_few_content[0, 2], data_few_content[100, 2]))

From the Contents, the removed rows are [1250]
The dimensionality of the dataset:  1887 rows X 3 columns
This first column is the id, for example, 5ad67f86
This second column is the content, for example,
出院记录
入院情况/主诉：左上肢麻木伴发作性言语不清5天”入院。
入院情况/辅助检查：。
入院诊断：1.颈椎病2.短暂性脑缺血发作
诊治经过：患者入院后给予完善相关检查，给予改善循环，保护脑细胞，营养神经及补液对症支持治疗。
血肝、肾功能，血糖，血脂，血凝血功能，输血前三项未见明显异常。
血HCY：70mmol/l,电解质：Na+:160mmol/l。
头颅MRI及DWI检查：未见明显异常。
颈椎MRI：颈椎性改变。
现经治疗患者症状基本缓解，要求出院，给予出院。

This third column is the annotation, for example,
T16	E95f2a617 0 4	出院记录
T14	E8ff29ca5 9 12	胡雅丽
T1	E340ca71c 20 22	发热
T2	E1deb2d6a 22 24	3天
T15	E310ca263 25 27	入院
T3	E370cabd5 30 36	上呼吸道感染
T13	E17eb23f8 38 40	轻度
T5	E340ca71c 40 42	贫血
T6	E360caa42 52 55	血常规
T7	E300ca0d0 57 60	WBC
T8	E3c0cb3b4 60 70	4.34*109/l
T9	E300ca0d0 71 74	RBC
T10	E3c0cb3b4 74 85	4.39*1012/l
T11	E300ca0d0 86 89	HGB
T12	E3c0cb3b4 89 95	106g/l
R1	R742a31d5 Arg1:T2 Arg2:T1	
R2	R742a31d5 Arg1:T13 Arg2:T5	
R3	R494545ec Arg1:T8 Arg2:T7	
R4	R494545ec Arg1:T10 Arg2:T9	
R5	R494545ec Arg1:T12 

In [35]:
def remove_header(data_array: np.array) -> np.array:
    '''
    Remove the header from the dataset
    :param data_array: Input dataset
    :return data_no_header: Output Dataset
    '''
    data_no_header = np.delete(np.array(data_array), 0, axis=0)
    print(data_no_header)
    return data_no_header

# Remove the header
data_no_header = remove_header(data_array=data_few_content)

[['5ad67f86'
  '出院记录\n入院情况/主诉：左上肢麻木伴发作性言语不清5天”入院。\n入院情况/辅助检查：。\n入院诊断：1.颈椎病2.短暂性脑缺血发作\n诊治经过：患者入院后给予完善相关检查，给予改善循环，保护脑细胞，营养神经及补液对症支持治疗。\n血肝、肾功能，血糖，血脂，血凝血功能，输血前三项未见明显异常。\n血HCY：70mmol/l,电解质：Na+:160mmol/l。\n头颅MRI及DWI检查：未见明显异常。\n颈椎MRI：颈椎性改变。\n现经治疗患者症状基本缓解，要求出院，给予出院。\n'
  'T48\tE95f2a617 0 4\t出院记录\nT49\tE95f2a617 5 9\t入院情况\nT50\tE95f2a617 10 12\t主诉\nT1\tE320ca3f6 13 16\t左上肢\nT2\tE340ca71c 16 18\t麻木\nT3\tE1ceb2bd7 19 22\t发作性\nT4\tE340ca71c 22 26\t言语不清\nT5\tE1deb2d6a 26 28\t5天\nT51\tE95f2a617 33 37\t入院情况\nT52\tE95f2a617 38 42\t辅助检查\nT53\tE95f2a617 45 49\t入院诊断\nT6\tE370cabd5 52 55\t颈椎病\nT7\tE370cabd5 57 65\t短暂性脑缺血发作\nT54\tE95f2a617 66 70\t诊治经过\nT8\tE1deb2d6a 73 76\t入院后\nT55\tE360caa42 78 84\t完善相关检查\nT9\tE310ca263 87 91\t改善循环\nT10\tE310ca263 92 97\t保护脑细胞\nT11\tE310ca263 98 102\t营养神经\nT56\tE310ca263 103 105\t补液\nT14\tE360caa42 113 115\t血肝\nT15\tE360caa42 116 119\t肾功能\nT16\tE300ca0d0 120 122\t血糖\nT17\tE300ca0d0 123 125\t血脂\nT18\tE360caa42 126 131\t血凝血功能\nT19\tE360caa42 132 137\t输血前三项\nT58\tE18eb258b

In [36]:
def remove_relation(data_array: np.array, column: int) -> np.array:
    '''
    Remove the Relationship Information from Annotations
    :param data_array: Input dataset
    :return data_array: Output dataset which removed the relation information from the Annotations
    '''
    Annotation = data_array[:, column]
    for i in range(np.shape(Annotation)[0]):
        Annotation_str = Annotation[i]
        first_R_loc = Annotation_str.find('R')

        if 'R' in str(Annotation_str[first_R_loc+1:first_R_loc+5]):
            Annotation_str = Annotation_str.replace(Annotation_str[first_R_loc:], '')
            data_array[i, column] = Annotation_str
        else:
            new_R_loc = first_R_loc
            while 'R' not in str(Annotation_str[new_R_loc+1:new_R_loc+5]):
                first_R_loc = Annotation_str[new_R_loc+1:].find('R')
                new_R_loc += first_R_loc
                new_R_loc += 1

            Annotation_str = Annotation_str.replace(Annotation_str[new_R_loc:], '')
            data_array[i, column] = Annotation_str
    return data_array

# Remove the relational information from the Annotations
data_no_relation = remove_relation(data_array=data_no_header, column=2)
print('After removed the relationship information, the third column is\n%s' % data_no_relation[99, 2])

After removed the relationship information, the third column is
T16	E95f2a617 0 4	出院记录
T14	E8ff29ca5 9 12	胡雅丽
T1	E340ca71c 20 22	发热
T2	E1deb2d6a 22 24	3天
T15	E310ca263 25 27	入院
T3	E370cabd5 30 36	上呼吸道感染
T13	E17eb23f8 38 40	轻度
T5	E340ca71c 40 42	贫血
T6	E360caa42 52 55	血常规
T7	E300ca0d0 57 60	WBC
T8	E3c0cb3b4 60 70	4.34*109/l
T9	E300ca0d0 71 74	RBC
T10	E3c0cb3b4 74 85	4.39*1012/l
T11	E300ca0d0 86 89	HGB
T12	E3c0cb3b4 89 95	106g/l



In [37]:
def string_data(data_array: np.array, column: int) -> np.array:
    '''
    String each block for one column
    :param data_array: Input dataset
    :param column: column=1 indicating the Contents column
    :return: Output dataset
    '''
    contents = data_array[:, column]
    for i in range(np.shape(contents)[0]):
        contents_row = str(contents[i])
        data_array[i, column] = contents_row
    return data_array

# String the Contents (column=1)
data_column_str = string_data(data_array=data_no_relation, column=1)
print(data_column_str)

[['5ad67f86'
  '出院记录\n入院情况/主诉：左上肢麻木伴发作性言语不清5天”入院。\n入院情况/辅助检查：。\n入院诊断：1.颈椎病2.短暂性脑缺血发作\n诊治经过：患者入院后给予完善相关检查，给予改善循环，保护脑细胞，营养神经及补液对症支持治疗。\n血肝、肾功能，血糖，血脂，血凝血功能，输血前三项未见明显异常。\n血HCY：70mmol/l,电解质：Na+:160mmol/l。\n头颅MRI及DWI检查：未见明显异常。\n颈椎MRI：颈椎性改变。\n现经治疗患者症状基本缓解，要求出院，给予出院。\n'
  'T48\tE95f2a617 0 4\t出院记录\nT49\tE95f2a617 5 9\t入院情况\nT50\tE95f2a617 10 12\t主诉\nT1\tE320ca3f6 13 16\t左上肢\nT2\tE340ca71c 16 18\t麻木\nT3\tE1ceb2bd7 19 22\t发作性\nT4\tE340ca71c 22 26\t言语不清\nT5\tE1deb2d6a 26 28\t5天\nT51\tE95f2a617 33 37\t入院情况\nT52\tE95f2a617 38 42\t辅助检查\nT53\tE95f2a617 45 49\t入院诊断\nT6\tE370cabd5 52 55\t颈椎病\nT7\tE370cabd5 57 65\t短暂性脑缺血发作\nT54\tE95f2a617 66 70\t诊治经过\nT8\tE1deb2d6a 73 76\t入院后\nT55\tE360caa42 78 84\t完善相关检查\nT9\tE310ca263 87 91\t改善循环\nT10\tE310ca263 92 97\t保护脑细胞\nT11\tE310ca263 98 102\t营养神经\nT56\tE310ca263 103 105\t补液\nT14\tE360caa42 113 115\t血肝\nT15\tE360caa42 116 119\t肾功能\nT16\tE300ca0d0 120 122\t血糖\nT17\tE300ca0d0 123 125\t血脂\nT18\tE360caa42 126 131\t血凝血功能\nT19\tE360caa42 132 137\t输血前三项\nT58\tE18eb258b

In [38]:
def summary_entity(data_array: np.array, column: int):
    '''
    Summary the entity information from the Annotations
    :param data_array: Input dataset
    :param column: column=2 indicating the Annotations
    :return: Some statistical info
    '''
    # Summary the number of entities
    entity = []
    entities_line = data_array[:, column]
    for i in range(np.shape(entities_line)[0]):
        entity_split = entities_line[i].split()
        for j in range(np.shape(entity_split)[0]):
            if str(entity_split[j][0]) == 'E':
                entity_first_char = entity_split[j]
                if len(entity_first_char) == 9 and 'EB' not in entity_first_char:
                    entity.append(entity_first_char)
                else:
                    continue
            else:
                continue
    
    print('The amount of training entities is %s' % np.shape(entity)[0])
    entity_dict = dict(Counter(entity))
    print('There are %s entities in total.' % entity_dict.__len__())
    print({key: value for key, value in entity_dict.items()})

# Summary some entity info
summary_entity(data_array=data_column_str, column=2)

The amount of training entities is 95808
There are 34 entities in total.
{'E95f2a617': 3221, 'E320ca3f6': 6338, 'E340ca71c': 22209, 'E1ceb2bd7': 3706, 'E1deb2d6a': 9744, 'E370cabd5': 6196, 'E360caa42': 5268, 'E310ca263': 6948, 'E300ca0d0': 9490, 'E18eb258b': 4526, 'E3c0cb3b4': 6280, 'E1beb2a44': 1663, 'E3d0cb547': 1025, 'E14eb1f3f': 406, 'E8ff29ca5': 1676, 'E330ca589': 1487, 'E89f29333': 1093, 'E8ef29b12': 217, 'E1eeb2efd': 1637, 'E1aeb28b1': 209, 'E17eb23f8': 670, 'E87f05176': 407, 'E88f05309': 355, 'E19eb271e': 152, 'E8df2997f': 135, 'E94f2a484': 584, 'E13eb1dac': 58, 'E85f04e50': 6, 'E8bf057c2': 7, 'E8cf297ec': 6, 'E8ff05e0e': 18, 'E87e38583': 6, 'E86f04fe3': 1, 'E8cf05955': 64}


In [39]:
def remove_irregular_anno(data_array: np.array, column: int) -> np.array:
    '''
    Remove the rows where the annotations are not regular
    :param data_array: Input dataset
    :param column: column=2 indicating the Annotations
    :return: Output dataset
    '''
    Draft_labels = data_array[:, column]
    index_problem = []
    for i in range(np.shape(Draft_labels)[0]):
        Draft_label_split = str(Draft_labels[i]).split()
        if np.shape(Draft_label_split)[0] % 5 != 0:
            index_problem.append(i)
    data_array = np.delete(np.array(data_array), index_problem, axis=0)
    print(data_array)
    return data_array
    
# Remove the rows where the annotations are not regular
data_irregular_anno = remove_irregular_anno(data_array=data_column_str, column=2)

[['5ad67f86'
  '出院记录\n入院情况/主诉：左上肢麻木伴发作性言语不清5天”入院。\n入院情况/辅助检查：。\n入院诊断：1.颈椎病2.短暂性脑缺血发作\n诊治经过：患者入院后给予完善相关检查，给予改善循环，保护脑细胞，营养神经及补液对症支持治疗。\n血肝、肾功能，血糖，血脂，血凝血功能，输血前三项未见明显异常。\n血HCY：70mmol/l,电解质：Na+:160mmol/l。\n头颅MRI及DWI检查：未见明显异常。\n颈椎MRI：颈椎性改变。\n现经治疗患者症状基本缓解，要求出院，给予出院。\n'
  'T48\tE95f2a617 0 4\t出院记录\nT49\tE95f2a617 5 9\t入院情况\nT50\tE95f2a617 10 12\t主诉\nT1\tE320ca3f6 13 16\t左上肢\nT2\tE340ca71c 16 18\t麻木\nT3\tE1ceb2bd7 19 22\t发作性\nT4\tE340ca71c 22 26\t言语不清\nT5\tE1deb2d6a 26 28\t5天\nT51\tE95f2a617 33 37\t入院情况\nT52\tE95f2a617 38 42\t辅助检查\nT53\tE95f2a617 45 49\t入院诊断\nT6\tE370cabd5 52 55\t颈椎病\nT7\tE370cabd5 57 65\t短暂性脑缺血发作\nT54\tE95f2a617 66 70\t诊治经过\nT8\tE1deb2d6a 73 76\t入院后\nT55\tE360caa42 78 84\t完善相关检查\nT9\tE310ca263 87 91\t改善循环\nT10\tE310ca263 92 97\t保护脑细胞\nT11\tE310ca263 98 102\t营养神经\nT56\tE310ca263 103 105\t补液\nT14\tE360caa42 113 115\t血肝\nT15\tE360caa42 116 119\t肾功能\nT16\tE300ca0d0 120 122\t血糖\nT17\tE300ca0d0 123 125\t血脂\nT18\tE360caa42 126 131\t血凝血功能\nT19\tE360caa42 132 137\t输血前三项\nT58\tE18eb258b

In [40]:
def make_labels(data_array: np.array) -> np.array:
    '''
    Making labels for the dataset, and shuffle the dataset by sentence
    :param data_array: Input dataset
    :return Draft_Dataset: Output dataset
    :return final_output_labels: Output labels
    '''
    Draft_Dataset = data_array[:, 1]
    Draft_labels = data_array[:, 2]
    final_output_labels = []
    for i in range(np.shape(Draft_labels)[0]):
        # Split the labels
        Draft_label_split = str(Draft_labels[i]).split()
        # print(np.shape(Draft_label_split)[0])

        # Make Labels for the Dataset
        # First, make beginning and end labels
        final_labels = np.empty(len(Draft_Dataset[i]), dtype='U15')
        for n in range(np.shape(Draft_label_split)[0]):
            # The start character label --> B-entity
            if (n - 2) % 5 == 0:
                label_now = str(Draft_label_split[n-1])
                label = 'B-' + label_now
                start_index = int(Draft_label_split[n])
                final_labels[start_index] = label

            # The end character label --> E-entity
            if (n - 3) % 5 == 0:
                # If there is only one character (singleton), mark it as B-entity
                if int(Draft_label_split[n]) - int(Draft_label_split[n-1]) == 1:
                    label_now = str(Draft_label_split[n-2])
                    label = 'S-' + label_now
                    end_index = int(Draft_label_split[n])
                    final_labels[end_index-1] = label
                else:
                    label_now = str(Draft_label_split[n-2])
                    label = 'E-' + label_now
                    end_index = int(Draft_label_split[n])
                    final_labels[end_index-1] = label
        
        # Make intermediate and non-corresponding Labels
        # If it in the "begin-end", then mark as "I-entity" (Intermediate)
        # If it outside, then mark as "0"
        start_temp = []
        end_temp = []
        label_temp = []
        for m in range(np.shape(Draft_label_split)[0]):
            if (m - 2) % 5 == 0:
                start_temp.append(int(Draft_label_split[m]))

            if (m - 3) % 5 == 0:
                end_temp.append(int(Draft_label_split[m]))

            if (m - 1) % 5 == 0:
                label_temp.append(str(Draft_label_split[m]))

        for o in range(np.shape(start_temp)[0]):
            for z in range(np.shape(final_labels)[0]):
                if z > start_temp[o] and z < end_temp[o] - 1:
                    final_labels[z] = 'I-' + label_temp[o]

        for p in range(np.shape(final_labels)[0]):
            if final_labels[p] == '':
                final_labels[p] = "O"

        final_output_labels.append(final_labels)

    # Shuffle the Dataset (by sentences)
    index = [i for i in range(np.shape(Draft_Dataset)[0])]
    random.shuffle(index)
    Draft_Dataset = np.array(Draft_Dataset)[index]
    final_output_labels = np.array(final_output_labels)[index]
    return Draft_Dataset, final_output_labels

# Making labels w.r.t. the Dataset, and shuffle the dataset by sentence
data, labels = make_labels(data_array=data_irregular_anno)
print('The dimensionality of the Data set is %s' % np.shape(data)[0])
print('The dimensionality of the labels set is %s' % np.shape(labels)[0])

The dimensionality of the Data set is 1881
The dimensionality of the labels set is 1881




In [41]:
def paragraph2sentence(data_array: np.array, labels_array: np.array) -> np.array:
    '''
    Cut the paragraph into sentences using regular expression
    :param data_array: Input dataset
    :param labels_array: Input labels
    :return total_dataset: Dataset containing data and labels
    '''
    sentence_all = []
    labels_all = []
    statistics_all = []
    char_org_all = []
    # char_cut_all = []
    for i in range(np.shape(data_array)[0]):
        paragraph = data_array[i]
        labels = labels_array[i]

        for k in paragraph:
            char_org_all.append(k)

        # Cut the paragraph into sentences
        sentences = re.split('(。|！)', paragraph)

        cut_index = 0
        for j in range(np.shape(sentences)[0]):
            # length of current sentence
            len_sentence = len(sentences[j])

            # For the dataset
            sentence_all.append(list(sentences[j]))
            # for m in sentences[j]:
            #     char_cut_all.append(m)
            sentence_all.append(['end'])
            
            # For the labels
            labels_all.append(list(labels[cut_index:cut_index+len_sentence]))
            cut_index += len_sentence
            labels_all.append([' '])

    # Combine the data and labels
    rows = np.shape(sentence_all)[0]
    temp_sentence = np.reshape(sentence_all, [rows, 1])
    temp_labels = np.reshape(labels_all, [rows, 1])
    temp_dataset = np.concatenate((temp_sentence, temp_labels), axis=1)

    index = []
    for i in range(np.shape(temp_dataset)[0]):
        if i < np.shape(temp_dataset)[0] - 1:
            if temp_dataset[i, 0] != ['end']:
                if len(temp_dataset[i, 0]) <= 5 or '*' in temp_dataset[i, 0] or ' ' in temp_dataset[i, 0]:
                    index.append(i)
                    index.append(i+1)
                else:
                    continue

        if 0 < i < np.shape(temp_dataset)[0] - 1:
            if temp_dataset[i, 0] == ['end']:
                if str(temp_dataset[i-1, 1])[0] == 'B' and str(temp_dataset[i+1, 1])[0] == 'I':
                    index.append(i)
                    print(i)

        if 0 < i < np.shape(temp_dataset)[0] - 1:
            if temp_dataset[i, 0] == ['end']:
                if str(temp_dataset[i-1, 1])[0] == 'I' and str(temp_dataset[i+1, 1])[0] == 'I':
                    index.append(i)
                    print(i)

        if i > 0:
            if temp_dataset[i, 0] == ['end']:
                if str(temp_dataset[i-1, 1])[0] == 'I':
                    temp_dataset[i-1, 1] = 'E' + str(temp_dataset[i-1, 1])[1:]
                    print(i)

    new_dataset = np.delete(np.array(temp_dataset), index, axis=0)

    # Summary the length of every sentence
    for i in range(np.shape(new_dataset)[0]):
        length_sentence = len(new_dataset[i, 0])
        if length_sentence > 1:
            # print(length_sentence)
            statistics_all.append(length_sentence)

    new_sentence = new_dataset[:, 0]
    new_label = new_dataset[:, 1]

    # Save the character and labels
    reshape_dataset = []
    reshape_labels = []
    for i in range(len(new_sentence)):
        for j in new_sentence[i]:
            reshape_dataset.append(j)

    for i in range(len(new_label)):
        for j in new_label[i]:
            reshape_labels.append(j)

    rows = np.shape(reshape_dataset)[0]
    reshape_dataset = np.reshape(reshape_dataset, [rows, 1])
    reshape_labels = np.reshape(reshape_labels, [rows, 1])
    total_dataset = np.concatenate((reshape_dataset, reshape_labels), axis=1)
    return total_dataset, statistics_all

# cut the paragraph into sentences and make dataset
total_dataset, statistics_all = paragraph2sentence(data_array=data, labels_array=labels)
print('The max of the paragraphs\'s characters is %s' % np.max(statistics_all))
print('The mean of the paragraphs\'s characters is %s' % np.mean(statistics_all))
print('The median of the paragraphs\'s characters is %s' % np.median(statistics_all))

  return array(a, dtype, copy=False, order=order)


The max of the paragraphs's characters is 476
The mean of the paragraphs's characters is 49.85359014994514
The median of the paragraphs's characters is 36.0


In [42]:
def full_to_half(sentence: str) -> str:
    '''
    Convert full-width to half-width
    :param sentence: a sentence or a punctuation
    :return change_sentence: Convert the full-width to half-width
    '''
    change_sentence = ""
    for word in sentence:
        inside_code = ord(word)
        if inside_code == 12288:
            inside_code = 32
        elif inside_code >= 65281 and inside_code <= 65374:
            inside_code -= 65248
        change_sentence += chr(inside_code)
    return change_sentence


def Q2B(uchar):
    """单个字符 全角转半角"""
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e: #转完之后不是半角字符返回原来的字符
        return uchar
    return chr(inside_code)


def stringQ2B(ustring):
    """把字符串全角转半角"""
    return "".join([Q2B(uchar) for uchar in ustring])

In [43]:
def remove_meaningless_rows(data_array: np.array) -> np.array:
    '''
    Remove some meaningless rows, e.g., *, ' ', or '""""'
    Make all the Chinese characters into English
    :param data_array: Input dataset
    :return new_dataset: Output dataset
    '''
    remove_index = []
    for i in range(np.shape(data_array)[0]):
        if data_array[i, 0] == '\n':
            remove_index.append(i)

        if data_array[i, 1] == " ":
            data_array[i, 1] = str(data_array[i, 1]).replace(" ", '')

        if "E13eb1dac" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E85f04e50" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8bf057c2" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8cf297ec" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8ff05e0e" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E87e38583" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E86f04fe3" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8cf05955" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E14eb1f3f" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E89f29333" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8ef29b12" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E1aeb28b1" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E19eb271e" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E8df2997f" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E87f05176" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        if "E88f05309" in data_array[i, 1]:
            data_array[i, 1] = 'O'

        # Convert possible full-width to half-width
        data_array[i, 0] = stringQ2B(data_array[i, 0])

    new_dataset = np.delete(np.array(data_array), remove_index, axis=0)
    return new_dataset

new_dataset = remove_meaningless_rows(total_dataset)
print(np.shape(new_dataset))

(406526, 2)


In [44]:
# combine = [3, 5, 7, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# for c in combine:
#     end_index = []
#     for i in range(np.shape(new_dataset)[0]):
#         if new_dataset[i, 0] == "end":
#             end_index.append(i)
#
#     delete_index = []
#     for j in range(len(end_index)):
#         if j < len(end_index) - 1:
#             interval = end_index[j+1] - end_index[j]
#             if interval < c:
#                 delete_index.append(end_index[j+1])
#
#     new_dataset = np.delete(np.array(new_dataset), delete_index, axis=0)
#     print(np.shape(new_dataset))

In [45]:
cut = [450, 425, 400, 375, 350, 325, 300, 275, 250, 225, 200, 175, 150, 125, 100]

for threshold in cut:
    end_index = []
    for i in range(np.shape(new_dataset)[0]):
        if new_dataset[i, 0] == 'end':
            end_index.append(i)

    insert = []
    for j in range(len(end_index)):
        if j < len(end_index) - 1:
            interval = end_index[j+1] - end_index[j]
            if interval > threshold:
                insert.append(int((end_index[j+1] - end_index[j]) / 2) + end_index[j])

    index = 0
    temp_dataset = new_dataset[:, :]
    for k in insert:
        random_num = random.sample(range(-30, 30), 60)
        for loc in random_num:
            if (new_dataset[k+loc, 0] == ',' or new_dataset[k+loc, 0] == '.'
                or new_dataset[k+loc, 0] == ';' or new_dataset[k+loc, 0] == '、') \
                    and new_dataset[k+loc, 1] == 'O':
                temp_dataset = np.insert(temp_dataset, k+loc+index, values=['end', ''], axis=0)
                index += 1
                break

    new_dataset = temp_dataset

    print(np.shape(temp_dataset))
    print(len(insert))
    print(index)

(406528, 2)
2
2
(406530, 2)
2
2
(406531, 2)
1
1
(406533, 2)
2
2
(406533, 2)
0
0
(406538, 2)
5
5
(406549, 2)
11
11
(406561, 2)
12
12
(406578, 2)
17
17
(406608, 2)
30
30
(406659, 2)
51
51
(406747, 2)
89
88
(406928, 2)
183
181
(407233, 2)
310
305
(407807, 2)
588
574


In [46]:
# end_index = []
# for i in range(np.shape(new_dataset)[0]):
#     if new_dataset[i, 0] == "end":
#         end_index.append(i)
#
# delete_index = []
# for j in range(len(end_index)):
#     if j < len(end_index) - 1:
#         interval = end_index[j+1] - end_index[j]
#         if interval < 40:
#             delete_index.append(end_index[j+1])
#
# new_dataset = np.delete(np.array(new_dataset), delete_index, axis=0)
# print(np.shape(new_dataset))

In [47]:
end_index = []
for i in range(np.shape(new_dataset)[0]):
    if new_dataset[i, 0] == "end":
        end_index.append(i)

interval = []
for j in range(len(end_index)):
    if j < len(end_index) - 1:
        interval.append(end_index[j+1] - end_index[j])

print('mean', np.mean(interval))
print('median', np.median(interval))
print('max', max(interval))
print('min', min(interval))

bigger = []
for i in interval:
    if i > 200:
        bigger.append(i)

print(bigger)
print(len(bigger))

mean 42.99968364441632
median 39.0
max 196
min 5
[]
0


In [48]:
def train_val_test(data_array: np.array):
    '''
    Make training set and testing set
    :param data_array:
    :return train_dataset: Training set
    :return val_dataset: Validation set
    :return test_dataset: Testing set
    '''
    # Training set: Validation set: Testing set = 8:1:1
    [rows, _] = np.shape(data_array)
    train_rows = int(rows / 10 * 9)

    # Split the dataset
    train_dataset = data_array[0:train_rows, :]
    val_dataset = data_array[train_rows:, :]

    # Convert them into pandas DataFrame
    train_dataset = pd.DataFrame(train_dataset)
    val_dataset = pd.DataFrame(val_dataset)
    return train_dataset, val_dataset

# Split Training set, Validation set and Testing set
train, val = train_val_test(data_array=new_dataset)

In [49]:
def save_dataset(path: str, train: pd.DataFrame, val: pd.DataFrame):
    '''
    Save the Training set, Validation set and Testing set
    :param path: Saved path
    :param train: Training set
    :param val: Validation set
    :param test: Testing set
    '''
    # Set index=False and header=None, don't save the header and left index
    train.to_csv(path + 'train', sep=' ', index=False, header=None, encoding='utf-8')
    val.to_csv(path + 'dev', sep=' ', index=False, header=None, encoding='utf-8')

# Save the dataset
path = 'data/'
save_dataset(path=path, train=train, val=val)

In [50]:
setences = []
k = 0
index = 0
for i in range(np.shape(new_dataset)[0]):
    if new_dataset[i, 0] == "end":
        sentence = new_dataset[k:i, 0]
        # labels = new_dataset[k:i, 1]

        # print('labels length', len(labels))

        sentence = np.reshape(sentence, [-1])
        # print('sentence length', len(sentence))

        # labels = np.reshape(labels, [-1])

        sentence = ''.join(sentence)

        print(sentence)
        # print(labels)
        # print('\n')

        k = i
        index += 1

print(index)

出院记录入院情况:1、患者王禾仙,女,55岁,因“胸闷心悸3月余,加重4天”入院
end入院情况/体格检查/胸部:胸廓对称无畸形,双侧呼吸运动一致,双肺呼吸音清,两肺叩诊呈清音,双肺未闻及干、湿罗音
end心前区无隆起,未及震颤,心界不大,心律不齐,心率130次/分,心尖区可闻及III级杂音,并向腋下传导双下肢浮肿,浮肿至小腿部
end,压之凹陷出院诊断:1.高血压1)高血压性心脏病2)心房颤动(心房纤颤)3)心功能Ⅲ级2.2型糖尿病
end出院记录入院情况:患者李细香,女,83岁,因“咯血5小时”入院
end体温:36.9℃脉搏:116次/分呼吸:38次/分血压:120/70mmhg胸部CT:1.两肺病变,考虑两肺感染伴右肺支扩
end2.胸主动脉管壁钙化
end3.右侧胸腔微量积液
end入院诊断:咯血原因待查出院医嘱:1.用药指导:单硝酸异山梨酯40mg口服1次/天,酒石酸美托洛尔半片口服2次/天
end阿托伐他丁胶囊1粒口服1次/天,酌情使用阿斯匹林肠溶片1片/次口服1次/日
end出院情况:患者一般情况良好,无咯血,无恶心、呕吐,无明显胸闷、胸痛,无腹痛、腹泻
end查体:体温:36.4℃脉搏:98次/分呼吸:21次/分血压:124/70mmHg出院诊断:1.支气管扩张伴咯血2.冠状动脉粥样硬化性心脏病1)急性前间壁心肌梗死诊治经过:患者入院后完善相关检查
end,血沉ESR20mm/h;钾K3.3mmol/L;CK(9-28)304U/L
end、CK-MB43.5U/L,CK(9-29)232U/L、CK-MB32.8U/L;心电图(9-28):心电图示:窦性心律
end,心率114次/分,V1-V4导联T波倒置;(9-28)肌钙蛋白-I3.868ng/mlB型纳尿肽前体256pg/ml肌钙蛋白-I5.74ng/mlB型纳尿肽前体8656pg/ml;血常规:WBC17.13X10^9
end,N%96.1%;血气分析:PCO234.9mmHg,PO276mmHg;心电图(9-29):窦性心律,心率100次/分,V1-V5T波倒置;治疗上给予抗炎、化痰、增强免疫力、扩冠等综合治疗
end;复查心肌酶、CTnI、心电图示:CK27U/LCK-MB11.5U/L;肌钙蛋白0.001ng/ml
end心电图:窦性心律,心率75次/分,II、III、avF导联,