我的csv文件里的'note'列里面有需要进行截断的文本，['CHIEF COMPLAINT',‘PAST SURGICAL HISTORY’,'REVIEW OF SYSTEMS','EMERGENCY DEPARTMENT COURSE','GYNECOLOGIC HISTORY','PAST MEDICAL HISTORY','HISTORY of PRESENT ILLNESS','FAMILY HISTORY/SOCIAL HISTORY']，当这些节标题出现的时候我需要对note里面的文本进行截断，以这些节标题作为key，以这些节标题后对应的文本为values,保存到字典中，最后以'encounter_id'列里对应的id存到json中

In [29]:
import pandas as pd
import json

# 读取CSV文件
data = pd.read_csv('./MEDIQA-Chat-Training-ValidationSets-Feb-10-2023/TaskB/TaskB-TrainingSet.csv')

# 需要截断的节标题
section_headers = ['CHIEF COMPLAINT','PAST SURGICAL HISTORY','REVIEW OF SYSTEMS','RESULTS','ASSESSMENT AND PLAN','EMERGENCY DEPARTMENT COURSE','GYNECOLOGIC HISTORY','PAST MEDICAL HISTORY','HISTORY OF PRESENT ILLNESS','FAMILY HISTORY/SOCIAL HISTORY','PHYSICAL EXAMINATION','VITALS REVIEWED']

# 用来组成输出json的id列
id_col = 'encounter_id'

def split_text(text, keywords):
    keyword_locations = {}
    for keyword in keywords:
        index = text.find(keyword)
        if index != -1:
            keyword_locations[keyword]=index
        else:
            keyword_locations[keyword]=len(text)
    
    keyword_locations = sorted(keyword_locations.items(), key=lambda d:d[1])
    result = []
    for i in range(len(keyword_locations)):
        start = keyword_locations[i][1] + len(keyword_locations[i][0])
        if i == len(keyword_locations)-1:
            end = len(text)+1 
        else:
            end = keyword_locations[i+1][1]
        if i == 0 and start != 0:
            start = len(keyword_locations[i][0])+1
           
        result.append(text[start:end].strip())
        
    return keyword_locations,result


# 初始化输出json对象
output = []

# 遍历每个encounter_id
for id_value in data[id_col].unique():
    # 创建该encounter_id对应的输出字典
    encounter_dict = {}
    
    # 获取该encounter_id对应的行
    encounter_rows = data[data[id_col] == id_value]
    
    # 遍历该encounter_id对应的所有行
    for index, row in encounter_rows.iterrows():
        # section_header = row['section_header']
        note = row['note']
        section_header=[]
        for sec in section_headers:
            if sec in note:
                # print(sec)
                section_header.append(sec)
        # 判断该行是否包含需要截断的节标题
        # if section_header in section_headers:
        #     # 截断文本
        #     note = note.split(section_header, 1)[-1]
            
        #     # 将截断后的文本以对应的节标题为key，添加到输出字典中
        #     if section_header in encounter_dict:
        #         encounter_dict[section_header] += note
        #     else:
        #         encounter_dict[section_header] = note
        section_dict,res = split_text(keywords= section_header,text=note)
        # print(res)
        for i,sec in enumerate(section_dict):
            # print(i)
            encounter_dict[section_dict[i][0]]= res[i]
    
    # 将该encounter_id和对应的字典添加到输出json对象中
    output.append({id_col: id_value, 'sections': encounter_dict})

# 将输出json对象保存到文件中
with open('output.json', 'w') as f:
    json.dump(output, f)

尝试进行句子截断，使用chunking方式

![image.png](attachment:image.png)

In [None]:
def bart_preprocessing(x):
    # replacing special line-break characters in text by whitespace
    return x.replace('\n', ' ').replace('\t', ' ')

In [None]:
#%% helper functions for chunking method
def compute_break_index(utterance_lengths, start_idx, end_idx, overlap):
    # 计算需要切割的句子index值（看看在哪切割）
    assert overlap >= 0.0 and overlap <= 1.0
    num_utterances = end_idx - start_idx
    fragment_length = sum(utterance_lengths[start_idx:end_idx])
    running_length = 0
    idx = end_idx
    # 窗口往回推，推到满足设置overlap重叠部分
    for i in range(num_utterances):
        running_overlap = running_length / float(fragment_length)
        if running_overlap >= overlap:
            return idx
        idx -= 1
        running_length += utterance_lengths[idx]
    return start_idx + 1

def chunk_conversation(x, header_length, fragment_length, fragment_overlap):
    num_utterances = len(x)
    utterance_lengths = []
    for i in range(len(x)):
        v = len(x[i]["utterance"].split())
        # 计算每句对话的长度
        utterance_lengths.append(v)
    total_length = sum(utterance_lengths)
    header_utterances = [] # 头对话
    fragments = [] # 切割的对话窗口
    if total_length <= header_length + fragment_length:
        # 如果全部对话很短，直接就不用分了
        header_utterances = x
    else:
        idx = 0
        h_length = 0
        # getting the header. 获得头对话
        while h_length + utterance_lengths[idx] <= header_length:
            header_utterances.append(x[idx])
            h_length += utterance_lengths[idx]
            idx += 1

        # TODO: check that I'm not off by one.
        # getting the fragments. 获得分割后的对话窗口
        while idx < num_utterances:
            f_length = 0
            start_idx = idx
            fragment_utterances = []
            while idx < num_utterances and f_length + utterance_lengths[idx] <= fragment_length:
                fragment_utterances.append(x[idx])
                f_length += utterance_lengths[idx]
                idx += 1
            if len(fragment_utterances) >= 1:
                # 把切割的对话保存下来
                fragments.append(fragment_utterances)

            # prevent complete overlap. 以防分割对话重叠
            if idx < num_utterances:
                break_idx = compute_break_index(utterance_lengths, start_idx, idx, fragment_overlap)
                assert break_idx != start_idx
                idx = break_idx

    return header_utterances, fragments

In [None]:
def serialize_conversation_fragments(header_utterances, fragments, 
utterance_separator_str=' ', header_fragment_separator_str='...', continuation_str='...'):
    header_strs = []
    for u in header_utterances:
        s = format_line(u)
        header_strs.append(s)

    fragment_strs_lst = []
    for f in fragments:
        f_strs = []
        for u in f:
            s = format_line(u)
            f_strs.append(s)
        fragment_strs_lst.append(f_strs)

    out_strs = []
    header_str = utterance_separator_str.join(header_strs)
    if len(fragment_strs_lst) == 0:
        out_strs.append(header_str)
    else:
        num_fragments = len(fragment_strs_lst)
        assert num_fragments >= 1

        for i, f_strs in enumerate(fragment_strs_lst):
            fragment_str = utterance_separator_str.join(f_strs)

            # first fragment
            if i == 0:
                out_s = utterance_separator_str.join([header_str, fragment_str])
                if len(fragment_strs_lst) > 1:
                    out_s = utterance_separator_str.join([out_s, continuation_str])
            # inner fragments
            elif i < num_fragments - 1:
                out_s = utterance_separator_str.join([header_str, header_fragment_separator_str, fragment_str, continuation_str])
            # last fragment
            else:
                out_s = utterance_separator_str.join([header_str, header_fragment_separator_str, fragment_str])

            out_strs.append(out_s)
    return out_strs

In [None]:
#%% main APIs
def read_data(filename, cid="cid", stringify=True):
    """Create a DataFrame from .jsonl|.json|.pckl file.

    Required Parameters
    -------------------
    filename: str, path to input data file

    Keyword Parameters
    ------------------
    cid: str (default 'cid'), name of the column used for conversation identifier
    stringify: bool (default True), whether to force the <cid> column to be strings

    Return
    ------
    df: pandas.DataFrame object, data loaded in as a pandas dataframe
    """
    if filename.endswith('.jsonl'):
        df = []
        with jsonlines.open(filename, mode='r') as reader:
            for j in reader:
                df.append(j)
        df = pd.DataFrame(df)
    elif filename.endswith('.json'):
        df = pd.read_json(filename)
    elif filename.endswith('.pckl'):
        df = pd.read_pickle(filename)
    else:
        raise TypeError('Unrecognized file extension, supported are .jsonl|.json|.pckl')

    if stringify:
        df[cid] = df[cid].astype(str)
    df.sort_values(cid, inplace=True)
    return df


def save_file(df, folder, savefile, meta_cols=['cid', 'sid'], src_col=None, tgt_col=None):
    """Saving dataframe object to .meta|.source|.target files.

    Required Parameters
    -------------------
    filename: str, path to input data file

    Keyword Parameters
    ------------------
    cid: str (default 'cid'), name of the column used for conversation identifier
    stringify: bool (default True), whether to force the <cid> column to be strings

    Return
    ------
    df: pandas.DataFrame object, data loaded in as a pandas dataframe
    """
    if os.path.exists(folder):
        print(f"Warning! {folder}/ already exists, files with identical names will be overwritten")
    else:
        os.makedirs(folder)
    savefile = os.path.join(folder, savefile)
    df[meta_cols].to_csv(savefile+'.meta', sep='\t', index=True, header=True)
    print('saving meta file to {}'.format(savefile+'.meta'))
    if src_col is not None:
        with open(savefile+'.source', 'w') as writer:
            writer.write('\n'.join(df[src_col]))
        print('saving source file to {}'.format(savefile+'.source'))
    if tgt_col is not None:
        with open(savefile+'.target', 'w') as writer:
            writer.write('\n'.join(df[tgt_col]))
        print('saving target file to {}'.format(savefile+'.target'))

In [None]:
def generate_chunk_data_stage1(filename, exp='', savefolder='../experiments/',
                               save=True, process_fn=None,
                               header_len=128, body_len=384, body_overlap=0.333,
                               **kwds):
    """Generate necessary data files for Multistage training (Chunking method) - stage 1.

    Required Parameters
    -------------------
    filename: str, path to input data file

    Keyword Parameters
    ------------------
    exp: str (default ''), name of the experiment, used to create a separate folder under <savefolder> for storing all files related to current experiment.
    savefolder: str (default '../experiments/'), path to folder storing all experiments.
    save: bool (default True), whether to save the .meta|.source|.target files.
    process_fn: function handle (default None), additional data preprocessing functions. The function must take in and return both a pandas.DataFrame object.
    header_len: int (default 128), header component length in unit of words.
    body_len: int (default 384), body component length in unit of words.
    body_overlap: float (default 0.333), a floating value between 0 and 1, the percentage of overlap in unit of words between the body components of two adjacent chunks.
    **kwds: additional keyword parameters supported by process_fn().

    Return
    ------
    dfout: pandas.DataFrame object, dataframe object containing data after preprocessing for stage 1 chunking method.
    """
    df = read_data(filename)
    
    if process_fn is not None:
        df = process_fn(df, **kwds)
        
    snippets = []
    for (i, row) in df.iterrows():
        x = row['utterances']
        header_utterances, fragments = chunk_conversation(x, header_len, body_len, body_overlap)
        out_strs = serialize_conversation_fragments(header_utterances, fragments, utterance_separator_str=' ', header_fragment_separator_str='...', continuation_str='...')
        snippets.append(out_strs)
        # print(max([len(x.split()) for x in out_strs]))
    df['chunks'] = snippets
    dfout = df[['cid', 'sid', 'chunks', 'summary']].explode('chunks', ignore_index=True)
    dfout['summary'] = dfout['summary'].apply(bart_preprocessing)
    dfout['chunks'] = dfout['chunks'].apply(bart_preprocessing)

    if save:
        mode = get_mode(filename)
        folder = os.path.join(savefolder, exp)
        save_file(
            dfout, 
            folder,
            f'{mode}_stage1', 
            meta_cols=['cid', 'sid'], 
            src_col='chunks', 
            tgt_col='summary',
        )
        save_file(
            dfout, 
            folder,
            f'{mode}_stagex', 
            meta_cols=['cid', 'sid'], 
            src_col='chunks', 
            tgt_col='summary',
        )

    return dfout

In [8]:
from sentence_transformers import SentenceTransformer

from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Set maximum length of dialogue and percentage to retain
MAX_LEN = 30
RETAIN_PERCENT = 0.2

# Sample dialogue and summary
dialogue = "Person A: Hi, how are you doing? Person B: I'm good, thanks for asking. Person A: What have you been up to lately? Person B: Not much, just working and hanging out with friends. Person A: That sounds fun. Person B: Yeah, it is. What about you? Person A: I've been pretty busy with work. Person B: That's too bad."
summary = "Person A and Person B discussed what they had been up to lately. Person B had been working and hanging out with friends, while Person A had been busy with work."

# Tokenize dialogue into sentences
# dialogue_sents = sent_tokenize(dialogue)
embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Check if dialogue length exceeds maximum length
if len(dialogue) > MAX_LEN:
    # Truncate dialogue
    retained_sents = int(MAX_LEN * RETAIN_PERCENT)
    truncated_sents = dialogue_sents[:retained_sents]
    for i in range(1, len(dialogue_sents) - retained_sents):
        if i % retained_sents == 0:
            truncated_sents += dialogue_sents[i:i+retained_sents]
    truncated_sents += dialogue_sents[-retained_sents:]
else:
    truncated_sents = dialogue_sents
print(truncated_sents)
# Calculate semantic similarity between truncated dialogue and each summary sentence
summary_sents = sent_tokenize(summary)
similarity_scores = []
for truncated_sent in truncated_sents:
    truncated_vec = np.mean(embedding_model.encode(truncated_sent), axis=0).reshape(1,-1)
    for summary_sent in summary_sents:
        summary_vec = np.mean(embedding_model.encode(summary_sent), axis=0).reshape(1,-1)
        similarity_score = cosine_similarity(truncated_vec, summary_vec)[0][0]
        similarity_scores.append(similarity_score)

# Find the most similar summary sentence for each truncated dialogue sentence
most_similar_indices = []
for i in range(len(truncated_sents)):
    start_index = i * len(summary_sents)
    end_index = start_index + len(summary_sents)
    most_similar_indices.append(np.argmax(similarity_scores[start_index:end_index]) + i * len(summary_sents))

# Get corresponding summary sentences for each truncated dialogue sentence
corresponding_summary_sents = [summary_sents[i % len(summary_sents)] for i in most_similar_indices]

# Join truncated dialogue sentences and corresponding summary sentences
truncated_dialogue = ' '.join(truncated_sents)
corresponding_summary = ' '.join(corresponding_summary_sents)

# Print results
print("Original dialogue:\n", dialogue)
print("Truncated dialogue:\n", truncated_dialogue)
print("Original summary:\n", summary)
print("Corresponding summary:\n", corresponding_summary)

NameError: name 'dialogue_sents' is not defined

In [8]:
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer,util
import nltk
import pandas as pd
import os 
# 设定文本的最大长度
MAX_LEN = 512
# 设定每个截断后的部分都有20%的重合
OVERLAP = 0.2
os.environ["CUDA_VISIBLE_DEVICES"] = '2'

# 加载预训练的BERT模型
model = SentenceTransformer('sentence-transformers/nli-bert-large-cls-pooling')

In [27]:


def length_split(text, max_len, overlap):
    '''
    将文本截断成多个子部分，每个部分的长度为max_len，相邻两部分有overlap的重叠部分
    '''
    text_len = len(text)
    # 如果文本长度小于等于max_len，不需要截断
    if text_len <= max_len:
        return [text]
    # 计算需要截断成几个部分
    step = int(max_len * (1 - overlap))
    num_parts = int(np.ceil((text_len - max_len) / step)) + 1
    # 对每个部分进行截断
    parts = []
    for i in range(num_parts):
        start = i * step
        end = min(start + max_len, text_len)
        parts.append(text[start:end])
    return parts

def turn_split(text):
    # 根据回车进行了分句
    dialogue = text.split('\n')
    # 
    dialogue = [dia.strip() for dia in dialogue]
    return dialogue
def group_dialogues(dialogue_list,n=2):
    num_groups = len(dialogue_list) // n
    
    # 将列表中的元素按照n个为一组进行分组
    groups = [dialogue_list[i:i+n] for i in range(0, num_groups*n, n)]
    
    # 如果列表中的元素不能被n整除，则将剩余的元素作为一组
    if len(dialogue_list) % n != 0:
        groups.append(dialogue_list[num_groups*n:])
    # 将每一个组中的字符串拼接成一个字符串
    groups = [' '.join(group) for group in groups]
    
    return groups


def utterences_summary_alignment(dialogue_parts,summary):
    '''以句子为基准对齐摘要'''
    # dialogue_parts = split_text(dialogue, MAX_LEN, OVERLAP)
    dialogue_embeddings = model.encode(dialogue_parts)
    summary = nltk.sent_tokenize(summary)
    summary_embeddings = model.encode(summary)
    # print(dialogue_embeddings)
    # print(summary_embeddings)
    cosine_scores = util.pytorch_cos_sim(dialogue_embeddings, summary_embeddings)
    # print(cosine_scores)
    summary_sentences  = []
    for i, dialog_sentence in enumerate(dialogue_parts):
        max_index = np.argmax(cosine_scores[i])
        summary_sentence = summary[max_index]
        summary_sentences.append(summary_sentence)
        
        print("Dialog sentence: ", dialog_sentence)
        print("Summary sentence: ", summary_sentence)
        
    return summary_sentences

def align_dialogue_with_summary(dialogue, summary_sentences):
    '''以摘要为基准对句子
    这里导入的dialogue应该先切分更好,是一个list，list里面是一个个string'''
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    # dialogue = turn_split(dialogue)
    dialogue_vectors = model.encode(dialogue, convert_to_tensor=True)
    summary_vectors = model.encode(summary_sentences, convert_to_tensor=True)
    similarity_matrix = cosine_similarity(summary_vectors, dialogue_vectors)
    summary_sentences_to_snippets=[]
    for i in range(similarity_matrix.shape[0]):
        match_indices = np.where(similarity_matrix[i] >= 0.7)[0]
        if match_indices:

            segments = group_dialogue(match_indices, dialogue)
        else:
            segments= ''
        summary_sentences_to_snippets[i] = segments
    return summary_sentences_to_snippets

def group_dialogue(indices, dialogue):
    group = []
    segment = []
    for i, idx in enumerate(indices):
        segment.append(dialogue[idx])
    # 返回的是一个句子string
    group.append(' '.join(segment))
    return group



def split_then_alignment(dialogue, summary, split="length", MAX_LEN=512, OVERLAP=0.2):
    if split == "length":
        dialogue_parts = length_split(dialogue,MAX_LEN,OVERLAP)
    elif split == "turn":
        dialogue_parts = turn_split(dialogue)
        dialogue_parts = group_dialogues(dialogue_parts)
    summary_sentences = utterences_summary_alignment(dialogue_parts,summary=summary)
    # TODO 应该写一下按utterences来对齐还是按summary的sentence来对齐
    return dialogue_parts, summary_sentences


SyntaxError: invalid syntax (583759428.py, line 57)

In [26]:
group_dialogues(turn_split(dataset_split['dialogue'][0]),n=3)


[['Doctor: What brings you back into the clinic today, miss?',
  'Patient: I came in for a refill of my blood pressure medicine.',
  'Doctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues?'],
 ['Patient: No.',
  'Doctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?',
  'Patient: No.'],
 ['Doctor: Great. Also, for our records, how old are you and what race do you identify yourself as?',
  'Patient: I am seventy six years old and identify as a white female.']]

In [19]:
dataset_split = pd.read_csv("./MEDIQA-Chat-Training-ValidationSets-Feb-10-2023/TaskA/TaskA-TrainingSet.csv")
summary=dataset_split['section_text'][0]
text = dataset_split['dialogue'][0]
utterences_summary_alighment(summary=summary,dialogue=text,MAX_LEN=256,OVERLAP=0.1)

Dialog sentence:  Doctor: What brings you back into the clinic today, miss? 
Patient: I came in for a refill of my blood pressure medicine. 
Doctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypoth
Summary sentence:  The patient is a 76-year-old white female who presents to the clinic today originally for hypertension and a med check.
Dialog sentence:  itis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues?  
Patient: No. 
Doctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, ches
Summary sentence:  She has a history of hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.
Dialog sentence:  on, nausea, vomiting, chest pain, chest pressure?
Patient: No.  
Doctor: Great. Also, for our records, how old are you and what race do you identify yourself as