In [1]:
import pandas as pd
import json
from tqdm import tqdm
import re

In [2]:
# Define the categories and create mappings
categories = ["entity", "relation", "invented", "contradictory", "unverifiable", "subjective"]
label2id = {f'I-{category}': idx+1 for idx, category in enumerate(categories)}
label2id["O"] = 0
label2id['ignore'] = -100
id2label = {v: k for k, v in label2id.items()}
print(label2id)
print(id2label)

{'I-entity': 1, 'I-relation': 2, 'I-invented': 3, 'I-contradictory': 4, 'I-unverifiable': 5, 'I-subjective': 6, 'O': 0, 'ignore': -100}
{1: 'I-entity', 2: 'I-relation', 3: 'I-invented', 4: 'I-contradictory', 5: 'I-unverifiable', 6: 'I-subjective', 0: 'O', -100: 'ignore'}


In [3]:
import json
import re
import jieba
from janome.tokenizer import Tokenizer as JapaneseTokenizer
from pythainlp.tokenize import word_tokenize


def process_messages(data):
    try:
        # Extracting the language
        system_content = data['messages'][2]['content']
        language = system_content.split()[7]

        # Extracting references
        user_content = data['messages'][0]['content']
        if '$$$' in user_content:
            references_split = user_content.split('$$$')[1:-1]
        elif '$ $ $' in user_content:
            references_split = user_content.split('$ $ $')[1:-1]
        else:
            references_split = []
        references = " ".join(references_split)

        # Extracting text without labels from assistant content
        assistant_content = data['messages'][1]['content']
        text = re.sub(r'<[^>]*>', '', assistant_content)

        # Getting the assistant content as it is
        tagged_sequences = assistant_content

        return references, text, tagged_sequences, language

    except Exception as e:
        print(f"Error processing messages: {e}")
        return None, None, None, None

def find_start_index(tokens, tagged_tokens):
    for i in range(len(tokens) - len(tagged_tokens) + 1):
        if tokens[i:i+len(tagged_tokens)] == tagged_tokens:
            return i
    return -1

def bio_annotation(text, tagged_sequences, language):
    if language in ['chinese', 'cantonese']:
        tokens = list(jieba.cut(text))
    elif language == 'japanese':
        tokens = [token.surface for token in japanese_tokenizer.tokenize(text)]
    elif language =='thai':
        tokens = word_tokenize(text, engine='newmm') 
    else:
        tokens = text.split()
    
    tags = ['O'] * len(tokens)
    
    # Define the categories
    categories = ["entity", "relation", "invented", "contradictory", "unverifiable", "subjective"]
    
    # Process each tagged sequence
    for category in categories:
        pattern = f"<{category}>(.*?)</{category}>"
        matches = re.finditer(pattern, tagged_sequences, re.DOTALL)
        
        for match in matches:
            tagged_text = match.group(1).strip()
            if language in ['chinese', 'cantonese']:
                tagged_tokens = list(jieba.cut(tagged_text))
            elif language == 'japanese':
                tagged_tokens = [token.surface for token in japanese_tokenizer.tokenize(tagged_text)]
            elif language =='thai':
                tagged_tokens = word_tokenize(tagged_text, engine='newmm') 
            else:
                tagged_tokens = tagged_text.split()
            
            # Find the start index of the tagged sequence in the token list
            start_idx = find_start_index(tokens, tagged_tokens)
            
            if start_idx != -1:
                # Tag the tokens
                for i in range(len(tagged_tokens)):
                    if start_idx + i < len(tags):
                        tags[start_idx + i] = f'I-{category}'
    
    return tags

def concatenate_references_and_text(references, text, lang):
    if lang in ['chinese', 'japanese', 'cantonese']:
        return references + 'reservedspecialtoken' + text
    else:
        return references + ' <|reserved_special_token_200|> ' + text

def create_tag_input(input_text, references, text, tags, language):
    if language in ['chinese', 'cantonese']:
        input_tokens = list(jieba.cut(input_text))
        text_tokens = list(jieba.cut(text))
    elif language == 'japanese':
        input_tokens = [token.surface for token in japanese_tokenizer.tokenize(input_text)]
        text_tokens = [token.surface for token in japanese_tokenizer.tokenize(text)]
    elif language =='thai':
        input_tokens = word_tokenize(input_text, engine='newmm') 
        text_tokens = word_tokenize(text, engine='newmm') 
    else:
        input_tokens = input_text.split()
        text_tokens = text.split()
    
    if len(text_tokens) != len(tags):
        raise ValueError("The number of tokens in the text and tags do not match.")
    
    tags_for_input = ['ignore'] * len(input_tokens)
    
    if language == 'chinese':
        references_tokens = list(jieba.cut(references))
    elif language == 'japanese':
        references_tokens = [token.surface for token in japanese_tokenizer.tokenize(references)]
    elif language =='thai':
        references_tokens  = word_tokenize(references, engine='newmm') 
    else:
        references_tokens = references.split()
    
    text_start_index = len(references_tokens)
    
    for i, tag in enumerate(tags):
        if text_start_index + i < len(tags_for_input):
            tags_for_input[text_start_index + i] = tag
    
    return tags_for_input

def main(input_file, output_file):
    # Read the dataset
    print(f'Loading {output_file}')
    with open(input_file, 'r') as file:
        data = json.load(file)

    processed_data = []

    # Process each entry in the dataset
    for entry in tqdm(data,desc='Processing'):
        references, text, tagged_sequences, language = process_messages(entry)
        if references is not None and text is not None and tagged_sequences is not None and language is not None:
            tags = bio_annotation(text, tagged_sequences, language)
            input_text = concatenate_references_and_text(references, text, language)
            tag_input = create_tag_input(input_text, references, text, tags, language)
            processed_data.append({
                'references': references,
                'text': text,
                'tagged_sequences': tagged_sequences,
                'language': language,
                'tags': tags,
                'input': input_text,
                'tag_input': tag_input
            })

    # Save the processed data to a JSON file
    with open(output_file, 'w') as outfile:
        json.dump(processed_data, outfile, indent=4)

    print("Dataset has been saved to", output_file)

# Example usage
if __name__ == "__main__":
    train_file = "/p/project/westai0015/code_julich/hallucinations/datasets/all-ref/combined_train.json"
    test_file = "/p/project/westai0015/code_julich/hallucinations/datasets/all-ref/combined_test.json" 
    output_train_file = 'train_all_v1.json'
    output_test_file = 'test_all_v1.json'
    main(train_file, output_train_file)
    main(test_file, output_test_file)

Loading train_all_v1.json


Processing: 100%|██████████| 654066/654066 [01:32<00:00, 7094.86it/s] 


Dataset has been saved to train_all_v1.json
Loading test_all_v1.json


Processing: 100%|██████████| 8700/8700 [00:01<00:00, 7947.06it/s]


Dataset has been saved to test_all_v1.json


In [4]:
import json
import jieba
from tqdm import tqdm
from pythainlp.tokenize import word_tokenize
from janome.tokenizer import Tokenizer as JapaneseTokenizer
# Define the categories and create mappings
# Define the categories and create mappings
categories = ["entity", "relation", "invented", "contradictory", "unverifiable", "subjective"]
label2id = {f'I-{category}': idx+1 for idx, category in enumerate(categories)}
label2id["O"] = 0
label2id['ignore'] = -100
id2label = {v: k for k, v in label2id.items()}
print(label2id)
print(id2label)

def tags_to_ids(tags):
    return [label2id[tag] for tag in tags]

def custom_jieba_cut(text):
    # Ensure the special token is handled as a single token
    #text = text.replace(' <|reserved_special_token_200|> ', ' reservedspecialtoken ')
    tokens = list(jieba.cut(text))
    # Restore the special token to its original form
    tokens = ['<|reserved_special_token_200|>' if token == 'reservedspecialtoken' else token for token in tokens]
    return tokens

def create_training_data(entry, index):
    if entry['language'] in ['chinese', 'cantonese']:
        tokens = custom_jieba_cut(entry['input'])
    elif entry['language'] == 'thai':
        tokens  = word_tokenize(entry['input'], engine='newmm') 
    elif entry['language'] == 'japanese':
        tokens = [token.surface for token in japanese_tokenizer.tokenize(entry['input'])]
    else:
        tokens = entry['input'].split()
    
    tags = entry['tag_input']
    ner_tags = tags_to_ids(tags)
    
    return {
        'id': str(index),
        'tokens': tokens,
        'labels': ner_tags,
        'tags': tags,
        'language': entry['language']
    }

def convert_to_training_format(input_file, output_file):
    # Load the processed dataset
    print(f'Loading File {input_file}')
    with open(input_file, 'r') as infile:
        data = json.load(infile)

    valid_data = []
    invalid_indices = []
    all_languages = []

    for index, entry in tqdm(enumerate(data), desc='Processing'):
        try:
            all_languages.append(entry['language'])
            training_entry = create_training_data(entry, index)
            if len(training_entry['tokens']) == len(training_entry['labels']):
                valid_data.append(training_entry)
            else:
                print(f'Invalid sampled detected at index {index}')
                invalid_indices.append(index)
        except Exception as e:
            print(f"Error processing entry {index}: {e}")
            invalid_indices.append(index)

    # Save valid data to a new JSON file
    with open('all_languages.txt', 'w') as f:
        f.write(str(set(all_languages)))

    with open(output_file, 'w') as outfile:
        json.dump(valid_data, outfile, indent=4)

    # Save invalid indices to a separate file
    with open('invalid_indices.json', 'w') as f:
        json.dump(invalid_indices, f, indent=4)

    print("Training data converted and saved successfully.")
    print("Invalid indices saved to invalid_indices.json.")

# Example usage
if __name__ == "__main__":
    input_train_file = 'train_all_v1.json'
    input_test_file = 'test_all_v1.json'  # replace with your desired output file name
    output_train_file = 'train_all_final.json'
    output_test_file = 'test_all_final.json'
    convert_to_training_format(input_test_file, output_test_file)
    convert_to_training_format(input_train_file, output_train_file)
    

{'I-entity': 1, 'I-relation': 2, 'I-invented': 3, 'I-contradictory': 4, 'I-unverifiable': 5, 'I-subjective': 6, 'O': 0, 'ignore': -100}
{1: 'I-entity', 2: 'I-relation', 3: 'I-invented', 4: 'I-contradictory', 5: 'I-unverifiable', 6: 'I-subjective', 0: 'O', -100: 'ignore'}
Loading File test_all_v1.json


Processing: 8700it [00:00, 16841.73it/s]


Training data converted and saved successfully.
Invalid indices saved to invalid_indices.json.
Loading File train_all_v1.json


Processing: 654066it [01:01, 10696.10it/s]


Training data converted and saved successfully.
Invalid indices saved to invalid_indices.json.


In [5]:
def validate_training_data(input_file):
    # Load the training dataset
    with open(input_file, 'r') as infile:
        data = json.load(infile)
    
    invalid_entries = []

    for index, entry in enumerate(data):
        tokens = entry['tokens']
        ner_tags = entry['labels']
        
        if len(tokens) != len(ner_tags):
            invalid_entries.append({
                'id': entry['id'],
                'tokens_length': len(tokens),
                'labels_length': len(ner_tags)
            })
    print(ner_tags)
    if invalid_entries:
        print(f"Found {len(invalid_entries)} invalid entries with mismatched lengths of tokens and ner_tags.")
        # Save invalid entries to a JSON file for further inspection
        with open('invalid_entries.json', 'w') as outfile:
            json.dump(invalid_entries, outfile, indent=4)
        print("Invalid entries saved to invalid_entries.json.")
    else:
        print("All entries have matching lengths of tokens and ner_tags.")

# Example usage
if __name__ == "__main__":
    input_file = 'test_all_final.json'  # replace with your input file name
    validate_training_data(input_file)

[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10