In [3]:
import os
os.chdir('/home/code/data')

In [None]:
# Select which model's tokenized data to preprocess: 'llama' or 'qwen'.
# The value is embedded in the log/input/output filenames below and decides
# which end-of-turn token generate_all_texts_label() searches for.
model_type= 'llama' #'llama','qwen'

## RagTruth

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import re
import gc
from tqdm import tqdm
import os
import logging
import sys
from datetime import datetime

# Timestamp of this run; embedded in the log filename below
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Per-run log file, named after the model type and the run timestamp
log_filename = f'process_preprocessing_{model_type}_{timestamp}.log'

# Send INFO-and-above records to both the log file and stdout
logging.basicConfig(
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Disable core dump generation (Unix/Linux only)
try:
    import resource
    resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
except (ImportError, AttributeError):
    # The resource module or RLIMIT_CORE is not available on this platform
    logging.warning("Could not disable coredump generation on this system.")
else:
    logging.info("Coredump generation has been disabled.")

# Reconstruct Text (Add Spaces Based on Tokens)
def reconstruct_text_with_spaces(tokens):
    """Join tokens into text, inserting a space before each 'Ġ'-marked token.

    The 'Ġ' marker is stripped from every token. The first token never gets a
    leading space. An empty token sequence yields an empty string.
    """
    if not tokens:
        return ''

    pieces = [tokens[0].replace('Ġ', '')]
    for tok in tokens[1:]:
        # A token carrying the marker starts a new space-separated chunk.
        separator = ' ' if 'Ġ' in tok else ''
        pieces.append(separator + tok.replace('Ġ', ''))
    return ''.join(pieces)

# Normalize Text (Remove Punctuation, Special Characters, and Extra Spaces)
def normalize_text(text):
    """Return text stripped of punctuation and with whitespace collapsed.

    Word characters, whitespace and hyphens are kept; every other character
    is removed. Runs of whitespace become a single space and the result is
    trimmed at both ends.
    """
    # Drop stray 'Â' characters. The second replace maps '°' to itself, so it
    # leaves the text unchanged.
    cleaned = text.replace('Â', '').replace('°', '°')
    without_punctuation = re.sub(r'[^\w\s\-]', '', cleaned)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Retrieve Hallucination Data and Token Positions
def get_hallucination_data(row):
    hallucinations = []
    hallucination_texts = []
    hallucination_tokens = []

    cleaned_tokens = [token.replace('Ġ', '') for token in row['output_tokens']]
    combined_text = ''.join(cleaned_tokens)

    for label in row['labels']:
        hallucination_text = label['text'].replace('\n', 'Ċ')
        search_text = hallucination_text.replace(' ', '')
        position = combined_text.find(search_text)

        if position != -1:
            current_pos = 0
            start_token = end_token = None
            for idx, token in enumerate(cleaned_tokens):
                current_pos += len(token)
                if current_pos > position and start_token is None:
                    start_token = idx
                if current_pos >= position + len(search_text):
                    end_token = idx + 1
                    break
            if start_token is not None and end_token is not None:
                hallucinations.append({'start': start_token, 'end': end_token})
                hallucination_texts.append(hallucination_text)
                hallucination_tokens.append(row['output_tokens'][start_token:end_token])

    return hallucinations, hallucination_texts, hallucination_tokens

# Generate Hallucination Labels (1: Hallucination Token, 0: Others)
def generate_hallucination_labels(output_tokens, hallucinations):
    """Build a per-token 0/1 list marking tokens inside hallucination spans.

    Each entry of `hallucinations` is a dict with 'start' and 'end'
    (end exclusive). Positions outside the token range are ignored.
    """
    n_tokens = len(output_tokens)
    label_list = [0] * n_tokens
    for span in hallucinations:
        # Clip the span to the valid index range before marking it.
        first = max(span['start'], 0)
        stop = min(span['end'], n_tokens)
        if first < stop:
            label_list[first:stop] = [1] * (stop - first)
    return label_list

# Create all_texts_hal_token_label Column
def generate_all_texts_label(output_tokens, hal_token_label, task_name):
    """Build the label list for the full sequence, masking non-answer tokens.

    The answer is taken to start 7 tokens after the marker token ('Overview'
    for the data2txt task, 'output' otherwise) that is immediately followed
    by ':', and to end at the model's end-of-turn token ('<|eot_id|>' for
    llama, '<|im_end|>' for qwen, chosen by the module-level `model_type`).

    Parameters:
    - output_tokens (list[str]): Token strings of the whole sequence.
    - hal_token_label (list[int]): Per-token 0/1 hallucination labels.
    - task_name (str): Task name; compared case-insensitively to 'data2txt'.

    Returns:
    - list[int] | None: -1 for every token before the answer, the 0/1 labels
      of the answer tokens, then a single trailing -1. None when the marker
      pair or the end-of-turn token cannot be found.

    Raises:
    - ValueError: If `model_type` is neither 'llama' nor 'qwen'.
    """
    if model_type == 'llama':
        end_token = '<|eot_id|>'
    elif model_type == 'qwen':
        end_token = '<|im_end|>'
    else:
        # Previously this fell through and failed later on an unbound variable.
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    if task_name.lower() == 'data2txt':
        search_tokens = ['Overview', ':']
    else:
        search_tokens = ['output', ':']

    # Require the marker to be followed by ':' so the start offset agrees with
    # generate_answer_start_token_index(); a bare marker token that appears
    # earlier in the sequence must not be used as the anchor.
    start_index = None
    for i in range(len(output_tokens) - 1):
        if output_tokens[i] == search_tokens[0] and output_tokens[i + 1] == search_tokens[1]:
            # '+7' skips the 7 tokens that follow the marker token.
            start_index = i + 7
            break
    if start_index is None:
        return None

    try:
        end_index = output_tokens.index(end_token, start_index)
    except ValueError:
        return None

    prefix = [-1] * start_index
    hal_token_part = hal_token_label[start_index:end_index]
    final_label = -1
    return prefix + hal_token_part + [final_label]

# Get Token Index of Answer Start Position
def generate_answer_start_token_index(output_tokens, task_name):
    """Return the token index where the answer starts, or None if not found.

    Scans for the first marker token ('Overview' for the data2txt task,
    'output' otherwise) that is immediately followed by ':' and returns the
    marker's index plus 7.
    """
    marker = 'Overview' if task_name.lower() == 'data2txt' else 'output'

    adjacent_pairs = zip(output_tokens, output_tokens[1:])
    for position, (current, following) in enumerate(adjacent_pairs):
        if current == marker and following == ':':
            return position + 7
    return None

# Process Data, Reconstruct Text, and Generate Labels
def process_data(row, task_name):
    """Derive word-level and token-level hallucination annotations for one row.

    Parameters:
    - row: DataFrame row providing 'output_tokens' and 'labels'.
    - task_name (str): Task name, used to locate the start of the answer.

    Returns:
    - pd.Series with the entries 'reconstructed_text', 'token_to_word_map',
      'hal_tokens', 'hal_texts', 'hal_tokens_text', 'hal_words',
      'hal_token_label', 'all_texts_hal_token_label',
      'answer_start_token_index' and 'answer_start_text'. Every entry is None
      when the start of the answer cannot be located.

    Notes:
    - 'token_to_word_map' maps each token index to the index of its word in
      the space-split 'reconstructed_text'; tokens before the answer map
      to -1.
    - 'hal_words' holds {'start', 'end'} word spans (end exclusive) derived
      from the token spans through that map.
    """
    output_columns = (
        'reconstructed_text', 'token_to_word_map', 'hal_tokens', 'hal_texts',
        'hal_tokens_text', 'hal_words', 'hal_token_label',
        'all_texts_hal_token_label', 'answer_start_token_index',
        'answer_start_text',
    )
    tokens = row['output_tokens']

    start_index = generate_answer_start_token_index(tokens, task_name)
    if start_index is None:
        return pd.Series(dict.fromkeys(output_columns))

    # Group the answer tokens into words: a token containing 'Ġ' opens a new
    # word, any other token extends the current one.
    words = []
    token_to_word_map = {}
    current_word = ""
    for idx in range(start_index, len(tokens)):
        token = tokens[idx]
        if 'Ġ' in token:
            if current_word:
                words.append(current_word)
            current_word = token.replace('Ġ', '')
        else:
            current_word += token
        # len(words) is the position the word under construction will take in
        # `words`. Using it keeps the map aligned with `reconstructed_text`
        # even when the first answer token has no 'Ġ' (a separate counter
        # that only advanced on 'Ġ' left that first word at -1 and shifted
        # every later word by one).
        token_to_word_map[idx] = len(words)

    if current_word:
        words.append(current_word)

    # Tokens that precede the answer belong to no word.
    for idx in range(start_index):
        token_to_word_map[idx] = -1

    reconstructed_text = ' '.join(words)

    # Token-level spans and labels come from the shared helpers so this
    # function cannot drift from them.
    hallucinations, hallucination_texts, hallucination_tokens = get_hallucination_data(row)
    hallucinations_word_indices = [
        {
            'start': token_to_word_map[hal['start']],
            'end': token_to_word_map[hal['end'] - 1] + 1,
        }
        for hal in hallucinations
    ]
    hal_token_label = generate_hallucination_labels(tokens, hallucinations)
    all_texts_hal_token_label = generate_all_texts_label(tokens, hal_token_label, task_name)

    return pd.Series({
        'reconstructed_text': reconstructed_text,
        'token_to_word_map': token_to_word_map,
        'hal_tokens': hallucinations,
        'hal_texts': hallucination_texts,
        'hal_tokens_text': hallucination_tokens,
        'hal_words': hallucinations_word_indices,
        'hal_token_label': hal_token_label,
        'all_texts_hal_token_label': all_texts_hal_token_label,
        'answer_start_token_index': start_index,
        'answer_start_text': reconstruct_text_with_spaces(tokens[start_index:]),
    })

#  Split DataFrame into Training and Validation Sets Based on source_id
def split_train_dev_by_source_id(train_dev_df, val_size=75, seed=42):
    """
    Splits a DataFrame into training and validation sets based on unique `source_id`.

    All rows sharing a `source_id` end up in the same set. The selection uses
    a local random generator seeded with `seed`, so the split is reproducible
    and the global NumPy random state is left untouched.

    Parameters:
    - train_dev_df (pd.DataFrame): The DataFrame to be split; must have a `source_id` column.
    - val_size (int): The number of unique `source_id` for the validation set
      (capped at the number of unique `source_id` available).
    - seed (int): Random seed for reproducibility.

    Returns:
    - train_df (pd.DataFrame): The training set DataFrame.
    - val_df (pd.DataFrame): The validation set DataFrame.
    """
    unique_source_ids = train_dev_df['source_id'].unique()

    # Ensure val_size does not exceed the number of unique source_id
    val_size = min(val_size, len(unique_source_ids))

    # RandomState(seed) draws the same sequence as np.random.seed(seed)
    # followed by the module-level functions, without reseeding the global RNG.
    rng = np.random.RandomState(seed)
    dev_source_ids = rng.choice(unique_source_ids, size=val_size, replace=False)

    # Compute the membership mask once and use it for both halves.
    in_validation = train_dev_df['source_id'].isin(dev_source_ids)
    val_df = train_dev_df[in_validation]
    train_df = train_dev_df[~in_validation]

    return train_df, val_df



# Main Processing Function (Supports Multiple Tasks)
def main_processing(df, task_name, dataset_type, sentence_dir='/home/code/data/saves'):
    """Label one split of one task, write span-report CSVs, and pickle the result.

    Steps:
    1. Locate hallucination spans and build per-token labels for every row.
    2. Add the derived columns produced by process_data().
    3. For rows containing a hallucination, write a CSV listing each span and
       whether the extracted token text matches the annotated text, plus a
       second CSV with only the mismatches.
    4. Pickle the frame: 'train_dev' is split into train/validation by
       `source_id`; 'test' is saved as-is.

    Parameters:
    - df (pd.DataFrame): Rows to process. Columns read here: 'output_tokens',
      'labels', 'split', and (for 'train_dev') 'source_id'. The frame is
      modified in place (derived columns are added).
    - task_name (str): Task name; used for marker lookup and in filenames.
    - dataset_type (str): 'train_dev' or 'test'. Any other value is logged as
      an error and no pickle is written.
    - sentence_dir (str): Output directory for the pickles. The CSV reports go
      to its 'csv' subdirectory.

    Returns:
    - None
    """
    if df.empty:
        # Nothing to label; the row-wise column assignments below expect at
        # least one row.
        logging.warning(f"No rows in {dataset_type} data for the {task_name} task. Skipping...")
        return

    logging.info(f"Retrieving hallucination data from {dataset_type} data for the {task_name} task...")
    # Retrieve Hallucination Data
    df[['hal_tokens', 'hal_texts', 'hal_tokens_text']] = df.apply(lambda row: pd.Series(get_hallucination_data(row)), axis=1)

    # Free Memory
    gc.collect()

    logging.info(f"Generating hallucination labels from {dataset_type} data for the {task_name} task...")
    # Generate hal_token_label
    df['hal_token_label'] = df.apply(lambda row: generate_hallucination_labels(row['output_tokens'], row['hal_tokens']), axis=1)

    # Free Memory
    gc.collect()

    # Generate all_texts_hal_token_label
    df['all_texts_hal_token_label'] = df.apply(lambda row: generate_all_texts_label(row['output_tokens'], row['hal_token_label'], task_name), axis=1)

    # Free Memory
    gc.collect()

    # Filter Rows Where Hallucination Occurs. This snapshot is taken before
    # process_data() overwrites the hal_* columns, so the report below is
    # built from the values computed above.
    hallucination_rows = df[df['hal_token_label'].apply(lambda x: 1 in x if x is not None else False)]

    logging.info(f"Number of rows with hallucination in {dataset_type} data for the {task_name} task: {hallucination_rows.shape[0]}")

    logging.info(f"Processing {dataset_type} data for the {task_name} task...")

    # Data Processing
    df[['reconstructed_text', 'token_to_word_map', 'hal_tokens', 'hal_texts',
        'hal_tokens_text', 'hal_words', 'hal_token_label',
        'all_texts_hal_token_label', 'answer_start_token_index',
        'answer_start_text']] = df.apply(lambda row: process_data(row, task_name), axis=1)

    # Free Memory
    gc.collect()

    logging.info(f"Saving hallucination results to a CSV file for {dataset_type} data in the {task_name} task...")
    # Generate hallucination_results.csv and hallucination_results_false.csv
    output_data = []
    for index, row in hallucination_rows.iterrows():
        for hal_text, hal_tokens_text, hal_tokens_indices in zip(row['hal_texts'], row['hal_tokens_text'], row['hal_tokens']):
            start = hal_tokens_indices['start']
            end = hal_tokens_indices['end']
            span_tokens = row['output_tokens'][start:end]
            extracted_tokens_text = reconstruct_text_with_spaces(span_tokens)

            # Substring containment already covers exact equality, so one
            # check (with each side normalized once) is enough.
            is_match = normalize_text(hal_text) in normalize_text(extracted_tokens_text)

            output_data.append({
                'Row': index,
                'Hallucination Tokens (Indices)': f"{start}-{end}",
                'Hallucination Text': hal_text,
                'Extracted Tokens Text': extracted_tokens_text,
                'Extracted Hallucination Tokens': span_tokens,
                'Text Match': is_match
            })

    if output_data:
        # Save DataFrame
        output_df = pd.DataFrame(output_data)
        # Keep the reports under the caller-supplied output directory; with
        # the default sentence_dir this is '/home/code/data/saves/csv'.
        csv_dir = os.path.join(sentence_dir, 'csv')
        os.makedirs(csv_dir, exist_ok=True)
        output_csv_path = f"{csv_dir}/{model_type}_{task_name}_{dataset_type}_hallucination_results.csv"
        false_csv_path = f"{csv_dir}/{model_type}_{task_name}_{dataset_type}_hallucination_results_false.csv"
        output_df.to_csv(output_csv_path, index=False)
        false_match_df = output_df[~output_df['Text Match']]
        false_match_df.to_csv(false_csv_path, index=False)

        # Free Memory
        del output_data, output_df, false_match_df
        gc.collect()

        logging.info(f"Hallucination results for {dataset_type} data in the {task_name} task have been saved to a CSV file.")
        logging.info(f"All results have been saved to {output_csv_path}, and only False results have been saved to {false_csv_path}.")
    else:
        logging.info(f"No rows containing hallucination were found in {dataset_type} data for the {task_name} task.")

    # Save DataFrame (Pickle Format)
    os.makedirs(sentence_dir, exist_ok=True)
    if dataset_type == 'train_dev':
        train_df, val_df = split_train_dev_by_source_id(df[df['split'] == 'train'], val_size=75, seed=42)
        train_pickle_path = f"{sentence_dir}/{model_type}_{task_name}_train.pkl"
        val_pickle_path = f"{sentence_dir}/{model_type}_{task_name}_val.pkl"
        train_df.to_pickle(train_pickle_path)
        val_df.to_pickle(val_pickle_path)
        logging.info(f"Train and validation data for the {task_name} task have been saved in Pickle format.")
        del train_df, val_df
    elif dataset_type == 'test':
        test_df = df[df['split'] == 'test']
        test_pickle_path = f"{sentence_dir}/{model_type}_{task_name}_test.pkl"
        test_df.to_pickle(test_pickle_path)
        logging.info(f"Test data for the {task_name} task have been saved in Pickle format.")
        del test_df
    else:
        # Nothing was pickled, so do not fall through to the success message.
        logging.error(f"Unknown dataset_type: {dataset_type}")
        return

    # Free Memory
    gc.collect()

    logging.info(f"{dataset_type} data for the {task_name} task have been saved in Pickle format.")

# List the Tasks to be Processed
tasks = ['summary', 'qa', 'data2txt']

# Map each task name to the value used in the 'task_type' column
task_type_mapping = {
    'summary': 'Summary',
    'qa': 'QA',
    'data2txt': 'Data2txt'
}

for task in tasks:
    task_name = task.lower()
    task_type = task_type_mapping.get(task_name, None)

    if task_type is None:
        logging.warning(f"Unknown task_name: {task_name}. Skipping...")
        continue

    # Load and Preprocess Data
    data_file = f'/home/code/data/saves/{model_type}_{task_name}.pkl'
    if not os.path.exists(data_file):
        logging.error(f"Data file does not exist: {data_file}")
        continue

    logging.info(f"Loading data: {data_file}")
    try:
        df = pd.read_pickle(data_file)
        logging.info("Data loading completed.")
    except Exception as e:
        logging.error(f"An error occurred while loading data: {e}")
        continue

    # Make sure the 'prompt' column is present
    if 'prompt' not in df.columns:
        logging.error(f"The DataFrame does not contain a 'prompt' column. File: {data_file}")
        continue

    # Create a label: 1 if at least one hallucination is present, 0 otherwise
    df['label'] = df['labels'].apply(lambda x: 1 if len(x) > 0 else 0)

    # Verify that data has been loaded
    logging.info(f"DataFrame columns: {df.columns}")

    # Filter based on the 'task_type' column
    task_df = df[df['task_type'] == task_type].copy()
    if task_df.empty:
        logging.warning(f"No data found for the {task_name} task.")
        continue

    # Split into train_dev_df and test_df based on the 'split' column
    logging.info(f"Splitting {task_name} task data into train_dev and test...")
    train_dev_df = task_df[task_df['split'] == 'train'].copy()
    test_df = task_df[task_df['split'] == 'test'].copy()
    logging.info(f"Number of train_dev samples: {train_dev_df.shape[0]}, Number of test samples: {test_df.shape[0]}")

    # Free memory: both splits are independent copies from here on
    del df, task_df
    gc.collect()

    logging.info(f"\nStarting processing for the {task_name} task...")

    # Process train_dev data. The frame is already a private copy and is
    # discarded right afterwards, so it is passed without copying again.
    logging.info(f"Processing train_dev data for the {task_name} task...")
    main_processing(train_dev_df, task_name, 'train_dev')

    # Free memory
    del train_dev_df
    gc.collect()

    # Process test data
    logging.info(f"Processing test data for the {task_name} task...")
    main_processing(test_df, task_name, 'test')

    # Free memory
    del test_df
    gc.collect()

    logging.info(f"{task_name} task data has been saved in Pickle format.")

# Logged once, after the loop over all tasks has finished
logging.info("\nProcessing for all tasks has been completed.")



2024-12-09 02:54:55,799 INFO: Coredump generation has been disabled.
2024-12-09 02:54:55,804 INFO: データを読み込んでいます: /home/code/vishnu/pkl/1206_ragtruth_summary.pkl
