In [1]:
import sys
import pandas as pd
from tqdm import tqdm
from detoxify import Detoxify

sys.path.append("../")

In [2]:
# Read the caption/conversation dataset into a DataFrame; later cells read its
# 'id' and 'conversations' columns.
data_df = pd.read_json(path_or_buf="../dataset/blip_laion_cc_sbu_558k.json")

In [3]:
# Collect every GPT-authored turn as {'id', 'message'}, tagged with the id of
# the sample it belongs to. Zipping the two columns avoids materialising a
# Series for every row the way DataFrame.iterrows() does, and the comprehension
# keeps loop variables out of the notebook's global namespace.
gpt_messages = [
    {'id': sample_id, 'message': turn['value']}
    for sample_id, turns in zip(data_df['id'], data_df['conversations'])
    for turn in turns
    if turn['from'] == 'gpt'
]

In [4]:
# Cache of already-constructed Detoxify models, keyed by model name, so the
# model is built once per kernel session rather than once per batch.
_DETOXIFY_MODELS = {}


def _get_detoxify_model(model_name='original'):
    """Return a cached Detoxify model, constructing it on first use."""
    if model_name not in _DETOXIFY_MODELS:
        _DETOXIFY_MODELS[model_name] = Detoxify(model_name)
    return _DETOXIFY_MODELS[model_name]


def process_batch(batch, model=None):
    """
    Process a batch of records through Detoxify model and add toxicity score.

    Parameters:
    batch (list of dict): A list of dictionaries with 'id' and 'message'.
    model (optional): An object exposing ``predict(list_of_str)`` that returns a
        mapping with a 'toxicity' entry (one score per input). When omitted, a
        shared ``Detoxify('original')`` instance is created on first use and
        reused on every later call.

    Returns:
    list of dict: A list of dictionaries with 'id', 'message', and 'toxicity_score',
        in the same order as ``batch``. An empty batch yields an empty list.
    """
    # Nothing to score: skip model construction and inference entirely.
    if not batch:
        return []

    if model is None:
        # Constructing Detoxify inside every call rebuilt the model for each
        # batch; reuse one instance instead.
        model = _get_detoxify_model('original')

    messages = [entry['message'] for entry in batch]
    ids = [entry['id'] for entry in batch]

    # Get predictions for the whole batch in one call
    results = model.predict(messages)

    # Extract toxicity scores (one per message, in input order)
    toxicity_scores = results['toxicity']

    # Combine results with original data
    return [
        {'id': id_, 'message': message, 'toxicity_score': toxicity_score}
        for id_, message, toxicity_score in zip(ids, messages, toxicity_scores)
    ]

def process_dataset(data, batch_size=1000):
    """
    Process the entire dataset in batches with a progress bar and return a DataFrame with toxicity scores.

    Parameters:
    data (list of dict): A list of dictionaries representing the dataset.
    batch_size (int): Number of records to process in each batch. Must be positive.

    Returns:
    pd.DataFrame: A DataFrame with 'id', 'message', and 'toxicity_score' columns.
        The columns are present even when ``data`` is empty.

    Raises:
    ValueError: If ``batch_size`` is not greater than zero.
    """
    # A zero batch size used to fail with ZeroDivisionError and a negative one
    # silently produced an empty result; reject both up front.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_results = []
    total_batches = (len(data) + batch_size - 1) // batch_size  # Ceiling division

    # Process in batches with a progress bar
    for start in tqdm(range(0, len(data), batch_size), total=total_batches, desc="Detecting Toxicity"):
        batch = data[start:start + batch_size]
        all_results.extend(process_batch(batch))

    # Explicit columns keep the schema stable when there are no rows at all.
    return pd.DataFrame(all_results, columns=['id', 'message', 'toxicity_score'])

In [6]:
# Score every collected GPT message, 1000 records per batch
result_df = process_dataset(data=gpt_messages, batch_size=1000)

# Write the scored records out as line-delimited JSON (one record per line)
result_df.to_json(
    path_or_buf='../dataset/toxicity_detection.json',
    orient='records',
    lines=True,
)

Detecting Toxicity: 100%|██████████████████████████████████████████████████████████| 559/559 [7:57:20<00:00, 51.24s/it]
