In [None]:
import json
import pandas as pd
import os

# Paths to the .jsonl files; they are relative, so they resolve against the
# current working directory when opened
file_paths = ['test.jsonl', 'train.jsonl']

# Initialize a list to store the data: one {"text", "Chosen"} dict per response,
# converted into a DataFrame once every file has been read
data = []

# Function to clean text
def clean_text(text):
    """Trim surrounding newlines and turn each blank-line break into one newline.

    Args:
        text: Raw transcript string.

    Returns:
        The string with leading/trailing newline characters removed and every
        non-overlapping pair of consecutive newlines replaced by a single one.
    """
    trimmed = text.strip('\n')
    # Splitting on the double newline and re-joining with a single newline is
    # the same left-to-right, non-overlapping substitution as str.replace.
    return '\n'.join(trimmed.split('\n\n'))

# Load each line from each file, extract "chosen" and "rejected" text, and append to the list.
# For every record the chosen row is appended immediately before the rejected
# row, so the two responses of one record sit on consecutive rows of the frame.
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank line (for example trailing newline padding at the end of
            # the file) is not valid JSON; skip it instead of letting
            # json.loads raise and abort the whole load.
            if not line.strip():
                continue

            # Parse the JSON line into a Python dictionary
            json_object = json.loads(line)

            # Check and clean "chosen" field, if exists, and mark as chosen
            if 'chosen' in json_object:
                cleaned_chosen_text = clean_text(json_object['chosen'])
                data.append({"text": cleaned_chosen_text, "Chosen": True})

            # Check and clean "rejected" field, if exists, and mark as not chosen
            if 'rejected' in json_object:
                cleaned_rejected_text = clean_text(json_object['rejected'])
                data.append({"text": cleaned_rejected_text, "Chosen": False})

# Convert the list of dictionaries into a DataFrame; the explicit column list
# keeps both columns present even when no rows were loaded
df = pd.DataFrame(data, columns=["text", "Chosen"])

In [None]:
def aggregate_text_sections(text):
    """Collect all human text and all assistant text from one transcript.

    The transcript is cut at every "Human:" marker. Each resulting turn is then
    divided at its first "Assistant:" marker: what comes before is human text,
    what comes after is assistant text. Anything preceding the first "Human:"
    marker is ignored.

    Splitting only on the first "Assistant:" marker keeps the whole reply when
    the reply itself contains another "Assistant:" marker; indexing the second
    element of an unbounded split would silently drop that remainder.

    Args:
        text: Transcript string using "Human:" / "Assistant:" speaker markers.

    Returns:
        A ``(human_text, assistant_text)`` tuple. Each element is the
        whitespace-stripped pieces of that speaker joined by single spaces,
        or an empty string when the speaker never appears.
    """
    human_pieces = []
    assistant_pieces = []

    # Skip the first split as it's before the first "Human:"
    for turn in text.split("Human:")[1:]:
        human_part, marker, assistant_part = turn.partition("Assistant:")
        human_pieces.append(human_part.strip())

        # An empty marker means this turn has no assistant reply at all.
        if marker:
            assistant_pieces.append(assistant_part.strip())

    return " ".join(human_pieces).strip(), " ".join(assistant_pieces).strip()

# Apply the function to each row and create new columns.
# The results are gathered as a plain list of (human, assistant) tuples and
# turned into a frame once, instead of constructing a pd.Series for every row
# through df.apply(axis=1); the values and row alignment are the same.
df[['Human', 'Assistant']] = pd.DataFrame(
    [aggregate_text_sections(text) for text in df['text']],
    columns=['Human', 'Assistant'],
    index=df.index,  # keep row alignment with df during the column assignment
)

# Display the updated DataFrame structure
print(df.head())

In [None]:

def create_chosen_rejected_dataframe(df):
    """Pair consecutive rows into side-by-side chosen / rejected responses.

    Rows are consumed two at a time (0-1, 2-3, ...). A pair is kept only when
    both rows have identical 'Human' text; the 'Chosen' flag of the first row
    decides which 'Assistant' text goes to which output column. Pairs whose
    'Human' text differs are reported on stdout and left out of the result.

    Args:
        df: DataFrame with 'Human', 'Assistant' and 'Chosen' columns.

    Returns:
        A new DataFrame with columns 'Chosen' and 'Rejected', one row per
        matching pair. It is empty when there are no matching pairs.

    Notes:
        A trailing row without a partner (odd row count) is reported and
        ignored rather than raising IndexError, and an input with no complete
        pair reports 0.00% mismatches rather than dividing by zero.
    """
    # Pull the columns out once; per-row df.iloc lookups are slow and the
    # loop only ever needs these three values.
    human_col = df['Human'].tolist()
    assistant_col = df['Assistant'].tolist()
    chosen_col = df['Chosen'].tolist()

    # Initialize empty lists to store the chosen and rejected text
    chosen_text = []
    rejected_text = []

    # Initialize a counter for mismatched pairs
    mismatch_count = 0

    total_pairs = len(df) // 2
    if len(df) % 2:
        print(f"Ignoring unpaired trailing row at position {len(df) - 1}.")

    for pair_index in range(total_pairs):
        first = 2 * pair_index
        second = first + 1

        # Check if the 'Human' text matches for the current pair of rows
        if human_col[first] == human_col[second]:
            # The first row's flag alone decides the orientation of the pair.
            if chosen_col[first]:
                chosen_text.append(assistant_col[first])
                rejected_text.append(assistant_col[second])
            else:
                chosen_text.append(assistant_col[second])
                rejected_text.append(assistant_col[first])
        else:
            mismatch_count += 1
            print(f"Mismatch found at index {pair_index}:")
            print("Human text in chosen row:", human_col[first])
            print("Human text in rejected row:", human_col[second])
            print()

    # Create a new DataFrame with the chosen and rejected text
    result_df = pd.DataFrame({'Chosen': chosen_text, 'Rejected': rejected_text})

    # Calculate the percentage of mismatched pairs (0 when there are no pairs)
    mismatch_percentage = (mismatch_count / total_pairs) * 100 if total_pairs else 0.0

    print(f"Mismatched pairs: {mismatch_count} out of {total_pairs} ({mismatch_percentage:.2f}%)")

    return result_df

# Build the side-by-side Chosen/Rejected frame from 'df', which at this point
# has the columns ['text', 'Chosen', 'Human', 'Assistant']
AB_df = create_chosen_rejected_dataframe(df)
print(AB_df.head())

In [None]:
def find_unique_text(row):
    """Return the part of each response that follows their shared word prefix.

    Both texts are tokenised on whitespace and compared word by word from the
    start; the remaining words of each are re-joined with single spaces.

    Args:
        row: Mapping-like object with string values under 'Chosen' and
            'Rejected' (for example a DataFrame row).

    Returns:
        A pd.Series with 'C_Unique' and 'R_Unique' holding the words after the
        common prefix. Either may be an empty string.
    """
    chosen_words = row['Chosen'].split()
    rejected_words = row['Rejected'].split()

    # Count how many leading words the two responses have in common; zip stops
    # at the shorter list, just like bounding the index by the minimum length.
    shared = 0
    for chosen_word, rejected_word in zip(chosen_words, rejected_words):
        if chosen_word != rejected_word:
            break
        shared += 1

    return pd.Series({
        'C_Unique': ' '.join(chosen_words[shared:]),
        'R_Unique': ' '.join(rejected_words[shared:]),
    })

# 'AB_df' is the Chosen/Rejected DataFrame obtained from the previous step;
# add the word-level remainder of each response after their shared prefix
AB_df[['C_Unique', 'R_Unique']] =AB_df.apply(find_unique_text, axis=1)

print(AB_df.head())
# Set the output directory (relative to the current working directory)
output_dir = "output"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Set the initial output file path
output_path = os.path.join(output_dir, "1.csv")

# Add placeholder columns for 'C_Keywords' and 'R_Keywords'; every row gets '~'
# NOTE(review): presumably filled in by a later step not shown here - confirm
AB_df['C_Keywords'] = '~'
AB_df['R_Keywords'] = '~'

# Save the initial CSV file with placeholders (row index is not written)
AB_df.to_csv(output_path, index=False)

print(AB_df.head())