In [18]:
import pandas as pd
import random

In [19]:
# Read the source question set from CSV into the working DataFrame `data`.
file_path = '/code/llm-fine-tuning/lek_training_sets/lek_three_quarters_random.csv'  # Update this with the actual file path
data = pd.read_csv(filepath_or_buffer=file_path)

In [20]:
# Option texts that indicate a row should not be augmented.
# check_keywords compares these to the 'D' and 'E' options with exact string
# equality, so every spelling/punctuation variant has to be listed explicitly.
# One entry per line, without repeats (the list previously contained
# "A, B, and C are true." twice).
keywords = [
    # "all ..." / "none ..." variants
    "all the above.",
    "all of the above.",
    "all of the above are true.",
    "none of the above",
    "none of the above.",
    "none of the above is correct.",
    # "answers ... are correct." variants
    "answers A and B are correct.",
    "answers A and C are correct.",
    "answers B and C are correct.",
    "answers B and D are correct.",
    "answers A, B and C are correct.",
    "answers A, B, C are correct.",
    "answers A,B,C are correct.",
    # "answers ... are true." variants
    "answers B and C are true.",
    "answers A, B and C are true.",
    # "<letters> are correct." variants
    "A,B are correct.",
    "A and D are correct.",
    # "<letters> are true." variants
    "A and B are true.",
    "A,B are true.",
    "A and C are true.",
    "B and C are true.",
    "A, B, C are true.",
    "A, B and C are true.",
    "A, B, and C are true.",
    "A, C and D are true.",
]

In [21]:
# Report whether a row's 'D' or 'E' option is exactly one of the keywords
def check_keywords(row):
    """Return True when row['D'] or row['E'] equals an entry of ``keywords``.

    The comparison is plain ``==`` against each entry of the module-level
    ``keywords`` list, so it is case-, whitespace- and punctuation-sensitive.
    Only the 'D' and 'E' options are inspected.
    """
    candidates = (row['D'], row['E'])
    return any(
        phrase == candidate
        for phrase in keywords
        for candidate in candidates
    )

# Evaluate check_keywords once per row and store the resulting flag as a
# new column; rows flagged True are skipped when augmenting.
exclusion_flags = data.apply(check_keywords, axis=1)
data['exclude_for_augmentation'] = exclusion_flags

# Function to augment a row
def augment_row(row):
    """Return a copy of ``row`` with options A-E shuffled and 'Answer' remapped.

    Parameters
    ----------
    row : pandas.Series or dict
        Must provide the keys 'A', 'B', 'C', 'D', 'E' (the option texts) and
        'Answer' (the letter of the correct option). Any object supporting
        item access and ``.copy()`` works.

    Returns
    -------
    Same type as ``row``
        A copy in which the five option values are reordered, 'Answer' holds
        the letter at which the originally correct option now sits, and
        'Data Type' is set to 'augmented'. The input object is left untouched.

    Raises
    ------
    ValueError
        If 'Answer' (after stripping whitespace and upper-casing) is not one
        of the option letters.
    """
    options = ['A', 'B', 'C', 'D', 'E']

    # Validate the answer up front so a malformed value fails with a clear
    # message instead of an IndexError/TypeError (or a silent wrong mapping).
    answer_letter = str(row['Answer']).strip().upper()
    if answer_letter not in options:
        raise ValueError(
            f"'Answer' must be one of {options}, got {row['Answer']!r}"
        )
    correct_position = options.index(answer_letter)

    original_options = [row[option] for option in options]

    # Shuffle positions rather than texts: the correct option is then tracked
    # by where it moved, which stays unambiguous even if two options share the
    # same text.
    identity = list(range(len(options)))
    permutation = random.sample(identity, len(identity))
    # Redraw if the shuffle left everything in place; otherwise the
    # "augmented" row would be an exact duplicate of the original.
    while permutation == identity:
        permutation = random.sample(identity, len(identity))

    # Work on a copy: the pandas documentation advises against mutating the
    # object handed to the function by DataFrame.apply.
    augmented = row.copy()
    for new_position, old_position in enumerate(permutation):
        augmented[options[new_position]] = original_options[old_position]
    augmented['Answer'] = options[permutation.index(correct_position)]
    augmented['Data Type'] = 'augmented'
    return augmented

# Produce one shuffled copy of every row that was not flagged for exclusion.
# NOTE(review): augment_row draws from the global `random` state and no seed is
# set in the visible cells, so the output differs from run to run.
eligible_rows = data.loc[~data['exclude_for_augmentation']]
augmented_rows = eligible_rows.apply(augment_row, axis=1)

# Label the source rows and expose the exclusion flag under its final name.
# NOTE(review): these columns are added after augmented_rows was built, so the
# augmented rows have no 'Not Used For Augmentation' value and receive NaN for
# it in the concatenation below - confirm that is intended.
data['Data Type'] = 'original'
data['Not Used For Augmentation'] = data['exclude_for_augmentation']

# Stack originals on top of the augmented copies with a fresh 0..n-1 index,
# then discard the helper flag column.
resulting_data = (
    pd.concat([data, augmented_rows], ignore_index=True)
    .drop(columns=['exclude_for_augmentation'])
)

# Write the combined dataset to disk without the index column.
augmented_file_path = '/code/llm-fine-tuning/lek_augmented_three_quarters.csv'  # Update this with your desired file path
resulting_data.to_csv(augmented_file_path, index=False)

print(f"Dataset augmented and saved to {augmented_file_path}.")

Dataset augmented and saved to /code/llm-fine-tuning/lek_augmented_three_quarters.csv.
