# Split the dataset

In [None]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from openai import OpenAI
import os

In [None]:
# Load the dataset from a tab-separated file. Column names are supplied via
# `names`, and lines pandas cannot parse are skipped (on_bad_lines='skip').
try:
    data = pd.read_csv(
        'data/biased.full',
        sep='\t',
        quotechar='"',
        names=["id", "src_tok", "tgt_tok", "src_raw", "tgt_raw", "src_POS_tags", "tgt_parse_tags"],
        on_bad_lines='skip'
    )
except Exception as e:
    print(f"An error occurred: {e}")
    # Everything below needs `data`; re-raise so the real cause is reported
    # instead of a follow-up NameError on the next statement.
    raise

# Sample 2000 random records from the data (fixed seed for reproducibility)
sampled_data = data.sample(n=2000, random_state=42)

# Split sampled data into two halves, one per label
data_with_label_1 = sampled_data.iloc[:1000]
data_with_label_0 = sampled_data.iloc[1000:]

# Label 1: the raw source sentence paired with its raw target sentence
df_label_1 = pd.DataFrame({
    'label': 1,
    'source_text': data_with_label_1['src_raw'],
    'target_text': data_with_label_1['tgt_raw']
})

# Label 0: the raw target sentence is used as the source text, and
# target_text is left empty. The filler list is sized from the slice itself
# so it always matches the number of rows.
df_label_0 = pd.DataFrame({
    'label': 0,
    'source_text': data_with_label_0['tgt_raw'],
    'target_text': [''] * len(data_with_label_0)
})

# Concatenate both DataFrames and shuffle them
final_df = pd.concat([df_label_1, df_label_0])
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final sampled data to a CSV file
final_df.to_csv('data/sampled_data.csv', index=False)

# Split data into training (60%) and testing (40%) datasets
train_data, test_data = train_test_split(final_df, test_size=0.4, random_state=42)

# Reset the index of each split and add a 1-based 'Sample Index' column
train_data = train_data.reset_index(drop=True)
train_data.insert(0, 'Sample Index', range(1, len(train_data) + 1))
test_data = test_data.reset_index(drop=True)
test_data.insert(0, 'Sample Index', range(1, len(test_data) + 1))

# Save the training and testing data to CSV files
train_data.to_csv('data/train_data.csv', index=False)
test_data.to_csv('data/test_data.csv', index=False)

# Function to format data for chat-completion JSON lines
def format_chat_completion(source_text, label, target_text=None):
    '''
    Builds the chat-completion message list for one example.

    Parameters:
    source_text (str): The text sent as the user message.
    label (int): 1 marks the source text as subjectively biased; any other
        value is treated as "no detectable bias".
    target_text (str, optional): The neutralized version of the source text.
        Only used when label is 1 and the value is a non-empty string, so
        missing values (None, '' or a NaN from a DataFrame cell) are ignored.

    Returns:
    list: Three message dicts (system, user, assistant), each with "role"
        and "content" keys.
    '''
    # NOTE(review): the second quoted reply in this prompt is missing its
    # closing single quote. The string is kept byte-for-byte so that newly
    # generated data stays consistent with data already produced from it.
    response = [
        {"role": "system", "content": "You are an assistant trained to identify and neutralize subjective bias. If you detect subjective bias, respond with 'This is subjective bias text.' and provide the neutralized text. If no subjective bias is detected, respond with 'This text does not contain detectable subjective bias."},
        {"role": "user", "content": source_text},
    ]
    if label == 1:
        assistant_response = "This is subjective bias text."
        # A float NaN is truthy, so a plain truthiness check would append
        # "nan" as the neutralized text; require an actual non-empty string.
        if isinstance(target_text, str) and target_text:
            assistant_response += f" The neutralized text is: {target_text}"
    else:
        assistant_response = "This text does not contain detectable subjective bias."
    response.append({"role": "assistant", "content": assistant_response})
    return response

# Function to write JSON lines to a file
def write_to_jsonl(data, file_name):
    '''
    Writes one chat-completion JSON object per row of data to file_name.

    Parameters:
    data (DataFrame): Rows providing 'source_text' and 'label'; 'target_text'
        is read when present and passed as None otherwise.
    file_name (str): Path of the JSONL file to create or overwrite.
    '''
    def _as_json_line(record):
        # Wrap the formatted messages in the {"messages": [...]} envelope
        # and terminate the serialized object with a newline.
        messages = format_chat_completion(
            record['source_text'],
            record['label'],
            record.get('target_text', None),
        )
        return json.dumps({"messages": messages}) + '\n'

    with open(file_name, 'w') as sink:
        sink.writelines(_as_json_line(record) for _, record in data.iterrows())

# Export each split as a chat-formatted JSONL file
write_to_jsonl(data=train_data, file_name='data/fine_tuning_train_data.jsonl')
write_to_jsonl(data=test_data, file_name='data/fine_tuning_test_data.jsonl')

# Let the user know both files were written
print("JSONL files for training and testing have been created.")

# Fine-tuning the model

In [None]:
# Read the OpenAI API key from the environment instead of hard-coding it in
# the notebook, so the secret never ends up in source control or shared output.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment before "
        "running this cell; do not paste the key into the notebook."
    )

# Initialize the OpenAI client. No api_key argument is passed, so the client
# falls back to the OPENAI_API_KEY environment variable.
client = OpenAI()

# Uncomment the following code block to upload your training data file.
# It's recommended to add error handling and check the upload status.
# try:
#     with open("data/fine_tuning_train_data.jsonl", "rb") as training_file:
#         file_response = client.files.create(
#           file=training_file,
#           purpose="fine-tune"
#         )
#     print("File uploaded successfully. File ID:", file_response.id)
# except Exception as e:
#     print(f"Failed to upload file: {e}")

# Create a fine-tuning job with the specified model and training file ID.
# API errors are caught and reported here so the cell finishes with a readable
# message rather than a traceback.
try:
    fine_tuning_job = client.fine_tuning.jobs.create(
      training_file="file-#####",  # Replace with actual file ID from the upload response
      model="gpt-3.5-turbo"
    )
    print("Fine-tuning job created successfully. Job ID:", fine_tuning_job.id)
except Exception as e:
    print(f"Failed to create fine-tuning job: {e}")