# Data Generation

Each training example needs to contain three roles: system, user, and assistant. The system message tells the large language model how it should behave. The user message is the prompt that the user passes in, and the assistant message is the expected response. Below is an example of the data that is passed to training.

{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}

In [9]:
import pandas as pd
import csv

def collect_average_bias_question_for_each_article():
    """Collect every bias answer per article and collapse them to one label.

    Reads data/data_prompt_w_articles_info_only.csv, groups the
    "bias-question" answers by the value of the "id" column, prints the raw
    per-article answer lists, and then lets determine_bias() replace each
    list with a single label.

    Answers are keyed by the id actually stored in each row. The previous
    implementation used a running counter, which only matched the real id
    when ids were sorted, contiguous and started at 0.

    Returns:
        dict mapping each article id to the label chosen by determine_bias().
    """
    original_prompts = pd.read_csv("data/data_prompt_w_articles_info_only.csv")
    article_info_bias_question = {}
    # setdefault keeps articles in first-seen order and tolerates gaps or
    # repeated (non-adjacent) ids in the file.
    for article_id, answer in zip(original_prompts["id"], original_prompts["bias-question"]):
        article_info_bias_question.setdefault(article_id, []).append(answer)
    print("article_info", article_info_bias_question)
    # Mutates the dict in place: each answer list becomes a single label.
    determine_bias(article_info_bias_question)
    return article_info_bias_question

def determine_bias(article_info_bias_question):
    """Replace each article's answer list with its majority label, in place.

    A strict majority of "is-biased" or "is-not-biased" answers wins; a tie
    (including an empty list) is recorded as "unknown". Nothing is returned.
    """
    for key in list(article_info_bias_question):
        answers = article_info_bias_question[key]
        # Positive margin: more "is-biased" votes; negative: more "is-not-biased".
        margin = answers.count("is-biased") - answers.count("is-not-biased")
        if margin > 0:
            verdict = "is-biased"
        elif margin < 0:
            verdict = "is-not-biased"
        else:
            verdict = "unknown"
        article_info_bias_question[key] = verdict

# Compute one label per article, then store the mapping as a two-column CSV
# that the article-only experiment reads back later.
bias_question_for_article_only_prompt = collect_average_bias_question_for_each_article()

with open("data/bias_question_for_article_only_experiment.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["article_id", "bias_question"])
    # One row per article: its id followed by its label.
    writer.writerows(
        [article_id, bias_question]
        for article_id, bias_question in bias_question_for_article_only_prompt.items()
    )
        


article_info {0: ['is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased'], 1: ['is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-not-biased', 'is-biased', 'is-biased', 'is-not-biased'], 2: ['is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased', 'is-biased'], 3: ['is-biased', 'is-biased', 'is-biased', 'is-not-biased', 'is-not-biased', 'is-not-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-not-biased', 'is-biased', 'is-not-biased', 'is-biased'], 4: ['is-biased', 'is-biased'], 5

In [79]:
from sklearn.model_selection import train_test_split
import json
import os

def generate_prompt(system_content, article_detail, expected_result):
    """Build one chat-format training record.

    The record holds three messages in order: the system instruction, the
    user message (article text, a newline, then a fixed question), and the
    assistant message carrying the expected answer.

    Returns:
        dict of the form {"messages": [system, user, assistant]}.
    """
    user_question = "Determine if this article biased based on the user point of view?"
    # (role, content) pairs in the order they must appear in the record.
    turns = (
        ("system", system_content),
        ("user", article_detail + "\n" + user_question),
        ("assistant", expected_result),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

def generate_prompts(system_content, prompts, expected_results, output_path):
    """Write one chat-format training record per (prompt, label) pair as JSONL.

    Args:
        system_content: system instruction shared by every record.
        prompts: sequence (list or pandas Series) of article prompt texts.
        expected_results: sequence of expected answers, positionally aligned
            with ``prompts``.
        output_path: path of the JSONL file to (over)write.

    Raises:
        ValueError: if ``prompts`` and ``expected_results`` differ in length,
            since writing a partially paired file would silently corrupt the
            training data.
    """
    if len(prompts) != len(expected_results):
        raise ValueError(
            f"prompts ({len(prompts)}) and expected_results "
            f"({len(expected_results)}) must have the same length"
        )

    # zip pairs items by position, so a pandas Series with a non-default
    # index is handled the same way as a plain list.
    messages = [
        generate_prompt(system_content, article_detail, expected_result)
        for article_detail, expected_result in zip(prompts, expected_results)
    ]

    # Write messages to a JSONL file (one JSON object per line)
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        for message in messages:
            jsonl_file.write(json.dumps(message) + '\n')


def collecting_test_and_train_set(file_path, experiment_name):
    """Split a CSV into train/test parts and save them for an experiment.

    Reads ``file_path``, holds out 20% of the rows as the test set, and
    writes train_set.csv and test_set.csv into
    data/gpt_prompts/<experiment_name>, creating that folder if needed.
    """
    frame = pd.read_csv(file_path)

    # A fixed random_state makes the split the same on every run.
    train_part, test_part = train_test_split(frame, test_size=0.2, random_state=42)

    parent_directory = "/".join(("data/gpt_prompts", experiment_name))
    os.makedirs(parent_directory, exist_ok=True)

    for part, file_name in ((train_part, "train_set.csv"), (test_part, "test_set.csv")):
        part.to_csv(parent_directory + "/" + file_name, index=False)
    
def collecting_prompts_through_article_id(ids, experiment_name):
    """Look up one prompt per article id, in the same order as ``ids``.

    Reads data/prompts.csv and, for every id in ``ids``, returns the value of
    the ``experiment_name`` column from the first row whose "article_id"
    matches.

    The result is aligned with ``ids`` and ``ids`` itself is left untouched.
    The previous implementation sorted the caller's list in place and
    returned prompts in sorted-id order, so callers that paired the prompts
    positionally with labels taken from an unsorted frame got mismatched
    pairs. It also stopped matching at the first missing id.

    Args:
        ids: article ids to look up.
        experiment_name: name of the prompt column to read from prompts.csv.

    Returns:
        list of prompts, one per entry of ``ids``.

    Raises:
        KeyError: if an id has no row in data/prompts.csv.
    """
    data = pd.read_csv("data/prompts.csv")

    # Keep only the first row of each article, as the row-by-row scan did.
    first_rows = data.drop_duplicates(subset="article_id", keep="first")
    prompt_by_article = dict(zip(first_rows["article_id"], first_rows[experiment_name]))

    missing = [article_id for article_id in ids if article_id not in prompt_by_article]
    if missing:
        raise KeyError(f"article ids not found in data/prompts.csv: {missing}")

    return [prompt_by_article[article_id] for article_id in ids]

In [80]:
# generate prompts for article only experiment
def generate_prompts_for_article_only_experiment():
    """Create the train/test split and JSONL prompt files for the article-only experiment.

    Splits data/bias_question_for_article_only_experiment.csv, looks up the
    "prompt_article_info" prompt of every article in each split, and writes
    the chat records to train_set.jsonl / test_set.jsonl.

    Articles whose label is "unknown" are written as "is-not-biased". The
    replacement label uses hyphens so that it matches the two answers named
    in the system prompt; it was previously written as "is_not_biased",
    which added a third, unintended label to the data.
    """
    system_content = "You are going to be the reader of a political article. Your job is to determine whether the article is biased based on reader point of view. You answer either is-biased or is-not-biased, with no explanation. An article is-biased in its presentation of the topic, meaning that it ever exaggerates, misrepresents, omits, or otherwise distorts facts (including by making subjective opinions look like facts) for the purpose of appealing to a certain political group.\n"

    collecting_test_and_train_set("data/bias_question_for_article_only_experiment.csv", "article_only_experiment")
    train_df = pd.read_csv("data/gpt_prompts/article_only_experiment/train_set.csv")
    test_df = pd.read_csv("data/gpt_prompts/article_only_experiment/test_set.csv")

    # NOTE(review): prompts and labels are paired by position below, so the
    # lookup helper must return prompts in the same order as the ids it is
    # given (the row order of train_df / test_df) - verify.
    train_prompts = collecting_prompts_through_article_id(list(train_df["article_id"]), "prompt_article_info")
    test_prompts = collecting_prompts_through_article_id(list(test_df["article_id"]), "prompt_article_info")

    train_labels = train_df["bias_question"].replace('unknown', 'is-not-biased')
    test_labels = test_df["bias_question"].replace('unknown', 'is-not-biased')
    generate_prompts(system_content, train_prompts, train_labels, "data/gpt_prompts/article_only_experiment/train_set.jsonl")
    generate_prompts(system_content, test_prompts, test_labels, "data/gpt_prompts/article_only_experiment/test_set.jsonl")

generate_prompts_for_article_only_experiment()

In [70]:
# copy bias - question to prompts.csv
import pandas as pd

df = pd.read_csv("data/prompts.csv")
original_data = pd.read_csv("data/data_articles_info.csv", usecols=['id', 'bias-question'])

# This cell overwrites prompts.csv, so on a re-run the file already has a
# 'bias-question' column. Drop the stale copy first; otherwise the merge
# would emit 'bias-question_x' / 'bias-question_y' and later cells that
# read 'bias-question' would fail.
df = df.drop(columns=['bias-question'], errors='ignore')

# validate='one_to_one' makes pandas raise if 'id' is duplicated on either side.
merged_data = pd.merge(df, original_data, on='id', how='left', validate='one_to_one')


merged_data.to_csv("data/prompts.csv", index=False)


In [81]:
# Generate prompts for the article-with-politics-info experiment
def generate_prompts_for_article_w_politics_info_experiment():
    """Create the train/test split and JSONL prompt files for the politics-info experiment.

    Splits data/prompts.csv, then for each split pairs the
    "prompt_politics_info" column with the "bias-question" column and writes
    the resulting chat records as JSONL next to the split CSVs.
    """
    system_content = "You are going to be the reader of a political article. Your job is to determine whether the article is biased based on the Reader identifies politics group. You answer either is-biased or is-not-biased, with no explanation. An article is-biased in its presentation of the topic, meaning that it ever exaggerates, misrepresents, omits, or otherwise distorts facts (including by making subjective opinions look like facts) for the purpose of appealing to a certain political group.\n"

    experiment = "article_w_politics_info_experiment"
    base_dir = "data/gpt_prompts/" + experiment
    collecting_test_and_train_set("data/prompts.csv", experiment)

    # Load both splits first, then write one JSONL file per split (train, then test).
    splits = {name: pd.read_csv(f"{base_dir}/{name}.csv") for name in ("train_set", "test_set")}
    for name, split_df in splits.items():
        generate_prompts(
            system_content,
            split_df["prompt_politics_info"],
            split_df["bias-question"],
            f"{base_dir}/{name}.jsonl",
        )

generate_prompts_for_article_w_politics_info_experiment()

In [82]:
# Generate prompts for the article-with-source-info experiment
def generate_prompts_for_article_w_letter_source_info_experiment():
    """Create the train/test split and JSONL prompt files for the source-info experiment.

    Splits data/prompts.csv, then for each split pairs the
    "prompt_letter_source_info" column with the "bias-question" column and
    writes the resulting chat records as JSONL next to the split CSVs.
    """
    system_content = "You are going to be the reader of a political article. Your job is to determine whether the article is biased. And take source of the article into consideration to provide answer. You answer either is-biased or is-not-biased, with no explanation. An article is-biased in its presentation of the topic, meaning that it ever exaggerates, misrepresents, omits, or otherwise distorts facts (including by making subjective opinions look like facts) for the purpose of appealing to a certain political group.\n"

    experiment = "article_w_source_info_experiment"
    base_dir = "data/gpt_prompts/" + experiment
    collecting_test_and_train_set("data/prompts.csv", experiment)

    # Load both splits first, then write one JSONL file per split (train, then test).
    splits = {name: pd.read_csv(f"{base_dir}/{name}.csv") for name in ("train_set", "test_set")}
    for name, split_df in splits.items():
        generate_prompts(
            system_content,
            split_df["prompt_letter_source_info"],
            split_df["bias-question"],
            f"{base_dir}/{name}.jsonl",
        )

generate_prompts_for_article_w_letter_source_info_experiment()

In [83]:
# Generate prompts for the article-with-all-info experiment

def generate_prompts_for_article_w_all_info_experiment():
    """Create the train/test split and JSONL prompt files for the all-info experiment.

    Splits data/prompts.csv, then for each split pairs the "prompt_all_info"
    column with the "bias-question" column and writes the resulting chat
    records as JSONL next to the split CSVs.
    """
    system_content = "You are going to be the reader of a political article. Your job is to determine whether the article is biased. And take all the anticipants information into consideration to provide answer, which include Reader Demographics information, Readers' politics group and article source. You answer either is-biased or is-not-biased, with no explanation. An article is-biased in its presentation of the topic, meaning that it ever exaggerates, misrepresents, omits, or otherwise distorts facts (including by making subjective opinions look like facts) for the purpose of appealing to a certain political group.\n"

    experiment = "article_w_all_info_experiment"
    base_dir = "data/gpt_prompts/" + experiment
    collecting_test_and_train_set("data/prompts.csv", experiment)

    # Load both splits first, then write one JSONL file per split (train, then test).
    splits = {name: pd.read_csv(f"{base_dir}/{name}.csv") for name in ("train_set", "test_set")}
    for name, split_df in splits.items():
        generate_prompts(
            system_content,
            split_df["prompt_all_info"],
            split_df["bias-question"],
            f"{base_dir}/{name}.jsonl",
        )

generate_prompts_for_article_w_all_info_experiment()

# Fine tuning

In [84]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# load_dotenv() is expected to populate the process environment from a local
# .env file (python-dotenv); the API key is then read from OPENAI_API_KEY.
# `client` is the shared OpenAI client used by every cell below.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

## Article Only Experiment

### Uploading File

In [85]:
# Upload the article-only training file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (it was previously left open).
with open("data/gpt_prompts/article_only_experiment/train_set.jsonl", "rb") as training_file:
    article_only_experiment_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )

### Fine-tune model 

In [86]:
# Start a fine-tuning job on gpt-3.5-turbo using the uploaded article-only
# training file; `job` is the job object returned by the API.
job = client.fine_tuning.jobs.create(
    training_file = article_only_experiment_file.id,
    model = "gpt-3.5-turbo"
)

In [105]:
# Keep the job id so the job can be looked up again in a later cell.
job_id = job.id

### Once your model is trained, run the next cell to grab the fine-tuned model name.

In [107]:
# Fetch the current state of the article-only fine-tuning job and read the
# name of the resulting model from it.
# NOTE(review): fine_tuned_model is presumably empty until the job has
# finished - confirm against the OpenAI fine-tuning docs before relying on it.
article_only_experiment_model_pre_object = client.fine_tuning.jobs.retrieve(job_id)
article_only_experiment_model_name = article_only_experiment_model_pre_object.fine_tuned_model
print(article_only_experiment_model_name)

ft:gpt-3.5-turbo-0125:lmucs-keck-faculty::9PtCLgRv


### Collecting data

In [111]:
# Ask the fine-tuned article-only model for a verdict on every test prompt
file_path = "data/gpt_prompts/article_only_experiment/test_set.jsonl"
bias_questions = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is one JSON record whose first value is the message list
        data = json.loads(line.strip())
        messages_with_answer = next(iter(data.values()))
        # Send only the first two messages (system + user); the third one
        # holds the expected answer and must not be shown to the model
        messages_without_answer = messages_with_answer[:2]

        completion = client.chat.completions.create(
            model=article_only_experiment_model_name,
            messages=messages_without_answer,
        )

        bias_questions.append(completion.choices[0].message.content)

# Store the answers next to the test rows in a result CSV
test_file_path = "data/gpt_prompts/article_only_experiment/test_set.csv"
parent_directory = 'result_data/fine_tuning_gpt/article_only_experiment'
result_file_path = f"{parent_directory}/result.csv"
# Make sure the output folder exists before writing
os.makedirs(parent_directory, exist_ok=True)
df = pd.read_csv(test_file_path)
# Answers are attached by position: one answer per test row, in file order
df["gpt_3.5_turbo_result"] = bias_questions
df.to_csv(result_file_path, index=False)

## Article with Politics Info Experiment

### Uploading training file

In [118]:
# Upload the politics-info training file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (it was previously left open).
with open("data/gpt_prompts/article_w_politics_info_experiment/train_set.jsonl", "rb") as training_file:
    article_w_politics_info_experiment_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )

### Fine-tune model 

In [119]:
# Start a fine-tuning job on gpt-3.5-turbo using the uploaded politics-info
# training file.
job2 = client.fine_tuning.jobs.create(
    training_file = article_w_politics_info_experiment_file.id,
    model = "gpt-3.5-turbo"
)

# Keep the job id so the job can be looked up again in a later cell.
job2_id = job2.id

### Once your model is trained, run the next cell to grab the fine-tuned model name.

In [None]:
# Fetch the current state of the politics-info fine-tuning job and read the
# name of the resulting model from it. The whole job object is printed here
# (not just the model name).
# NOTE(review): fine_tuned_model is presumably empty until the job has
# finished - confirm against the OpenAI fine-tuning docs before relying on it.
article_w_politics_info_experiment_model_pre_object = client.fine_tuning.jobs.retrieve(job2_id)
article_w_politics_info_experiment_model_name = article_w_politics_info_experiment_model_pre_object.fine_tuned_model
print(article_w_politics_info_experiment_model_pre_object)

### Collecting data

In [None]:
# Loop through the test set and collect the fine-tuned model's answers
file_path = "data/gpt_prompts/article_w_politics_info_experiment/test_set.jsonl"
bias_questions = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse the JSON object
        data = json.loads(line.strip())
        messages_with_answer = data["messages"]
        # Send only the system and user messages; the third message holds
        # the expected answer and must not be shown to the model
        messages_without_answer = messages_with_answer[:2]

        # Query the model fine-tuned for THIS experiment. The cell previously
        # queried article_only_experiment_model_name, so the politics-info
        # results were produced by the article-only model.
        completion = client.chat.completions.create(
            model=article_w_politics_info_experiment_model_name,
            messages=messages_without_answer,
        )

        bias_questions.append(completion.choices[0].message.content)

In [None]:
# Store the model's answers next to the politics-info test rows
test_file_path = "data/gpt_prompts/article_w_politics_info_experiment/test_set.csv"
parent_directory = 'result_data/fine_tuning_gpt/article_w_politics_info_experiment'
result_file_path = f"{parent_directory}/result.csv"
# Make sure the output folder exists before writing
os.makedirs(parent_directory, exist_ok=True)
# Only the identifying columns and the reference label are kept
df = pd.read_csv(test_file_path, usecols=["article_id", "id", "bias-question"])
# Answers are attached by position: one answer per test row, in file order
df["gpt_3.5_turbo_result"] = bias_questions
df.to_csv(result_file_path, index=False)

## Article with Source Info Experiment

### Uploading training file

In [None]:
# Upload the source-info training file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (it was previously left open).
with open("data/gpt_prompts/article_w_source_info_experiment/train_set.jsonl", "rb") as training_file:
    article_w_source_info_experiment_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )

### Fine-tune model

In [None]:
# Start a fine-tuning job on gpt-3.5-turbo using the uploaded source-info
# training file.
job3 = client.fine_tuning.jobs.create(
    training_file = article_w_source_info_experiment_file.id,
    model = "gpt-3.5-turbo"
)

# Keep the job id so the job can be looked up again in a later cell.
job3_id = job3.id

In [None]:
# Fetch the current state of the source-info fine-tuning job and read the
# name of the resulting model from it. The whole job object is printed here
# (not just the model name).
# NOTE(review): fine_tuned_model is presumably empty until the job has
# finished - confirm against the OpenAI fine-tuning docs before relying on it.
article_w_source_info_experiment_model_pre_object = client.fine_tuning.jobs.retrieve(job3_id)
article_w_source_info_experiment_model_name = article_w_source_info_experiment_model_pre_object.fine_tuned_model
print(article_w_source_info_experiment_model_pre_object)

### Collecting test result

In [None]:
# Loop through the test set and collect the fine-tuned model's answers
file_path = "data/gpt_prompts/article_w_source_info_experiment/test_set.jsonl"
bias_questions = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse the JSON object
        data = json.loads(line.strip())
        messages_with_answer = data["messages"]
        # Send only the system and user messages; the third message holds
        # the expected answer and must not be shown to the model
        messages_without_answer = messages_with_answer[:2]

        # Query the model fine-tuned for THIS experiment. The cell previously
        # queried article_only_experiment_model_name, so the source-info
        # results were produced by the article-only model.
        completion = client.chat.completions.create(
            model=article_w_source_info_experiment_model_name,
            messages=messages_without_answer,
        )

        bias_questions.append(completion.choices[0].message.content)

In [None]:
# Store the model's answers next to the source-info test rows
test_file_path = "data/gpt_prompts/article_w_source_info_experiment/test_set.csv"
parent_directory = 'result_data/fine_tuning_gpt/article_w_source_info_experiment'
result_file_path = f"{parent_directory}/result.csv"
# Make sure the output folder exists before writing
os.makedirs(parent_directory, exist_ok=True)
# Only the identifying columns and the reference label are kept
df = pd.read_csv(test_file_path, usecols=["article_id", "id", "bias-question"])
# Answers are attached by position: one answer per test row, in file order
df["gpt_3.5_turbo_result"] = bias_questions
df.to_csv(result_file_path, index=False)

## Article with All Info Experiment

### Uploading training file

In [None]:
# Upload the all-info training file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (it was previously left open).
with open("data/gpt_prompts/article_w_all_info_experiment/train_set.jsonl", "rb") as training_file:
    article_w_all_info_experiment_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )

### Fine-tune model

In [None]:
# Start a fine-tuning job on gpt-3.5-turbo using the uploaded all-info
# training file.
# NOTE(review): this reuses the names job3 / job3_id that the source-info
# section also assigns, so running this cell overwrites the handle to that
# earlier job. Re-running the source-info retrieve cell afterwards would
# fetch this job instead - consider distinct names (e.g. job4 / job4_id).
job3 = client.fine_tuning.jobs.create(
    training_file = article_w_all_info_experiment_file.id,
    model = "gpt-3.5-turbo"
)

# Keep the job id so the job can be looked up again in a later cell.
job3_id = job3.id

In [None]:
# Fetch the current state of the fine-tuning job stored in job3_id and read
# the name of the resulting model from it. The whole job object is printed
# here (not just the model name).
# NOTE(review): job3_id is also assigned by the source-info section; make
# sure the all-info job cell was the last one to set it before running this.
# NOTE(review): fine_tuned_model is presumably empty until the job has
# finished - confirm against the OpenAI fine-tuning docs before relying on it.
article_w_all_info_experiment_model_pre_object = client.fine_tuning.jobs.retrieve(job3_id)
article_w_all_info_experiment_model_name = article_w_all_info_experiment_model_pre_object.fine_tuned_model
print(article_w_all_info_experiment_model_pre_object)

### Collecting data

In [None]:
# Loop through the test set and collect the fine-tuned model's answers
file_path = "data/gpt_prompts/article_w_all_info_experiment/test_set.jsonl"
bias_questions = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Parse the JSON object
        data = json.loads(line.strip())
        messages_with_answer = data["messages"]
        # Send only the system and user messages; the third message holds
        # the expected answer and must not be shown to the model
        messages_without_answer = messages_with_answer[:2]

        # Query the model fine-tuned for THIS experiment. The cell previously
        # queried article_only_experiment_model_name, so the all-info
        # results were produced by the article-only model.
        completion = client.chat.completions.create(
            model=article_w_all_info_experiment_model_name,
            messages=messages_without_answer,
        )

        bias_questions.append(completion.choices[0].message.content)

In [None]:
# Store the model's answers next to the all-info test rows
test_file_path = "data/gpt_prompts/article_w_all_info_experiment/test_set.csv"
parent_directory = 'result_data/fine_tuning_gpt/article_w_all_info_experiment'
result_file_path = f"{parent_directory}/result.csv"
# Make sure the output folder exists before writing
os.makedirs(parent_directory, exist_ok=True)
# Only the identifying columns and the reference label are kept
df = pd.read_csv(test_file_path, usecols=["article_id", "id", "bias-question"])
# Answers are attached by position: one answer per test row, in file order
df["gpt_3.5_turbo_result"] = bias_questions
df.to_csv(result_file_path, index=False)