## Import necessary libraries

In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.70.0
    Uninstalling openai-1.70.0:
      Successfully uninstalled openai-1.70.0
Successfully installed openai-0.28.0


In [None]:
# Import necessary libraries
import pandas as pd
import openai
import csv
import time
import concurrent.futures

# Explanation:
# - `pandas` is used for handling data in DataFrames.
# - `openai` is the OpenAI library for interacting with the API.
# - `csv` is used to write the output to a CSV file.
# - `time` helps us add delays to avoid hitting rate limits.

## Load Data from CSV

In [None]:
# Define the path to your input file and output file
input_file = 'redo_batch14.csv'  # Replace with your file path
output_file = 'output_batch14_redo.csv'  # Replace with your output file path

# Load the data from the CSV file
df = pd.read_csv(input_file)

# Display the first few rows to confirm loading
df.head()

# Explanation:
# This cell loads the data from a single CSV file.
# The file should have columns named 'original_headline', 'edited_headline', 'added_words', 'removed_words', and 'prompt'.
# Replace `your_input_file.csv` and `your_output_file.csv` with your actual file paths.


Unnamed: 0,input_text
0,000 | #Climate change: according to the Intern...
1,https https https https https https https http...
2,Revealing.
3,"From the @BOG_ELDORADO, the president @IvanDuq..."
4,W WTF W W W W W W W W W W W W W W W W W W W W ...


In [None]:
df = df.rename(columns={"input_text":"input"})
#df = df.drop(columns=["Unnamed: 0"])

## Define Function to Create Prompt Text

In [None]:
# Define the function to create the prompt column
def create_prompt(row):

    # Construct the prompt text using the template
    prompt = (
        "Given the following input, answer the following questions. "
        "Provide a brief explanation for your analysis.\n"
        f"Input: {row['input']}\n"
    )

    return prompt

# Generate the prompt column
df['prompt'] = df.apply(create_prompt, axis=1)

# Display the DataFrame with the new 'prompt' column
df[['input', 'prompt']].head()

# Explanation:
# This cell creates a new column in the DataFrame called 'prompt'.
# Each prompt is a combination of the original and edited headlines along with lists of added and removed words.


Unnamed: 0,input,prompt
0,000 | #Climate change: according to the Intern...,"Given the following input, answer the followin..."
1,https https https https https https https http...,"Given the following input, answer the followin..."
2,Revealing.,"Given the following input, answer the followin..."
3,"From the @BOG_ELDORADO, the president @IvanDuq...","Given the following input, answer the followin..."
4,W WTF W W W W W W W W W W W W W W W W W W W W ...,"Given the following input, answer the followin..."


## Set Up OpenAI API Key

In [None]:
# Set up OpenAI API key
openai.api_key = "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"  # Replace with your actual OpenAI API key

# Explanation:
# Replace 'your_api_key' with your OpenAI API key.
# This cell sets up the API key required to access the OpenAI API for generating responses.


## Set up the context

In [None]:
context = f'''You are a helpful assistant.
You will be provided texts of social media posts. For each post, there are four questions to be answered, pertaining to labelling the sentence based on the text.
Your goal will be to answer those questions, based on the specific instructions for each question.

Question 1:
Please identify if the post contains enough information to analyse. For instance, is there sufficient text and/or context in the post to decipher the intent of the post? The purpose of this question is to pick out posts that do not have enough information in them to be worth annotating.
Answer either YES or NO. Skip the rest of the questions if you choose NO.

Question 2:
Please identify the framing of the issue mentioned in the post, doing so while taking the perspective of the author of the post. You MUST choose at least one frame. Multiple frames may be selected if needed.
The frames and there definitions are as follows:
1. Economic: Financial Implications of an issue
2. Capacity & Resources: The availability or lack of time, physical, human, or financial resources
3. Fairness & Equality: The (in)equality with which laws, punishments, rewards, resources are distributed
4. Legality & Constitutionality: Court cases and existing laws that regulate policies; constitutional interpretation; legal processes such as seeking asylum or obtaining citizenship
5. Defence: Any external threat to a person, group, or nation and defenses taken to avoid that threat
6. Health, Safety and Security: Health, safety and security impacts of an issue; discussions relating to internal law-and-order or healthcare threats
7. Quality Of Life: Effects on people's weath, mobility, daily routines, community life, happiness, etc.
8. Environmental: Environmental impacts of an issue, discussions about effects on the natural environment
9. Cultural Identity: Social norms, trends, values, and customs; integration/assimilatioin efforts
10. Politics and Policy Issues: Focus on politics, politicians or political parties; discussions on existing policies and their effectiveness
11. External Regulation & Reputation: Relations between nations or states/provinces; agreements between governments; perceptions of one nation/state by another
12. Conspiracy: Any post that spreads false but non-politicised descriptive stories
Answer by noting down the index numbers of the frames that you choose.

Question 3:
Please categorise the post into one of these categories, based on what you think the post is trying to communicate.
1. Informational: The post tries to provide information. This tends to sound factual, with little room for subjectivity in the content of the text.
2. Opinion: The post tries to provide the opinion of the author. This often includes the use of pronouns.
3. Both: The post contains both informational and opinion frames.
Answer by noting down the index numbers of the category you choose


Question 4:
Please identify the message frame of the post (i.e., what is the purpose of the post). You MUST choose at least one frame. You may choose multiple frames as appropriate.
1. Identification: The message identifies a social or political problem
2. Blame: The message assigns blame for a societal problem
3. Solutions: The message proposes solutions for a societal problem
4. Tactics: The message discusses strategies or tactics for achieving an intended goal
5. Solidarity: The message expresses solidarity for a cause. This also includes providing evidence in support of a cause, opinion, or issue.
6. Counterframing: The message explicitly challenges arguments made by the opposing side
7. Motivational: The message convinces other readers to join in said cause through calls to actions.
Answer by noting down the index numbers of the frames that you choose.

Format your final collated response for each post as a comma-separated row, as follows:
[Answer for Question 1],[For each frame, put 1 if you chose it, otherwise put 0. This segment will have a total of 1's and 0's equivalent to the number of frames provided in the instructions of Question 2, with each value comma-separated.],[Answer for Question 3],[For each frame, put 1 if you chose it, otherwise put 0. This segment will have a total of 1's and 0's equivalent to the number of frames provided in the instructions of Question 4, with each value comma-separated.]

For example, a sample output will be as follows:

[YES,1,1,1,0,0,1,1,0,0,1,0,0,OPINION,1,0,1,0,0,0,0]

This output corresponds to
- Answering YES for Question 1.
- Answering the presence of frames 1,2,3,6,7,10 for Question 2
- Answering OPINION for Question 3
- Answering the presence of frames 1,3 for Question 4.

ALWAYS RESPOND IN THIS EXACT FORMAT'''

## Generate Responses and Write to Output CSV

In [None]:
# Function to process each prompt
def process_prompt(index, prompt, context, retries=5):
    attempt = 0
    while attempt < retries:
        try:
            # Generate the response from the OpenAI model
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": context},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000
            )

            # Extract the response content
            response_text = response['choices'][0]['message']['content']
            if (index + 1) % 100 == 0:
                print(f"Processing row {index + 1}")
            return prompt, response_text

        except Exception as e:
            attempt += 1
            print(f"Error on row {index + 1} (attempt {attempt}/{retries}): {e}")
            time.sleep(2 ** attempt)  # Exponential backoff

    return prompt, f"Error: Failed after {retries} retries"

# Open the output file and write header
with open(output_file, "a", newline="") as output_csv:
    csv_writer = csv.writer(output_csv)
    csv_writer.writerow(["prompt", "response"])  # Write header row

    # Create a ThreadPoolExecutor to process prompts in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Use map to process the prompts in parallel, passing index as an argument
        results = list(executor.map(process_prompt, range(len(df)), df['prompt'], [context] * len(df)))

        # Write the prompt and response to the output file
        for prompt, response_text in results:
            print([prompt, response_text])
            csv_writer.writerow([prompt, response_text])

        # Optional: You can add a delay here if you need to avoid rate limits in general (e.g., for long-running jobs)
        time.sleep(2)

Processing row 100
Processing row 200
Processing row 300
['Given the following input, answer the following questions. Provide a brief explanation for your analysis.\nInput: 000 | #Climate change: according to the International Energy Agency, the United States was the country that reduced its CO2 emissions the most in 2019, by -2.9%, despite the strong deregulation of the energy sector by Trump and having left the Paris Agreement. The Committee recommends that the State party take all necessary measures to ensure the full implementation of the present recommendations, inter alia, by transmitting them to the members of the Council of Ministers, the Parliament, the Parliamentary Assembly and the Senate, the Parliamentary Assembly of the Council of Ministers, the Parliamentary Assembly of the Council of Ministers, the Parliamentary Assembly of the Council of Ministers, the Parliamentary Assembly of the Council of Ministers, the Parliamentary Assembly of the Council of Ministers, the Parlia

## Read in output file

In [None]:
import pandas as pd

# Load the CSV file that contains the prompt-response pairs
output_file = 'onlyyes_cleaned.csv'  # Replace with your actual output file path
df_output = pd.read_csv(output_file)

# Display the first few rows to confirm loading
df_output.head()
# Explanation:
# This cell loads the CSV file with responses generated by the OpenAI API.
# Replace 'your_output_file.csv' with the path to your output file.
# We display the first few rows to confirm the data is loaded correctly.


Unnamed: 0,prompt,response,q1_response,economic,capacity,fairness,legality,defence,health,qualityoflife,...,conspiracy,q3_response,identification,blame,solutions,tactics,solidarity,counterframing,motivational,comment
0,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,1,0,0,OPINION,0,1,0,1,0...",YES,0,0,0,0,0,0,1,...,0,OPINION,0,1,0,1,0,0,0,Explanation:\nQuestion 1: YES. The post contai...
1,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,0,0,0,1,0,0,OPINION,0,1,0,0,0...",YES,0,0,0,0,0,0,0,...,0,OPINION,0,1,0,0,0,0,0,Explanation:\nQuestion 1: YES - The post conta...
2,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,0,OPINION,0,0,0,0,0...",YES,0,0,0,0,0,0,1,...,0,OPINION,0,0,0,0,0,0,0,Explanation: \n1. The post contains enough inf...
3,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,0,0,0,0,0,0,OPINION,1,1,0,0,0...",YES,0,0,0,0,0,0,0,...,0,OPINION,1,1,0,0,0,0,0,Explanation:\nQuestion 1: The post contains su...
4,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,0,INFO,0,0,0,0,0,0,...",YES,0,0,0,0,0,0,1,...,0,INFO,0,0,0,0,0,0,0,Explanation:\nQuestion 1: The post does provid...


## Types of bias

In [None]:
import pandas as pd
import re
import logging

# Set up logging for errors
logging.basicConfig(filename="extract_response_errors.log", level=logging.ERROR, format="%(asctime)s - %(levelname)s - %(message)s")

# Define column names for structured output
q2 = [
    'economic', 'capacity', 'fairness', 'legality', 'defence',
    'health', 'qualityoflife', 'environmental', 'cultural',
    'politics', 'external', 'conspiracy'
]

q4 = [
    'identification', 'blame', 'solutions', 'tactics',
    'solidarity', 'counterframing', 'motivational'
]

columns = ['q1_response'] + q2 + ['q3_response'] + q4 + ['comment']


In [None]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


## Extract information from output text

In [None]:
#import ace_tools as tools
# Function to extract structured data and separate comments
def extract_response(response, index):
    try:
        # Ensure response is a string
        if not isinstance(response, str):
            raise ValueError("Response is not a string")

        # Find the position of the closing bracket ']'
        match = re.search(r"\]", response)
        if not match:
            raise ValueError("No closing bracket found in response")

        end_bracket_pos = match.end()  # Get position after the closing bracket
        structured_part = response[:end_bracket_pos].strip()  # Extract structured response
        comment_part = response[end_bracket_pos:].strip()  # Extract comment

        # Remove brackets and split structured part
        structured_part = structured_part.lstrip("[").rstrip("]")
        response_list = structured_part.split(",")

        # Validate response length
        expected_length = 1 + len(q2) + 1 + len(q4)  # Q1 + Q2 frames + Q3 + Q4 frames
        if len(response_list) != expected_length:
            raise ValueError(f"Unexpected response length. Expected {expected_length}, got {len(response_list)}")

        # Extract and format values
        q1_response = response_list[0].strip()
        q2_responses = list(map(int, response_list[1:1+len(q2)]))
        q3_response = response_list[1+len(q2)].strip()
        q4_responses = list(map(int, response_list[2+len(q2):]))

        print(f"Processed row {index}")  # Print row number if successful

        return [q1_response] + q2_responses + [q3_response] + q4_responses + [comment_part]

    except Exception as e:
        logging.error(f"Error processing response at row {index}: {response} | Error: {str(e)}")
        print(f"Error at row {index}: {response}")  # Print the problematic response
        return [None] * (1 + len(q2) + 1 + len(q4)) + [response]  # Return response as comment if error occurs

# Ensure df_output is defined and contains the necessary column
if 'response' in df_output.columns:
    df_output = df_output.copy()  # Work with a copy to maintain original data integrity
    df_output[columns] = df_output['response'].apply(lambda x: pd.Series(extract_response(x, df_output.index[df_output['response'] == x][0])))

    # Display the cleaned DataFrame

    # Display the first few rows of the extracted DataFrame
    print(df_output.head())

    # Optionally, save the output to a CSV file for review
    df_output.to_csv("extracted_responses.csv", index=False)

    # If using Jupyter Notebook, display the DataFrame interactively
    from IPython.display import display
    display(df_output)

else:
    logging.error("Column 'response' not found in df_output. Ensure the dataset is loaded correctly.")
    print("Error: 'response' column not found in df_output. Check dataset formatting.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed row 41182
Processed row 41183
Processed row 41184
Processed row 41185
Processed row 41186
Processed row 41187
Processed row 41188
Processed row 41189
Processed row 41190
Processed row 41191
Processed row 41192
Processed row 41193
Processed row 41194
Processed row 41195
Processed row 41196
Processed row 41197
Processed row 41198
Processed row 41199
Processed row 41200
Processed row 41201
Processed row 41202
Processed row 41203
Processed row 41204
Processed row 41205
Processed row 41206
Processed row 41207
Processed row 41208
Processed row 41209
Processed row 41210
Processed row 41211
Processed row 41212
Processed row 41213
Processed row 41214
Processed row 41215
Processed row 41216
Processed row 41217
Processed row 41218
Processed row 41219
Processed row 41220
Processed row 41221
Processed row 41222
Processed row 41223
Processed row 41224
Processed row 41225
Processed row 41226
Processed row 41227
Processed row 4

Unnamed: 0,prompt,response,q1_response,economic,capacity,fairness,legality,defence,health,qualityoflife,...,conspiracy,q3_response,identification,blame,solutions,tactics,solidarity,counterframing,motivational,comment
0,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,1,0,0,OPINION,0,1,0,1,0...",YES,0,0,0,0,0,0,1,...,0,OPINION,0,1,0,1,0,0,0,Explanation:\nQuestion 1: YES. The post contai...
1,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,0,0,0,1,0,0,OPINION,0,1,0,0,0...",YES,0,0,0,0,0,0,0,...,0,OPINION,0,1,0,0,0,0,0,Explanation:\nQuestion 1: YES - The post conta...
2,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,0,OPINION,0,0,0,0,0...",YES,0,0,0,0,0,0,1,...,0,OPINION,0,0,0,0,0,0,0,Explanation: \n1. The post contains enough inf...
3,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,0,0,0,0,0,0,OPINION,1,1,0,0,0...",YES,0,0,0,0,0,0,0,...,0,OPINION,1,1,0,0,0,0,0,Explanation:\nQuestion 1: The post contains su...
4,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,0,INFO,0,0,0,0,0,0,...",YES,0,0,0,0,0,0,1,...,0,INFO,0,0,0,0,0,0,0,Explanation:\nQuestion 1: The post does provid...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46134,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,1,0,0,INFORMATIONAL,0,0...",YES,0,0,0,0,0,0,1,...,0,INFORMATIONAL,0,0,0,0,0,0,0,Explanation:\nQuestion 1: YES - The post provi...
46135,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,1,INFORMATIONAL,0,0...",YES,0,0,0,0,0,0,1,...,1,INFORMATIONAL,0,0,0,0,0,0,0,Explanation:\nQuestion 1: The post does contai...
46136,"Given the following input, answer the followin...","[YES,1,0,0,0,0,1,0,0,0,1,0,0,INFORMATIONAL,1,0...",YES,1,0,0,0,0,1,0,...,0,INFORMATIONAL,1,0,0,0,1,0,0,Explanation:\nQuestion 1: YES - The post conta...
46137,"Given the following input, answer the followin...","[YES,0,0,0,0,0,0,1,0,0,0,0,0,INFORMATIONAL,0,0...",YES,0,0,0,0,0,0,1,...,0,INFORMATIONAL,0,0,0,0,0,0,0,Explanation:\nThere is not enough information ...


## Column stats

In [None]:
import pandas as pd
import logging

# Set up logging
logging.basicConfig(filename="column_stats_errors.log", level=logging.ERROR, format="%(asctime)s - %(levelname)s - %(message)s")

# Function to calculate statistics for each column with error handling
def column_stats(df, col_name):
    try:
        if col_name not in df.columns:
            raise KeyError(f"Column '{col_name}' not found in DataFrame")

        if df[col_name].dtype == 'object':  # For categorical columns
            value_counts = df[col_name].value_counts()
            print(f"Column: {col_name}")
            print(value_counts)
            print(f"Total: {value_counts.sum()}\n")
        else:  # For binary columns (q2 and q4)
            ones = df[col_name].sum()
            zeros = len(df) - ones
            print(f"Column: {col_name}")
            print(f"1s: {ones}, 0s: {zeros}")
            print(f"Total: {len(df)}\n")
    except Exception as e:
        logging.error(f"Error processing column '{col_name}': {str(e)}")

# Verify 'response' column exists before proceeding
if 'response' in df_output.columns:
    for col in df_output.columns:
        column_stats(df_output, col)
else:
    logging.error("Column 'response' not found in DataFrame. Ensure the dataset is loaded correctly.")
    print("Error: 'response' column not found. Check dataset formatting.")


Column: prompt
prompt
Given the following input, answer the following questions. Provide a brief explanation for your analysis.\nInput: NASA says huge, potentially hazardous asteroid will break into Earths orbit next week \n                                                                                                                                                                                                   29
Given the following input, answer the following questions. Provide a brief explanation for your analysis.\nInput: @laurenboebert Lauren Boebert has a net worth of over $12 million. Her previous work experience was as an Assistant Manager at a McDonalds. Where did she get all this money?\n                                                                                                            13
Given the following input, answer the following questions. Provide a brief explanation for your analysis.\nInput: Florida woman dies after unsuccessful suit to get ivermectin \n 