# Import libraries

In [1]:
import os
import pandas as pd

# Mount drive

In [None]:
# Colab-only step: the google.colab package is supplied by the Colab runtime.
# Mounting at /content/drive is what makes the /content/drive/MyDrive/...
# paths used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store cleaned data

In [2]:
# Folder that receives every file this notebook writes; creating it is a
# no-op when it already exists.
cleaning_dir = '/content/drive/MyDrive/datasets/cleaning'
os.makedirs(cleaning_dir, exist_ok=True)

# Import datasets

## Get lists of files in **/content/drive/MyDrive/datasets/SBIC_gpt-3.5-turbo-0301** and **/content/drive/MyDrive/datasets/SBIC_gpt-3.5-turbo-0613** directories

In [3]:
# Collect the file names found in each model's directory, in sorted order so
# the files are always processed in the same sequence. Listing technique via
# https://www.geeksforgeeks.org/python-list-files-in-a-directory/
dir_0301 = '/content/drive/MyDrive/datasets/SBIC_gpt-3.5-turbo-0301'
dir_0613 = '/content/drive/MyDrive/datasets/SBIC_gpt-3.5-turbo-0613'
file_list_0301 = sorted(os.listdir(dir_0301))
file_list_0613 = sorted(os.listdir(dir_0613))

## Concatenate files in file lists into one Pandas DataFrame for each list

In [4]:
# Read every tab-separated file of a model run and lay the resulting frames
# side by side (axis=1), giving one wide DataFrame per model.
# Multi-file import pattern via https://stackoverflow.com/a/21232849
frames_0301 = [pd.read_csv(f'{dir_0301}/{file}', sep='\t')
               for file in file_list_0301]
SBIC_0301 = pd.concat(frames_0301, axis=1)
frames_0613 = [pd.read_csv(f'{dir_0613}/{file}', sep='\t')
               for file in file_list_0613]
SBIC_0613 = pd.concat(frames_0613, axis=1)
# Remove columns repeated across files by de-duplicating the transposed
# frame. via https://www.geeksforgeeks.org/
# how-to-find-drop-duplicate-columns-in-a-pandas-dataframe/
# NOTE(review): this compares column *values*, not names, so two differently
# named columns with identical content would also collapse into one -- confirm
# that cannot happen, since later cells address columns by position.
SBIC_0301 = SBIC_0301.T.drop_duplicates().T
SBIC_0613 = SBIC_0613.T.drop_duplicates().T

# Define a function to get the distribution of responses

In [5]:
def get_responses_distribution(
        df, model,
        output_dir='/content/drive/MyDrive/datasets/cleaning',
        first_response_col=15):
    """Write the frequency table of response values and report missing ones.

    Every column from position ``first_response_col`` onward is treated as a
    response column. Their values are pooled, counted, and saved as
    ``SBIC_responses_distribution_{model}.csv`` (columns ``Responses`` and
    ``Count``) inside ``output_dir``. The total number of null cells in those
    columns is then printed.

    Args:
        df: DataFrame whose trailing columns hold the responses.
        model: Label inserted into the output file name.
        output_dir: Existing directory that receives the CSV. Defaults to the
            notebook's cleaning folder.
        first_response_col: Position of the first response column.

    Returns:
        None. The results are the CSV file and the printed count.
    """
    responses = df.iloc[:, first_response_col:]
    # Value counts across multiple columns. via
    # https://stackoverflow.com/a/61565732
    responses_distribution = responses.stack().value_counts()
    # Convert value_counts output to dataframe format. via
    # https://stackoverflow.com/a/47136484
    responses_distribution = responses_distribution.rename_axis(
        'Responses').reset_index(name='Count')
    responses_distribution.to_csv(
        os.path.join(output_dir,
                     f'SBIC_responses_distribution_{model}.csv'),
        index=False)
    # Count null values in dataframe. via
    # https://stackoverflow.com/questions/26266362/
    # how-do-i-count-the-nan-values-in-a-column-in-pandas-dataframe#
    # comment74712638_26266451
    print('Number of unusable responses is:')
    print(responses.isna().sum().sum())

In [None]:
# Report the response distribution of both model runs before any cleaning.
for frame, model_name in ((SBIC_0301, 'gpt-3.5-turbo-0301'),
                          (SBIC_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, model_name)

# Remove unsuccessful persona priming

In [7]:
def rm_failed(x):
    """Blank out a response in which the model refused the persona.

    Args:
        x: A single cell value (any type).

    Returns:
        ``None`` when the value is missing (``None`` or float NaN) or when its
        lower-cased text contains one of the refusal phrases; otherwise the
        value converted to a lower-cased string.
    """
    # Keep missing cells missing. Stringifying them would yield 'none'/'nan',
    # which look like real responses and escape the isna() count.
    # `x != x` is true only for NaN.
    if x is None or (isinstance(x, float) and x != x):
        return None
    text = str(x).lower()
    refusal_markers = ('as an ai language model', 'as a language model')
    if any(marker in text for marker in refusal_markers):
        return None
    return text


# Column positions 15 through 78, leaving out position 62
# (predicted_q1a_labels_base), are passed through rm_failed.
cols = [position for position in range(15, 79) if position != 62]
for frame in (SBIC_0301, SBIC_0613):
    frame.iloc[:, cols] = frame.iloc[:, cols].applymap(rm_failed)

In [None]:
# Report the response distribution again now that refusals have been blanked.
for frame, model_name in ((SBIC_0301, 'gpt-3.5-turbo-0301'),
                          (SBIC_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, model_name)

# Clean valid responses to conform to how the data was coded, e.g., `1.0` if `A: Yes, this could be offensive.`

In [9]:
def remap(x):
    """Convert a lettered answer into its numeric code.

    Matching is case-insensitive and by substring:
    ``a: yes, this could be offensive`` -> ``'1.0'``,
    ``b: maybe, i'm not sure`` -> ``'0.5'``,
    ``c: no, this is harmless`` -> ``'0.0'``.

    Args:
        x: A single cell value (any type).

    Returns:
        The code as a string when a lettered answer is found; otherwise ``x``
        unchanged. Non-string values (``None``, NaN, numbers) pass through
        untouched.
    """
    # Only strings can be searched; a float or NaN would otherwise fail on
    # .lower().
    if not isinstance(x, str):
        return x
    text = x.lower()
    if 'a: yes, this could be offensive' in text:
        return '1.0'
    if 'b: maybe, i\'m not sure' in text:
        return '0.5'
    if 'c: no, this is harmless' in text:
        return '0.0'
    return x


# Recode, in place, every column from position 15 onward in both frames.
for frame in (SBIC_0301, SBIC_0613):
    frame.iloc[:, 15:] = frame.iloc[:, 15:].applymap(remap)

In [None]:
# Report the response distribution after the lettered answers were recoded.
for frame, model_name in ((SBIC_0301, 'gpt-3.5-turbo-0301'),
                          (SBIC_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, model_name)

# Clean the remaining valid responses to conform to how the data was coded, e.g., `0.0` if `No, this is harmless.` (not remapped in the previous round because the response omitted the letter prefix, e.g., `C:`)

In [None]:
def remap(x):
    """Recode answers given without a letter prefix; discard anything else.

    This definition replaces the earlier ``remap`` for all cells below it.

    Matching is case-insensitive and by substring:
    ``yes, this could be offensive`` -> ``'1.0'`` and
    ``no, this is harmless`` -> ``'0.0'``. A value whose string form is
    already ``'1.0'``, ``'0.5'`` or ``'0.0'`` is kept as it is.

    Args:
        x: A single cell value (any type).

    Returns:
        ``'1.0'`` or ``'0.0'`` for a recognised answer, ``x`` itself when it is
        already a valid code, and ``None`` for everything else (including
        ``None`` and NaN).
    """
    # Search text only in real strings; a float or NaN would otherwise fail on
    # .lower() before the already-coded check below could run.
    if isinstance(x, str):
        text = x.lower()
        if 'yes, this could be offensive' in text:
            return '1.0'
        if 'no, this is harmless' in text:
            return '0.0'
    # NOTE(review): an unprefixed "maybe, i'm not sure" is not recoded here and
    # therefore becomes None -- confirm that this is intended.
    if str(x) in {'1.0', '0.5', '0.0'}:
        return x
    return None


# Run the second-pass recoding, in place, over every column from position 15
# onward in both frames.
for frame in (SBIC_0301, SBIC_0613):
    frame.iloc[:, 15:] = frame.iloc[:, 15:].applymap(remap)

In [None]:
# Report the final response distribution once all recoding is done.
for frame, model_name in ((SBIC_0301, 'gpt-3.5-turbo-0301'),
                          (SBIC_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, model_name)

# Listwise deletion of cases with invalid responses

In [None]:
# Gather the posts that have at least one null response in EITHER model's
# frame. Building the list from one frame only would leave rows in the other
# frame that are invalid there alone.
# Select rows with null values. via
# https://stackoverflow.com/a/14247708
todrop = []
for frame in (SBIC_0301, SBIC_0613):
    has_invalid = frame.iloc[:, 15:].isna().any(axis=1)
    todrop.extend(frame.loc[has_invalid, 'post'].tolist())
# Remove those posts from both frames so the two stay aligned on the same
# cases, then renumber the rows from zero.
# Drop rows where column values match any list element. via
# https://stackoverflow.com/a/27965417
SBIC_0301 = SBIC_0301[~SBIC_0301['post'].isin(todrop)].reset_index(drop=True)
SBIC_0613 = SBIC_0613[~SBIC_0613['post'].isin(todrop)].reset_index(drop=True)

# Save cleaned responses

In [None]:
# Write each cleaned frame as a tab-separated file, without the row index.
cleaned_outputs = (
    ('/content/drive/MyDrive/datasets/cleaning/'
     'SBIC_predictions_cleaned_gpt-3.5-turbo-0301.tsv', SBIC_0301),
    ('/content/drive/MyDrive/datasets/cleaning/'
     'SBIC_predictions_cleaned_gpt-3.5-turbo-0613.tsv', SBIC_0613),
)
for out_path, frame in cleaned_outputs:
    frame.to_csv(out_path, sep='\t', index=False)