# Import libraries

In [1]:
import os
import re
import pandas as pd

# Mount drive

In [None]:
# Colab-only helper: mounting exposes Google Drive under /content/drive,
# the root that every dataset path in this notebook is built on.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store cleaned data

In [2]:
# exist_ok=True keeps this cell re-runnable once the folder already exists;
# makedirs also creates any missing intermediate folders.
os.makedirs('/content/drive/MyDrive/datasets/cleaning', exist_ok=True)

# Import datasets

## Get lists of files in **/content/drive/MyDrive/datasets/POPQUORN_gpt-3.5-turbo-0301** and **/content/drive/MyDrive/datasets/POPQUORN_gpt-3.5-turbo-0613** directories

In [3]:
# Gather the entry names found in each model's output folder, in sorted
# order so both runs are read in the same sequence.
# via https://www.geeksforgeeks.org/python-list-files-in-a-directory/
dir_0301 = '/content/drive/MyDrive/datasets/POPQUORN_gpt-3.5-turbo-0301'
file_list_0301 = sorted(os.listdir(dir_0301))
dir_0613 = '/content/drive/MyDrive/datasets/POPQUORN_gpt-3.5-turbo-0613'
file_list_0613 = sorted(os.listdir(dir_0613))

## Concatenate files in file lists into one Pandas DataFrame for each list

In [4]:
# Read every tab-separated file of a run, then place the resulting frames
# side by side (axis=1). Import and concatenate multiple files. via
# https://stackoverflow.com/a/21232849
frames_0301 = [pd.read_csv(f'{dir_0301}/{name}', sep='\t')
               for name in file_list_0301]
POPQUORN_0301 = pd.concat(frames_0301, axis=1)
frames_0613 = [pd.read_csv(f'{dir_0613}/{name}', sep='\t')
               for name in file_list_0613]
POPQUORN_0613 = pd.concat(frames_0613, axis=1)
# Drop duplicate columns. via
# https://www.geeksforgeeks.org/
# how-to-find-drop-duplicate-columns-in-a-pandas-dataframe/
# Transposing turns columns into rows so drop_duplicates() can compare
# them; transposing back restores the original orientation.
# NOTE(review): the comparison is on column *values*, not column names, so
# two differently named columns with identical content would collapse into
# one - confirm this cannot happen with these files.
POPQUORN_0301 = POPQUORN_0301.T.drop_duplicates().T
POPQUORN_0613 = POPQUORN_0613.T.drop_duplicates().T

# Define a function to get the distribution of responses

In [5]:
def get_responses_distribution(
        df, model, out_dir='/content/drive/MyDrive/datasets/cleaning'):
    """Tally all response values in ``df`` and save the tally as a CSV.

    The first column of ``df`` is skipped; the values of every remaining
    column are pooled and counted. The counts are written to
    ``<out_dir>/POPQUORN_responses_distribution_<model>.csv`` with the
    columns ``Responses`` and ``Count``, and the number of missing cells in
    those columns is printed.

    The file name depends only on ``model``, so calling this function again
    for the same model overwrites the previously written file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns from position 1 onwards hold the responses.
    model : str
        Label inserted into the output file name.
    out_dir : str, optional
        Folder that receives the CSV; created if it does not exist.
        Defaults to the cleaning folder used throughout this notebook.

    Returns
    -------
    None
    """
    responses = df.iloc[:, 1:]
    # Value counts across multiple columns. via
    # https://stackoverflow.com/a/61565732
    counts = responses.stack().value_counts()
    # Convert value_counts output to dataframe format. via
    # https://stackoverflow.com/a/47136484
    responses_distribution = counts.rename_axis(
        'Responses').reset_index(name='Count')
    # Make sure the target folder exists so the function also works when the
    # directory-creation cell has not been run.
    os.makedirs(out_dir, exist_ok=True)
    responses_distribution.to_csv(
        os.path.join(
            out_dir, f'POPQUORN_responses_distribution_{model}.csv'),
        index=False)
    # Count null values in dataframe. via
    # https://stackoverflow.com/questions/26266362/
    # how-do-i-count-the-nan-values-in-a-column-in-pandas-dataframe#
    # comment74712638_26266451
    print('Number of unusable responses is:')
    print(responses.isna().sum().sum())

In [None]:
# Tally the responses of both model runs as loaded, before any cleaning.
for frame, label in ((POPQUORN_0301, 'gpt-3.5-turbo-0301'),
                     (POPQUORN_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, label)

# Remove unsuccessful persona priming

In [7]:
def rm_failed(x):
    """Blank out responses in which the model refused the persona.

    Parameters
    ----------
    x : object
        A single cell value; anything that is not missing is converted
        with ``str`` and lower-cased before it is inspected.

    Returns
    -------
    str or None
        ``None`` when ``x`` is missing (``None``/NaN) or contains one of
        the refusal phrases ``'as an ai language model'`` or
        ``'as a language model'``; otherwise the lower-cased text.
    """
    # Keep missing cells missing. Without this guard ``str`` would turn NaN
    # into the literal text 'nan', which is then counted as a response and
    # no longer shows up in the unusable-response count.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return None
    text = str(x).lower()
    if 'as an ai language model' in text or 'as a language model' in text:
        return None
    return text


# Positions of the columns to screen: everything in 0..30 except
# 0 (tweet) and 16 (predicted_scores_base).
cols = [position for position in range(31) if position not in (0, 16)]
# rm_failed already takes a single cell value, so it is passed directly.
POPQUORN_0301.iloc[:, cols] = POPQUORN_0301.iloc[:, cols].applymap(rm_failed)
POPQUORN_0613.iloc[:, cols] = POPQUORN_0613.iloc[:, cols].applymap(rm_failed)

In [None]:
# Tally again now that failed persona primings have been blanked out.
for frame, label in ((POPQUORN_0301, 'gpt-3.5-turbo-0301'),
                     (POPQUORN_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, label)

# Clean valid responses to conform to how the data was coded, e.g., `1` if `1. not offensive at all.`

In [9]:
def remap(x):
    """Reduce a free-text response to its single rating digit.

    A response is accepted when, apart from non-digit characters, it
    contains exactly one rating ``1``-``5``, optionally written with a
    trailing ``.0`` (e.g. ``'1. not offensive at all.'``, ``'3'``,
    ``'4.0'``). Everything else is treated as invalid.

    Parameters
    ----------
    x : object
        A single cell value; it is converted with ``str`` before matching.

    Returns
    -------
    str or None
        The rating as a one-character string (``'1'`` .. ``'5'``), or
        ``None`` when no unambiguous rating is found.
    """
    # Determine if string only contains one digit. via
    # https://stackoverflow.com/a/39799113
    # Raw string so that ``\D`` reaches the regex engine untouched, and an
    # escaped dot so that only a literal ".0" is accepted - an unescaped
    # "." matches any character and would code "100" or "1 0" as 1.
    match = re.search(r"^\D*([1-5])(?:\.0)?\D*$", str(x))
    return match.group(1) if match else None


# Recode every column after the first, in both runs, to the bare rating
# digit; cells that remap() rejects become missing.
for frame in (POPQUORN_0301, POPQUORN_0613):
    frame.iloc[:, 1:] = frame.iloc[:, 1:].applymap(remap)

In [None]:
# Final tally: only the recoded ratings remain as response values.
for frame, label in ((POPQUORN_0301, 'gpt-3.5-turbo-0301'),
                     (POPQUORN_0613, 'gpt-3.5-turbo-0613')):
    get_responses_distribution(frame, label)

# Listwise deletion of cases with invalid responses

In [None]:
# Flag the rows that have a missing value in any column after the first.
# Select rows with null values. via
# https://stackoverflow.com/a/14247708
has_null_0301 = POPQUORN_0301[POPQUORN_0301.columns[1:]].isna().any(axis=1)
has_null_0613 = POPQUORN_0613[POPQUORN_0613.columns[1:]].isna().any(axis=1)
# A text flagged in either run is removed from both runs, so the two
# cleaned frames are filtered by the same list.
todrop = (POPQUORN_0301[has_null_0301]['text'].tolist()
          + POPQUORN_0613[has_null_0613]['text'].tolist())
# Drop rows where column values match any list element. via
# https://stackoverflow.com/a/27965417
POPQUORN_0301 = POPQUORN_0301[
    ~POPQUORN_0301['text'].isin(todrop)].reset_index(drop=True)
POPQUORN_0613 = POPQUORN_0613[
    ~POPQUORN_0613['text'].isin(todrop)].reset_index(drop=True)

# Save cleaned responses

In [None]:
# Write each cleaned frame as a tab-separated file, without the index,
# into the cleaning folder; the model name is part of the file name.
for frame, model_name in ((POPQUORN_0301, 'gpt-3.5-turbo-0301'),
                          (POPQUORN_0613, 'gpt-3.5-turbo-0613')):
    frame.to_csv(
        '/content/drive/MyDrive/datasets/cleaning/'
        f'POPQUORN_predictions_cleaned_{model_name}.tsv',
        sep='\t', index=False)