# Import libraries

In [None]:
import json
import os
import pandas as pd

# Mount drive

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used in the rest of this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store cleaned data

In [None]:
# Make sure the output folder exists; exist_ok=True keeps a re-run of this
# cell from failing when the folder is already there.
os.makedirs(
    '/content/drive/MyDrive/datasets/'
    'cleaning',
    exist_ok=True,
)

# Import datasets

## Get lists of files in **/content/drive/MyDrive/datasets/IHC_NLE_gpt-3.5-turbo-0301** and **/content/drive/MyDrive/datasets/IHC_NLE_gpt-3.5-turbo-0613** directories

In [None]:
# Collect the names of all entries in each model version's response
# directory, in sorted order. Adapted from
# https://www.geeksforgeeks.org/python-list-files-in-a-directory/
dir_0301 = '/content/drive/MyDrive/datasets/IHC_NLE_gpt-3.5-turbo-0301'
file_list_0301 = sorted(os.listdir(dir_0301))
dir_0613 = '/content/drive/MyDrive/datasets/IHC_NLE_gpt-3.5-turbo-0613'
file_list_0613 = sorted(os.listdir(dir_0613))

## Concatenate files in file lists into one Pandas DataFrame for each list

In [None]:
# Read every tab-separated file of a model version and place the resulting
# frames side by side (column-wise). Adapted from
# https://stackoverflow.com/a/21232849
IHC_NLE_0301 = pd.concat(
    [pd.read_csv('/'.join((dir_0301, name)), sep='\t')
     for name in file_list_0301],
    axis='columns')
IHC_NLE_0613 = pd.concat(
    [pd.read_csv('/'.join((dir_0613, name)), sep='\t')
     for name in file_list_0613],
    axis='columns')
# Drop columns whose values repeat an earlier column: transpose so columns
# become rows, drop the duplicate rows, then transpose back. Adapted from
# https://www.geeksforgeeks.org/
# how-to-find-drop-duplicate-columns-in-a-pandas-dataframe/
IHC_NLE_0301 = IHC_NLE_0301.transpose().drop_duplicates().transpose()
IHC_NLE_0613 = IHC_NLE_0613.transpose().drop_duplicates().transpose()

# Define a function to remove responses indicating unsuccessful persona priming

In [None]:
def rm_failed(x):
    """Blank out a response that contains the AI-disclaimer phrase.

    Args:
        x: A single cell value, normally the response text.

    Returns:
        ``None`` when ``x`` contains ``'As an AI language model'``;
        otherwise ``x`` unchanged. Values that do not support the ``in``
        test (for example ``None`` or a float NaN from a missing cell) are
        returned unchanged instead of raising ``TypeError``.
    """
    try:
        failed = 'As an AI language model' in x
    except TypeError:
        # Missing cells are not strings; pass them through so that later
        # steps can treat them as nulls, as the other helpers do.
        return x
    return None if failed else x

# Define a function to extract possible JSON strings

In [None]:
# Get string between two substrings. via
# https://www.geeksforgeeks.org/python-extract-string-between-two-substrings/
def extract_JSON_like(x):
    """Return the first ``{"`` ... ``"}`` span found in ``x``.

    Args:
        x: A single cell value, normally the response text.

    Returns:
        The substring of ``x`` that starts at the first ``{"`` and ends with
        the first ``"}`` located at or after that opening marker (both
        markers included). ``x`` is returned unchanged when either marker is
        missing or when ``x`` has no ``index`` method (for example ``None``
        or a float NaN).
    """
    try:
        start = x.index('{"')
        # Search for the closing marker from the opening marker onwards, so
        # that a stray '"}' earlier in the text cannot yield an empty slice.
        end = x.index('"}', start)
    except (ValueError, AttributeError):
        return x
    return x[start:end + 2]

# Define a function to coerce strings into valid JSON strings

In [None]:
def coerce_JSON(x):
    """Return ``x`` as a JSON string, repairing the last field if needed.

    Args:
        x: A single cell value, normally a JSON-like string.

    Returns:
        * ``x`` unchanged when ``json.loads`` accepts it.
        * ``x`` unchanged when ``json.loads`` raises ``TypeError`` (``x`` is
          not text, for example ``None`` or a float NaN).
        * A repaired string when ``x`` is not valid JSON but contains the
          marker ``tweet": "``: everything after the last marker has its
          ``"`` and ``}`` characters removed and is closed with ``"}``.
        * ``None`` when ``x`` is not valid JSON and the marker is absent, so
          the value is treated as a null downstream rather than raising
          ``IndexError`` here.
    """
    marker = 'tweet": "'
    try:
        json.loads(x)
    except TypeError:
        return x
    except ValueError:
        # Not valid JSON: fall through to the repair below.
        pass
    else:
        return x
    head, found, tail = x.rpartition(marker)
    if not found:
        # Nothing to anchor the repair on; signal an unusable response.
        return None
    # Stray quotes/braces inside the final value are what usually break the
    # string, so strip them from the value and re-close the object.
    return head + marker + tail.replace('"', '').replace('}', '') + '"}'

# Define a function to separate columns of valid JSON strings into `GROUP` and `implied statement of implicitly hateful tweet` columns

In [None]:
def explode_JSON(df):
    """Clean the response columns of ``df`` and expand them into flat columns.

    Steps, in order:

    1. Columns at position 4 onward are passed cell by cell through
       ``rm_failed``.
    2. Columns at position 3 onward are passed cell by cell through
       ``extract_JSON_like`` and then ``coerce_JSON``.
    3. Each column at position 3 onward is evaluated into Python objects
       (null cells are left as they are), flattened with
       ``pd.json_normalize``, appended to the frame with the source column's
       name plus ``_`` as prefix, and the source column is removed.
    4. Spaces in the names of the columns at position 3 onward are replaced
       with underscores.

    Args:
        df: DataFrame whose columns from position 3 onward hold response
            text and have string names (the names are concatenated with
            ``'_'``). The first three columns are not processed.
            NOTE(review): position 3 is skipped by ``rm_failed`` --
            presumably it holds responses generated without a persona;
            confirm against the input files.

    Returns:
        The expanded DataFrame. Steps 1-2 assign into the frame that was
        passed in, so the caller's object is modified as well; step 3
        onward operates on new frames produced by ``pd.concat``.
    """
    # Cleaning passes; these write back into the passed-in frame via iloc.
    df.iloc[:, 4:] = df.iloc[:, 4:].applymap(lambda x: rm_failed(x))
    df.iloc[:, 3:] = df.iloc[:, 3:].applymap(lambda x: extract_JSON_like(x))
    df.iloc[:, 3:] = df.iloc[:, 3:].applymap(lambda x: coerce_JSON(x))
    # df.columns[3:] is evaluated once here, so the loop walks the original
    # response columns even though df is rebound inside the loop.
    for i in df.columns[3:]:
        # Flatten dictionary column. via
        # https://stackoverflow.com/a/72947328
        # NOTE(review): eval() executes the cell text as Python code. The
        # cells are presumably model-generated responses, so a JSON parser
        # would be the safer choice -- confirm before changing, because
        # eval() and json.loads() do not accept exactly the same strings.
        df = pd.concat([df, pd.json_normalize(
            df[i].map(lambda x: eval(x) if pd.notnull(x) else x)).add_prefix(
            i + '_')], axis=1)
        # Remove the source column now that its expanded columns are present.
        df.pop(i)
    # Normalise the names of the newly added columns (spaces -> underscores).
    for i in df.columns[3:]:
        df.rename(columns={i: i.replace(' ', '_')}, inplace=True)
    return df

# Call the `explode_JSON` function

In [None]:
# Replace each raw frame with the cleaned, flattened frame returned by
# explode_JSON (one call per model version).
IHC_NLE_0301 = explode_JSON(IHC_NLE_0301)
IHC_NLE_0613 = explode_JSON(IHC_NLE_0613)

# Listwise deletion of cases with invalid responses

In [None]:
# Gather the 'post' value of every 0301 row that has at least one null.
# Adapted from https://stackoverflow.com/a/14247708
todrop = IHC_NLE_0301.loc[
    IHC_NLE_0301.isna().any(axis='columns'), 'post'].tolist()
# Keep only the complete 0301 rows and renumber them from zero.
IHC_NLE_0301 = IHC_NLE_0301.dropna().reset_index(drop=True)
# Remove the same posts from the 0613 frame and renumber it as well.
# Adapted from https://stackoverflow.com/a/27965417
# NOTE(review): rows that are null only in the 0613 frame are not removed
# from either frame here -- confirm that this asymmetry is intended.
IHC_NLE_0613 = IHC_NLE_0613[
    ~IHC_NLE_0613['post'].isin(todrop)].reset_index(drop=True)

# Save cleaned responses

In [None]:
# Write each cleaned frame to the cleaning folder as a tab-separated file,
# leaving out the row index.
IHC_NLE_0301.to_csv(
    path_or_buf=(
        '/content/drive/MyDrive/datasets/cleaning/'
        'IHC_NLE_predictions_cleaned_gpt-3.5-turbo-0301.tsv'
    ),
    index=False,
    sep='\t',
)
IHC_NLE_0613.to_csv(
    path_or_buf=(
        '/content/drive/MyDrive/datasets/cleaning/'
        'IHC_NLE_predictions_cleaned_gpt-3.5-turbo-0613.tsv'
    ),
    index=False,
    sep='\t',
)