# Import libraries

In [1]:
import os
import pandas as pd

# Mount drive

In [None]:
# Colab-specific step: attach the drive at /content/drive, which is the root
# of every dataset path used in this notebook.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store cleaned data

In [2]:
# Output folder for the files written below; exist_ok keeps this cell safe
# to re-run when the folder is already there.
os.makedirs(
    name='/content/drive/MyDrive/datasets/cleaning',
    exist_ok=True)

# Import datasets

## Get lists of files in **/content/drive/MyDrive/datasets/IHC_classes_gpt-3.5-turbo-0301** and **/content/drive/MyDrive/datasets/IHC_classes_gpt-3.5-turbo-0613** directories

In [3]:
# Gather the entry names of each model's response directory, sorted so the
# entries are always processed in the same order. via
# https://www.geeksforgeeks.org/python-list-files-in-a-directory/
dir_0301 = '/content/drive/MyDrive/datasets/IHC_classes_gpt-3.5-turbo-0301'
dir_0613 = '/content/drive/MyDrive/datasets/IHC_classes_gpt-3.5-turbo-0613'
file_list_0301 = sorted(os.listdir(dir_0301))
file_list_0613 = sorted(os.listdir(dir_0613))

## Concatenate files in file lists into one Pandas DataFrame for each list

In [4]:
def _load_side_by_side(directory, filenames):
    """Read each tab-separated file and join the frames column-wise.

    Columns whose values repeat an earlier column are then removed.
    """
    # Import and concatenate multiple files. via
    # https://stackoverflow.com/a/21232849
    frames = [
        pd.read_csv(f'{directory}/{filename}', sep='\t')
        for filename in filenames
    ]
    side_by_side = pd.concat(frames, axis=1)
    # Drop duplicate columns. via
    # https://www.geeksforgeeks.org/
    # how-to-find-drop-duplicate-columns-in-a-pandas-dataframe/
    # NOTE(review): duplicates are detected by column *values*, not names, so
    # two response columns with identical content would also collapse into
    # one -- confirm that is acceptable for this data.
    transposed = side_by_side.T
    return transposed.drop_duplicates().T


IHC_classes_0301 = _load_side_by_side(dir_0301, file_list_0301)
IHC_classes_0613 = _load_side_by_side(dir_0613, file_list_0613)

# Define a function to get the distribution of responses

In [5]:
def get_responses_distribution(
        df, model, output_dir='/content/drive/MyDrive/datasets/cleaning'):
    """Tally the responses in ``df``, save the tally, and report nulls.

    Every column from the third one onwards is treated as a response
    column. The distinct response values are counted across all of those
    columns and written to
    ``<output_dir>/IHC_classes_responses_distribution_<model>.csv`` with the
    columns ``Responses`` and ``Count``. The number of null cells in the
    response columns is printed.

    Args:
        df: DataFrame whose columns from position 2 onwards hold responses.
        model: Label inserted into the output file name.
        output_dir: Existing directory that receives the CSV file. Defaults
            to the folder this notebook uses for its cleaning output.

    Returns:
        None.
    """
    # Slice once; both the tally and the null count use the same columns.
    responses = df.iloc[:, 2:]
    # Value counts across multiple columns. via
    # https://stackoverflow.com/a/61565732
    responses_distribution = responses.stack().value_counts()
    # Convert value_counts output to dataframe format. via
    # https://stackoverflow.com/a/47136484
    responses_distribution = responses_distribution.rename_axis(
        'Responses').reset_index(name='Count')
    responses_distribution.to_csv(
        f'{output_dir}/IHC_classes_responses_distribution_{model}.csv',
        index=False)
    # Count null values in dataframe. via
    # https://stackoverflow.com/questions/26266362/
    # how-do-i-count-the-nan-values-in-a-column-in-pandas-dataframe#
    # comment74712638_26266451
    print('Number of unusable responses is:')
    print(responses.isna().sum().sum())

In [None]:
# Distribution of the responses of both model snapshots before any remapping.
get_responses_distribution(df=IHC_classes_0301, model='gpt-3.5-turbo-0301')
get_responses_distribution(df=IHC_classes_0613, model='gpt-3.5-turbo-0613')

# Clean valid responses to conform to how the data was coded, e.g., map `A: Yes, explicit hate speech.` to `explicit_hate`

In [7]:
# Remap values. via
# https://stackoverflow.com/a/64231348
def remap(x):
    """Map a response that matches an answer option to its coded label.

    Matching is case-insensitive and by substring, checked in the order
    explicit, implicit, not-hate.

    Args:
        x: A single cell value from a response column.

    Returns:
        ``'explicit_hate'``, ``'implicit_hate'`` or ``'not_hate'`` when the
        corresponding answer option occurs in ``x``; otherwise the
        lower-cased string. Non-string values are returned unchanged.
    """
    # Non-string cells (e.g. NaN for a missing response) have no .lower();
    # pass them through instead of raising AttributeError.
    if not isinstance(x, str):
        return x
    x = x.lower()
    if 'a: yes, explicit hate speech' in x:
        return 'explicit_hate'
    elif 'b: yes, implicit hate speech' in x:
        return 'implicit_hate'
    elif 'c: not hate speech' in x:
        return 'not_hate'
    else:
        # Unmatched responses stay lower-cased for later clean-up.
        return x


# Run remap over every response cell (third column onwards) and write the
# results back into the same columns.
IHC_classes_0301.iloc[:, 2:] = (
    IHC_classes_0301.iloc[:, 2:].applymap(remap)
)

In [None]:
# Re-check the gpt-3.5-turbo-0301 distribution after the remapping pass.
get_responses_distribution(df=IHC_classes_0301, model='gpt-3.5-turbo-0301')

# Clean the remaining valid responses to conform to how the data was coded, e.g., `not_hate` for `A: not hate speech.` (missed in the previous round because the response carried the wrong option letter; the expected form is `C: not hate speech.`)

In [None]:
# Remap values. via
# https://stackoverflow.com/a/64231348
def remap_again(x):
    """Map a response to its coded label regardless of the option letter.

    Matching is by substring and case-insensitive, so the result does not
    depend on the cell having been lower-cased beforehand.

    Args:
        x: A single cell value from a response column.

    Returns:
        ``'not_hate'`` when ``x`` contains "not hate speech",
        ``'implicit_hate'`` when it contains "yes, implicit hate speech";
        otherwise ``x`` unchanged. Non-string values are returned unchanged.
    """
    # Non-string cells (e.g. NaN for a missing response) do not support the
    # substring test; pass them through instead of raising TypeError.
    if not isinstance(x, str):
        return x
    lowered = x.lower()
    if 'not hate speech' in lowered:
        return 'not_hate'
    elif 'yes, implicit hate speech' in lowered:
        return 'implicit_hate'
    else:
        return x


# Run remap_again over every response cell (third column onwards) and write
# the results back into the same columns.
IHC_classes_0301.iloc[:, 2:] = (
    IHC_classes_0301.iloc[:, 2:].applymap(remap_again)
)

In [None]:
# Re-check the gpt-3.5-turbo-0301 distribution after the remapping pass.
get_responses_distribution(df=IHC_classes_0301, model='gpt-3.5-turbo-0301')

# Save cleaned responses

In [None]:
# Write both frames as tab-separated files without the index column.
# NOTE(review): no remapping is applied to IHC_classes_0613 in the cells
# visible here, so it is saved as loaded -- confirm that is intended.
IHC_classes_0301.to_csv(
    path_or_buf=('/content/drive/MyDrive/datasets/cleaning/'
                 'IHC_classes_predictions_cleaned_gpt-3.5-turbo-0301.tsv'),
    index=False,
    sep='\t')
IHC_classes_0613.to_csv(
    path_or_buf=('/content/drive/MyDrive/datasets/cleaning/'
                 'IHC_classes_predictions_cleaned_gpt-3.5-turbo-0613.tsv'),
    index=False,
    sep='\t')