# Import libraries

In [None]:
import os
import pandas as pd

# Mount drive

In [None]:
# Make Google Drive available under /content/drive; every dataset path used
# in this file is rooted at /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store cleaned data

In [None]:
# exist_ok=True keeps this call from raising when the directory is already
# present, so the cell can be re-run safely.
os.makedirs('/content/drive/MyDrive/datasets/cleaning', exist_ok=True)

# Import datasets

In [None]:
# Load the tab-separated prediction files for both model snapshots and give
# each frame a fresh positional index (the previous index is discarded).
COVID_HATE_0301 = pd.read_csv(
    '/content/drive/MyDrive/datasets/COVID-HATE_gpt-3.5-turbo-0301/'
    'COVID-HATE_predictions_all_gpt-3.5-turbo-0301.tsv',
    sep='\t',
).reset_index(drop=True)
COVID_HATE_0613 = pd.read_csv(
    '/content/drive/MyDrive/datasets/COVID-HATE_gpt-3.5-turbo-0613/'
    'COVID-HATE_predictions_all_gpt-3.5-turbo-0613.tsv',
    sep='\t',
).reset_index(drop=True)

# Define a function to get the distribution of responses

In [None]:
def get_responses_distribution(df, model):
    """Write the response value counts for ``model`` and print the NaN total.

    Every column of ``df`` from the fourth one onward is treated as a
    response column. The distinct values found there are counted and saved
    as a two-column CSV (``Responses``, ``Count``) whose file name embeds
    ``model``. The number of missing cells in those columns is then printed.

    Args:
        df: DataFrame whose columns at position 3 and beyond hold responses.
        model: Model identifier interpolated into the output file name.

    Returns:
        None. The results are written to disk and printed.
    """
    response_columns = df.iloc[:, 3:]

    # Value counts across multiple columns. via
    # https://stackoverflow.com/a/61565732
    # value_counts() skips NaN by default, so missing cells are not tallied.
    counts = response_columns.stack().value_counts()

    # Convert value_counts output to dataframe format. via
    # https://stackoverflow.com/a/47136484
    counts_table = counts.rename_axis('Responses').reset_index(name='Count')

    output_path = (
        '/content/drive/MyDrive/datasets/cleaning/'
        f'COVID-HATE_responses_distribution_{model}.csv'
    )
    counts_table.to_csv(output_path, index=False)

    # Count null values in dataframe. via
    # https://stackoverflow.com/questions/26266362/
    # how-do-i-count-the-nan-values-in-a-column-in-pandas-dataframe#
    # comment74712638_26266451
    # First sum() gives per-column NaN counts, second sum() totals them.
    missing_total = response_columns.isna().sum().sum()
    print('Number of unusable responses is:', missing_total, sep='\n')

In [None]:
# Distribution of the responses as loaded, before any remapping; each call
# also prints that frame's count of missing responses.
get_responses_distribution(COVID_HATE_0301, 'gpt-3.5-turbo-0301')
get_responses_distribution(COVID_HATE_0613, 'gpt-3.5-turbo-0613')

# Clean valid responses to conform to how the data was coded, e.g., `0` if `Neutral.`

In [None]:
# Remap values. via
# https://stackoverflow.com/a/64231348
def remap(x):
    """Map a raw model response to its coded label.

    A response whose text contains ``'Neutral.'`` becomes ``'0'``,
    ``'Counterhate.'`` becomes ``'1'`` and ``'Hatespeech.'`` becomes ``'2'``.
    The labels are tested in that order, so a response mentioning several
    of them takes the first match. Any other non-missing value is returned
    as ``str(x)``.

    Missing values (``None``/``NaN``) are returned unchanged. Converting
    them with ``str`` would produce the literal text ``'nan'``, which
    ``isna()`` does not detect, so a missing-response count taken after
    cleaning would wrongly drop to zero and ``'nan'`` would show up as a
    response category.

    Args:
        x: A single cell value from a response column.

    Returns:
        ``'0'``, ``'1'`` or ``'2'`` for a recognised label, ``x`` itself
        when it is missing, otherwise ``str(x)``.
    """
    # Keep missing cells missing instead of stringifying them.
    if pd.isna(x):
        return x

    response = str(x)
    for label, code in (('Neutral.', '0'),
                        ('Counterhate.', '1'),
                        ('Hatespeech.', '2')):
        if label in response:
            return code
    return response


# Recode every response column (position 3 onward) cell by cell, writing the
# result back into the same frames.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; it is kept here so older pandas versions still work.
COVID_HATE_0301.iloc[:, 3:] = COVID_HATE_0301.iloc[:, 3:].applymap(remap)
COVID_HATE_0613.iloc[:, 3:] = COVID_HATE_0613.iloc[:, 3:].applymap(remap)

In [None]:
# Recompute the distribution on the remapped frames.
# NOTE(review): the model names match the earlier calls, so the distribution
# CSV files written before remapping are overwritten here.
get_responses_distribution(COVID_HATE_0301, 'gpt-3.5-turbo-0301')
get_responses_distribution(COVID_HATE_0613, 'gpt-3.5-turbo-0613')

# Save cleaned responses

In [None]:
# Persist both cleaned frames as tab-separated files, without the index
# column, in the cleaning directory.
COVID_HATE_0301.to_csv(
    path_or_buf=(
        '/content/drive/MyDrive/datasets/cleaning/'
        'COVID-HATE_predictions_cleaned_gpt-3.5-turbo-0301.tsv'
    ),
    index=False,
    sep='\t',
)
COVID_HATE_0613.to_csv(
    path_or_buf=(
        '/content/drive/MyDrive/datasets/cleaning/'
        'COVID-HATE_predictions_cleaned_gpt-3.5-turbo-0613.tsv'
    ),
    index=False,
    sep='\t',
)