# Import libraries

In [None]:
import json
import os
import pandas as pd
from scipy import stats
import shutil
import tarfile
import urllib.request

# Mount drive

In [None]:
# `drive` comes from the google.colab package, so this cell is meant to run
# inside a Google Colab runtime.
from google.colab import drive
# Mount at /content/drive; every later cell reads and writes its files under
# /content/drive/MyDrive/datasets.
drive.mount('/content/drive')

# Create directory to store data

In [None]:
# Make sure the dataset folder exists on the mounted drive; exist_ok=True
# keeps a re-run of this cell from failing when the folder is already there.
os.makedirs(name='/content/drive/MyDrive/datasets', exist_ok=True)

# Download **Social Bias Inference Corpus (SBIC)**

Sap, M., Gabriel, S., Qin, L., Jurafsky, D., Smith, N.A., Choi, Y.:
Social bias frames: Reasoning about social and power implications of language. In: Proceedings of the 58th Annual Meeting of the Association
for Computational Linguistics, pp. 5477–5490. Association for Computational Linguistics, Online (2020). https://doi.org/10.18653/v1/2020.acl-main.486.
https://aclanthology.org/2020.acl-main.486

In [None]:
# Fetch the SBIC v2 archive and store it in the dataset folder on the drive.
urllib.request.urlretrieve(
    url='https://maartensap.com/social-bias-frames/SBIC.v2.tgz',
    filename='/content/drive/MyDrive/datasets/SBIC.v2.tgz',
)

In [None]:
# Uncompress the downloaded .tgz archive into its own folder. Based on
# https://www.geeksforgeeks.org/how-to-uncompress-a-tar-gz-file-using-python/
# The `with` block closes the archive even when extraction fails part-way,
# which the manual open()/close() pair did not guarantee.
with tarfile.open('/content/drive/MyDrive/datasets/SBIC.v2.tgz') as file:
    if hasattr(tarfile, 'data_filter'):
        # The archive was downloaded from the network, so apply the 'data'
        # extraction filter where this Python version provides it.
        file.extractall('/content/drive/MyDrive/datasets/SBIC.v2',
                        filter='data')
    else:
        # Older interpreters have no `filter` argument; extract as before.
        file.extractall('/content/drive/MyDrive/datasets/SBIC.v2')

# Adapted from the code in the `README` file from `SBIC.v2.tgz`

In [None]:
# Load the per-annotation test split and the aggregated test split that ship
# in the extracted archive.
df = pd.read_csv('/content/drive/MyDrive/datasets/SBIC.v2/SBIC.v2.tst.csv')
df_agg = pd.read_csv(
    '/content/drive/MyDrive/datasets/SBIC.v2/SBIC.v2.agg.tst.csv')

# Questions 1a, 1b, 2 and 3a are summarised per post by their modal response
# (missing values ignored), whereas the authors' code uses np.mean.
classFields = ['whoTarget', 'intentYN', 'sexYN', 'offensiveYN']
aggDict = dict.fromkeys(
    classFields, lambda x: stats.mode(x, nan_policy='omit').mode)

# One row per post, with each class field renamed to '<field>_mode'.
gDf = df.groupby("post", as_index=False).agg(aggDict)
gDf = gDf.rename(columns={c: c + '_mode' for c in classFields})
gDf_subset = gDf[['post'] + [c + '_mode' for c in classFields]]

# Attach the modal columns to the aggregated frame; the left join keeps every
# row of df_agg.
df_comb = df_agg.merge(gDf_subset, how='left', on='post')

# Save aggregated file
df_comb.to_csv('/content/drive/MyDrive/datasets/SBIC.v2.agg.test.csv',
               index=False)

In [None]:
# Keep the raw test split: lift it out of the extracted folder first, because
# that folder is deleted right afterwards.
shutil.move(
    src='/content/drive/MyDrive/datasets/SBIC.v2/SBIC.v2.tst.csv',
    dst='/content/drive/MyDrive/datasets/SBIC.v2.tst.csv',
)
# Discard the rest of the extracted files and the downloaded archive.
shutil.rmtree(path='/content/drive/MyDrive/datasets/SBIC.v2')
os.remove('/content/drive/MyDrive/datasets/SBIC.v2.tgz')