# Import libraries

In [None]:
!pip install gdown

In [None]:
import os
import gdown
import pandas as pd
from zipfile import ZipFile

# Mount drive

In [None]:
# Mount Google Drive at /content/drive; the dataset directory created in the
# next cell lives under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm.
from google.colab import drive
drive.mount('/content/drive')

# Create directory to store data

In [None]:
# Make sure the dataset folder exists on the mounted Drive.
# exist_ok=True keeps this a no-op when the folder is already there.
dataset_dir = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics'
os.makedirs(dataset_dir, exist_ok=True)

# Download the **CONSTRAINT-2022 shared task (US Politics)** dataset

Shivam Sharma, Tharun Suresh, Atharva Kulkarni, Himanshi Mathur, Preslav Nakov, Md. Shad Akhtar, and Tanmoy Chakraborty. 2022. Findings of the CONSTRAINT 2022 Shared Task on Detecting the Hero, the Villain, and the Victim in Memes. In *Proceedings of the Workshop on Combating Online Hostile Posts in Regional Languages during Emergency Situations*, pages 1–11, Dublin, Ireland. Association for Computational Linguistics.

In [None]:
# Download the (large) dataset archive from Google Drive with gdown:
# https://github.com/wkentaro/gdown
url = 'https://drive.google.com/uc?id=15YcYH_doDyeoHtw1UvVEOzndFXF_gly5'
output = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/constraint22_dataset_uspolitics.zip'
downloaded = gdown.download(url, output, quiet=False)
# Depending on the gdown version, a failed download is reported by returning
# None rather than by raising. Stop here with a clear error instead of letting
# the unzip cell fail later on a missing archive.
if downloaded is None:
    raise RuntimeError(f'Download failed: {url}')
downloaded  # keep the saved path as the cell output, as before

In [None]:
# Extract every member of the downloaded archive into the dataset folder.
# Approach taken from https://www.geeksforgeeks.org/unzipping-files-in-python/
zip_path = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/constraint22_dataset_uspolitics.zip'
extract_dir = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/'
with ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [None]:
# Delete the archive once its contents have been extracted.
zip_path = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/constraint22_dataset_uspolitics.zip'
try:
    os.remove(zip_path)
except FileNotFoundError:
    # The archive is already gone (e.g. this cell was run before); the goal of
    # the cell is met, so re-running it must not raise.
    pass

# Load **CONSTRAINT-2022 shared task (train set for US Politics)**

In [None]:
# Read the HVV training annotations. lines=True tells pandas the file is
# JSON Lines, i.e. one JSON record per line.
train_jsonl = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/annotations_HVV/train.jsonl'
train = pd.read_json(train_jsonl, lines=True)

# Reshape the dataframe

In [None]:
# Convert the wide annotation table (one column per role) into a long table
# with one row per unique (OCR, image, role, entity) combination.
#
# Melt first and explode the single resulting `entity` column afterwards.
# Exploding the four role columns one after another before melting builds,
# for every meme, the cartesian product of its role values, only for
# drop_duplicates() to throw the surplus rows away again. Both orders yield
# the same rows in the same order; this one keeps the intermediate frame linear
# in the number of annotated entities.
role_columns = ['hero', 'villain', 'victim', 'other']
train = (
    pd.melt(train, id_vars=['OCR', 'image'], value_vars=role_columns,
            var_name='role', value_name='entity')
    .explode('entity')   # one row per listed entity; an empty list becomes NaN
    .dropna()            # drop rows with a missing value in any column (e.g. roles with no entity)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Prefix image names with image directories

In [None]:
# Turn the values of the `image` column into paths inside the images folder.
# Values that already start with the folder prefix are left unchanged, so
# re-running this cell does not prepend the directory a second time.
images_dir = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/images/'
image_names = train['image'].astype(str)
has_prefix = image_names.str.startswith(images_dir)
train['image'] = image_names.where(has_prefix, images_dir + image_names)

# Save dataset in CSV format

Because the unseen test split does not come with gold labels, we use the original *train* split as our test dataset; this is why the output file name ends in `_test`.

In [None]:
# Write the reshaped table to CSV on Drive; index=False leaves the row index out of the file.
test_csv = '/content/drive/MyDrive/stance_detection_datasets/constraint22_dataset_uspolitics/constraint22_dataset_uspolitics_test.csv'
train.to_csv(test_csv, index=False)