# Import libraries

In [None]:
!pip install gdown

In [None]:
import os
import gdown
import pandas as pd
import urllib.request
from zipfile import ZipFile

# Mount drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used in the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

# Create directories to store data

In [None]:
# Ensure one folder per dataset exists on Drive; exist_ok=True makes this
# cell safe to run again when the folders are already there.
for dataset_dir in (
    '/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data',
    '/content/drive/MyDrive/stance_detection_datasets/DISARM',
):
    os.makedirs(dataset_dir, exist_ok=True)

# Download **HarMeme Dataset**

Shraman Pramanick, Shivam Sharma, Dimitar Dimitrov, Md. Shad Akhtar, Preslav Nakov, and Tanmoy Chakraborty. 2021. MOMENTA: A Multimodal Framework for Detecting Harmful Memes and Their Targets. In *Findings of the Association for Computational Linguistics: EMNLP 2021*, pages 4439–4455, Punta Cana, Dominican Republic. Association for Computational Linguistics.

In [None]:
# Download the large HarMeme archive from Google Drive with gdown:
# https://github.com/wkentaro/gdown
url = 'https://drive.google.com/uc?id=1fw850yxKNqzpRpQKH88D13yfrwX1MLde'
output = '/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data/Harmeme_HarmP_Data.zip'
downloaded_path = gdown.download(url, output, quiet=False)
# Some gdown versions report a failed download (e.g. quota exceeded or access
# denied) by returning None instead of raising. Stop here with a clear message
# rather than letting the unzip cell fail on a missing file.
if downloaded_path is None:
    raise RuntimeError(f'Failed to download {url} to {output}')
# Keep the returned path as the cell's displayed value, as before.
downloaded_path

In [None]:
# Extract every member of the downloaded archive into the HarMeme folder. via
# https://www.geeksforgeeks.org/unzipping-files-in-python/
with ZipFile(
    '/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data/Harmeme_HarmP_Data.zip',
    mode='r',
) as harmeme_archive:
    harmeme_archive.extractall(
        path='/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data/')

In [None]:
# Delete the downloaded zip archive from Drive; its contents were extracted
# in the previous cell, so only the unpacked files are kept.
os.remove('/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data/Harmeme_HarmP_Data.zip')

# Download **DISARM Dataset**

Shivam Sharma, Md Shad Akhtar, Preslav Nakov, and Tanmoy Chakraborty. 2022. DISARM: Detecting the Victims Targeted by Harmful Memes. In *Findings of the Association for Computational Linguistics: NAACL 2022*, pages 1572–1588, Seattle, United States. Association for Computational Linguistics.

In [None]:
# Fetch the DISARM test split from the authors' GitHub repository and store
# it in the DISARM folder on Drive.
urllib.request.urlretrieve(
    url='https://raw.githubusercontent.com/shiv6891/DISARM/main/DISARM_Dataset/Test/test_all.jsonl',
    filename='/content/drive/MyDrive/stance_detection_datasets/DISARM/test_all.jsonl',
)

# Load **DISARM Dataset**

In [None]:
# Read the downloaded JSON Lines file (lines=True: one JSON record per line)
# into a DataFrame.
test_all = pd.read_json(
    path_or_buf='/content/drive/MyDrive/stance_detection_datasets/DISARM/test_all.jsonl',
    lines=True,
)

# Split *text* column

In [None]:
# Split column of lists. via
# https://stackoverflow.com/a/50879945
# Each `text` entry is expected to be a two-element list; spread it into the
# `extracted_text` and `target` columns.
text_parts = pd.DataFrame(
    test_all['text'].to_list(),
    columns=['extracted_text', 'target'],
    index=test_all.index,  # keep rows aligned even if the index is not 0..n-1
)
# Drop any copies of these columns left by a previous run of this cell, so
# re-running it does not append duplicate column names.
test_all = pd.concat(
    [test_all.drop(columns=text_parts.columns, errors='ignore'), text_parts],
    axis=1,
)

# Unlist *labels* column

In [None]:
# Replace each list in `labels` with its first element.
# The isinstance guard keeps the cell safe to re-run: once a value has been
# unlisted it is a plain string, and x[0] would silently cut it down to its
# first character.
test_all['labels'] = test_all['labels'].apply(
    lambda x: x[0] if isinstance(x, (list, tuple)) else x)

# Prefix image names with the image directory

In [None]:
# Turn bare image file names into full paths inside the extracted HarMeme
# image folder. Values that already start with the folder path are left
# untouched, so re-running this cell does not prepend the prefix twice.
image_dir = '/content/drive/MyDrive/stance_detection_datasets/Harmeme_HarmP_Data/data/datasets/memes/defaults/images/'
test_all['image'] = test_all['image'].astype(str).apply(
    lambda name: name if name.startswith(image_dir) else image_dir + name)

# Save dataset in CSV format

In [None]:
# Write the processed DISARM test split to Drive as CSV; index=False leaves
# the DataFrame index out of the file.
test_all.to_csv(
    path_or_buf='/content/drive/MyDrive/stance_detection_datasets/DISARM/DISARM_test_all.csv',
    index=False,
)