# Pull and create datasets

In [28]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/Feger/limited-generalizability

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Feger/limited-generalizability


In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import re
import os
import json
import spacy
import gzip
import pickle
import warnings
import pandas as pd
from tqdm.notebook import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

nlp = spacy.load('en_core_web_lg')

ARGUMENT = "Argument"
NOARGUMENT = "No-Argument"

DSID_COL = "dataset_id"
TOPIC_COL = "topic"
SENTENCE_COL = "sentence"
LABEL_COL = "label"
DATASET_COL = "dataset"
DOMAIN_COL = "domain"



## Helper Functions

In [14]:
def show_information(df):
    assert all(df[DSID_COL] == df[DSID_COL].unique())
    print(f"{df[DATASET_COL].unique()[0]} size:", df.shape[0])
    print("Topics:", len(df[TOPIC_COL].unique()))

In [15]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_json_data(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

def extract_sentences_spacy(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences

def parse_annotations(annotation_lines):
    parsed_annotations = []
    for line in annotation_lines:
        parts = line.strip().split('\t')
        if len(parts) >= 3:
            tag_id = parts[0]
            tag_info = parts[1]
            tag_text = parts[2]
            tag_type, tag_start, tag_end = tag_info.split()
            parsed_annotations.append({
                'id': tag_id,
                'type': tag_type,
                'start': int(tag_start),
                'end': int(tag_end),
                'text': tag_text
            })
    return parsed_annotations

def match_annotations_to_sentences(text_content, annotations):
    sentences = extract_sentences_spacy(text_content)

    sentence_annotations = []
    for sentence in sentences:
        sentence_start = text_content.index(sentence)
        sentence_end = sentence_start + len(sentence)
        labels = [
            annotation for annotation in annotations
            if annotation['start'] >= sentence_start and annotation['end'] <= sentence_end
        ]
        sentence_annotations.append({
            'sentence': sentence,
            'labels': labels
        })
    return sentence_annotations

def prepare_output(sentence_annotations):
    output_data = []
    for entry in sentence_annotations:
        sentence = entry['sentence']
        labels = entry['labels']
        output_data.append({
            SENTENCE_COL: sentence.strip(),
            LABEL_COL: set([label['type'] for label in labels])
        })
    return output_data

def generate_label_positions(parsed_data):
    label_positions = []
    combined_text = ""
    current_position = 0

    for entry in parsed_data:
        label, sentence = entry['type'], entry['text']
        label_positions.append({
            'type': label,
            'start': current_position,
            'end': current_position + len(sentence),
            'text': sentence
        })
        combined_text += sentence + " "
        current_position += len(sentence) + 1

    return combined_text.strip(), label_positions

def parse_labeled_file(file_path):
    parsed_data = []
    content = read_file(file_path).splitlines()
    for line in content:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            label, sentence = parts
            parsed_data.append({
                'type': label,
                'text': sentence
            })
    return parsed_data

# for .ann files that contian the entire text
def read_and_merge_annotations(file_path):
    merged_sentences = []
    labels = []
    current_sentence = ""
    current_labels = set()
    ending_symbols = ".?!"  # Define proper sentence-ending symbols
    parsed_lines = []

    # Step 1: Read and parse lines
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split(maxsplit=4)
            if len(parts) < 5:
                continue
            tag_id, tag_type, tag_start, tag_end, tag_text = parts
            parsed_lines.append({
                'id': tag_id,
                'type': tag_type,
                'start': int(tag_start),
                'end': int(tag_end),
                'text': tag_text.strip()
            })

    # Step 2: Sort lines by start and end tags
    parsed_lines.sort(key=lambda x: (x['start'], x['end']))

    # Step 3: Apply merging rules
    for entry in parsed_lines:
        tag_text = entry['text']
        tag_type = entry['type']

        if current_sentence:
            next_starts_lower = tag_text[0].islower()
            next_starts_with_number = tag_text[0].isdigit()
            current_ends_semicolon = current_sentence[-1] == ";"
            current_not_ended = current_sentence[-1] not in ending_symbols

            # Concatenate if the current sentence ends with ; or is not properly ended and next starts with a number
            if current_ends_semicolon or (current_not_ended and (next_starts_lower or next_starts_with_number)):
                current_sentence += " " + tag_text
                current_labels.add(tag_type)
                continue

            # Add current completed sentence and its labels
            merged_sentences.append(current_sentence)
            labels.append(current_labels)  # Store labels as a set
            current_sentence = ""
            current_labels = set()

        # Start a new sentence
        current_sentence = tag_text
        current_labels.add(tag_type)

    # Append the final sentence and its labels
    if current_sentence:
        merged_sentences.append(current_sentence)
        labels.append(current_labels)

    # Create a DataFrame
    df = pd.DataFrame({
        SENTENCE_COL: merged_sentences,
        LABEL_COL: labels
    })
    return df

def split_into_sentences_custom(text):
    # Sometimes spacy is to slow for big documents.
    # Regex pattern to split on periods followed by a space or newlines, handling bullet points too
    sentence_endings = r'(?<=\.)\s+|\n+|\!|\?|\;'
    # Use regex to split based on the custom rule
    sentences = re.split(sentence_endings, text)
    # Clean sentences (strip extra spaces and remove empty ones)
    return [sentence.strip() for sentence in sentences]

def extract_and_label_sentences(data):
    sentences_with_labels = []
    argument_sentences = set()
    for document in tqdm(data):
        name = document['name']
        text = document['text']
        clauses = document['clauses']
        arguments = document['arguments']
        clause_id_to_span = {clause["_id"]: (clause["start"], clause["end"]) for clause in clauses}
        all_sentences = extract_sentences_spacy(text)
        for argument in arguments:
            conclusion_id = argument['conclusion']
            conclusion_start, conclusion_end = clause_id_to_span[conclusion_id]
            conclusion_sentence = text[conclusion_start:conclusion_end].strip()

            for sentence in all_sentences:
                if conclusion_sentence.lower() in sentence.strip().lower():
                    sentences_with_labels.append({DSID_COL: name, SENTENCE_COL: sentence.strip(), LABEL_COL: True})
                    argument_sentences.add(sentence.strip())

        for sentence in all_sentences:
            if sentence.strip() not in argument_sentences:
                sentences_with_labels.append({DSID_COL: name, SENTENCE_COL: sentence.strip(), LABEL_COL: False})

    return sentences_with_labels

## Create data folder

In [None]:
! mkdir -p ./data

## Create ASC

In [25]:
! wget https://github.com/RobinSchaefer/tweet-stance-classification/archive/refs/heads/master.zip
! mkdir -p ./data/ASC
! unzip -o master.zip -d ./data/ASC
! rm -r master.zip
! mv -f ./data/ASC/tweet-stance-classification-master/tsc/data/atheism_stance_corpus.json ./data/ASC
! rm -r ./data/ASC/tweet-stance-classification-master/

--2025-02-05 09:29:02--  https://github.com/RobinSchaefer/tweet-stance-classification/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/RobinSchaefer/tweet-stance-classification/zip/refs/heads/master [following]
--2025-02-05 09:29:02--  https://codeload.github.com/RobinSchaefer/tweet-stance-classification/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.112.10
Connecting to codeload.github.com (codeload.github.com)|140.82.112.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [ <=>                ] 352.49K  --.-KB/s    in 0.1s    

2025-02-05 09:29:02 (2.80 MB/s) - ‘master.zip’ saved [360948]

Archive:  master.zip
55180ce3de318fd1d19b3a5b71fb5dd8b4f446ab
   creati

In [26]:
path = './data/ASC/'
final = 'asc.csv'
class2label = {'none': NOARGUMENT, 'against': NOARGUMENT, 'favor':  ARGUMENT}

asc = read_json_data("./data/ASC/atheism_stance_corpus.json")

data = []
for doc in asc:
    id = "ASC_" + str(doc["id"])
    sentence = doc["text"]
    label = doc["debateStancePolarity"]
    data.append((id, sentence, label))

df_asc = pd.DataFrame(data, columns=[DSID_COL, SENTENCE_COL, LABEL_COL])
assert df_asc.shape[0] == 715

df_asc[DATASET_COL] = "ASC"
df_asc[DOMAIN_COL] = "twitter.com"
df_asc[TOPIC_COL] = "random"
df_asc[LABEL_COL] = df_asc[LABEL_COL].replace(class2label)
df_asc = df_asc.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_asc = df_asc[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_asc)
df_asc.to_csv(path + final, index=False)

ASC size: 715
Topics: 1


In [27]:
!rm ./data/ASC/atheism_stance_corpus.json

## Create SDAT

In [None]:
! wget https://github.com/danielhers/sustainable-diet-arguments-twitter/archive/refs/heads/main.zip
! mkdir -p ./data/SDAT
! unzip -o main.zip -d ./data/SDAT
! mv -f ./data/SDAT/sustainable-diet-arguments-twitter-main/* ./data/SDAT
! rm -r main.zip
! rm -r ./data/SDAT/sustainable-diet-arguments-twitter-main

--2024-12-30 16:47:48--  https://github.com/danielhers/sustainable-diet-arguments-twitter/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/danielhers/sustainable-diet-arguments-twitter/zip/refs/heads/main [following]
--2024-12-30 16:47:48--  https://codeload.github.com/danielhers/sustainable-diet-arguments-twitter/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.112.9
Connecting to codeload.github.com (codeload.github.com)|140.82.112.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [   <=>              ]   2.73M  5.26MB/s    in 0.5s    

2024-12-30 16:47:49 (5.26 MB/s) - ‘main.zip’ saved [2862226]

Archive:  main.zip
71559e7fcec80c5fc4e83d371a7c5bff78aa15d9
   creatin

In [None]:
path = './data/SDAT/'
final = 'sdat.csv'
class2label = {True: ARGUMENT, False: NOARGUMENT}

df_sdat = pd.read_csv(path + 'annotated-dataset.csv')
df_sdat['id'] = df_sdat['id'].astype('int64')

for group in df_sdat.groupby('id'):
    argumentative = group[1].argumentative.values
    assert(all(argumentative == argumentative[0]))

df_sdat = df_sdat.drop_duplicates(subset='id')
df_sdat['argumentative'] = df_sdat['argumentative'] >= 0.5
df_sdat['argumentative'] = df_sdat['argumentative'].replace(class2label)


df_sdat[DOMAIN_COL] = 'https://twitter.com'
df_sdat = df_sdat[['id', 'domain', 'topic', 'argumentative', 'tweet']]
df_sdat[DATASET_COL] = 'SDAT'

df_sdat.rename(columns={
    "id": DSID_COL,
    "topic": TOPIC_COL,
    "argumentative": LABEL_COL,
    "tweet": SENTENCE_COL
}, inplace=True)

df_sdat = df_sdat[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_sdat)
df_sdat.to_csv(path + final, index=False)

SDAT size: 597
Topics: 1


In [None]:
!find ./data/SDAT -mindepth 1 -not -name 'sdat.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create SCIARK

In [None]:
!git clone https://github.com/afergadis/SciARK.git
!mkdir ./data/SCIARK
!mv ./SciARK/dataset/SciARK.json ./data/SCIARK
!rm -r ./SciARK

Cloning into 'SciARK'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 98 (delta 38), reused 95 (delta 36), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 4.17 MiB | 11.27 MiB/s, done.
Resolving deltas: 100% (38/38), done.
Updating files: 100% (64/64), done.
mkdir: cannot create directory ‘./data/SCIARK’: File exists


In [None]:
path = './data/SCIARK/'
final = 'sciark.csv'
class2label = {None: NOARGUMENT, 'Evidence': NOARGUMENT, 'Claim':  ARGUMENT}

sciark = read_json_data("./data/SCIARK/SciARK.json")
assert len(set(sciark.keys())) == 1000

data = []
for key in tqdm(sciark.keys()):
    doc = sciark[key]
    sentences = doc["sentences"]
    labels = doc["labels"]
    assert len(labels) == len(sentences)
    for line, (sentence, label) in list(enumerate(zip(sentences, labels))):
        data.append((key.split('.txt')[0] + "_" + str(line), sentence, label))

df_sciark = pd.DataFrame(data, columns=[DSID_COL, SENTENCE_COL, LABEL_COL])
assert df_sciark.shape[0] == 12374

df_sciark[DATASET_COL] = "SCIARK"
df_sciark[DOMAIN_COL] = "un.org"
df_sciark[TOPIC_COL] = "random"
df_sciark[LABEL_COL] = df_sciark[LABEL_COL].replace(class2label)
df_sciark = df_sciark.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_sciark = df_sciark[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_sciark)
df_sciark.to_csv(path + final, index=False)

  0%|          | 0/1000 [00:00<?, ?it/s]

SCIARK size: 11695
Topics: 1


In [None]:
!find ./data/SCIARK -mindepth 1 -not -name 'sciark.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create AFS

In [None]:
!wget http://nldslab.soe.ucsc.edu/afs16/Sigdial_16_release_data.zip
!mkdir ./data/AFS
!unzip Sigdial_16_release_data.zip -d ./data/AFS
!mv ./data/AFS/server_data/ ./data/AFS/data
!rm -r ./data/AFS/__MACOSX/
!rm -r Sigdial_16_release_data.zip

--2024-12-29 15:47:37--  http://nldslab.soe.ucsc.edu/afs16/Sigdial_16_release_data.zip
Resolving nldslab.soe.ucsc.edu (nldslab.soe.ucsc.edu)... 128.114.48.41
Connecting to nldslab.soe.ucsc.edu (nldslab.soe.ucsc.edu)|128.114.48.41|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 775821 (758K) [application/zip]
Saving to: ‘Sigdial_16_release_data.zip’


2024-12-29 15:47:38 (1.83 MB/s) - ‘Sigdial_16_release_data.zip’ saved [775821/775821]

mkdir: cannot create directory ‘./data/AFS’: File exists
Archive:  Sigdial_16_release_data.zip
   creating: ./data/AFS/server_data/
  inflating: ./data/AFS/server_data/ArgPairs_DP.csv  
   creating: ./data/AFS/__MACOSX/
   creating: ./data/AFS/__MACOSX/server_data/
  inflating: ./data/AFS/__MACOSX/server_data/._ArgPairs_DP.csv  
  inflating: ./data/AFS/server_data/ArgPairs_GC.csv  
  inflating: ./data/AFS/__MACOSX/server_data/._ArgPairs_GC.csv  
  inflating: ./data/AFS/server_data/ArgPairs_GM.csv  
  inflating: ./data/AFS/__MACOS

In [None]:
path = './data/AFS/'
final = 'afs.csv'
class2label = {True: ARGUMENT, False: NOARGUMENT}

df_q = pd.read_csv(path + 'data/ArgQuality_GC.csv', encoding='ISO-8859-1')
df_q[LABEL_COL] = df_q.apply(lambda row: True if row['Yes_Count'] > 3  else False, axis=1)
df_q[TOPIC_COL] = "Gun Control"
df_q[DSID_COL] = df_q[TOPIC_COL] + "_" + df_q["sentenceId"].astype(str) + "_" + df_q.index.astype(str)

df_d = pd.read_csv(path + 'data/ArgQuality_DP.csv', encoding='ISO-8859-1')
df_d[LABEL_COL] = df_d.apply(lambda row: True if row['scale3_Count'] > row['scale2_1_Count']  else False, axis=1)
df_d[TOPIC_COL] = "Death Penalty"
df_d[DSID_COL] = df_d[TOPIC_COL] + "_" + df_d["sentenceId"].astype(str) + "_" + df_d.index.astype(str)

df_g = pd.read_csv(path + 'data/ArgQuality_GM.csv', encoding='ISO-8859-1')
df_g[LABEL_COL] = df_g.apply(lambda row: True if row['Yes_Count'] > row['No_Count']  else False, axis=1)
df_g[TOPIC_COL] = "Gay Marriage"
df_g[DSID_COL] = df_g[TOPIC_COL] + "_" + df_g["sentenceId"].astype(str) + "_" + df_g.index.astype(str)

df_afs = pd.concat([df_q, df_d, df_g])

df_afs[DATASET_COL] = "AFS"
df_afs[DOMAIN_COL] = "procon.org + idebate.org"
df_afs[LABEL_COL] = df_afs[LABEL_COL].replace(class2label)
df_afs = df_afs.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_afs = df_afs[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_afs)

df_afs.to_csv(path + final, index=False)

AFS size: 6186
Topics: 3


In [None]:
!find ./data/AFS -mindepth 1 -not -name 'afs.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create USELEC

In [None]:
!git clone https://github.com/ElecDeb60To16/Dataset
!mkdir ./data/USELEC
!mv ./Dataset/sentence_db_candidate.csv ./data/USELEC
!rm -r ./Dataset

Cloning into 'Dataset'...
remote: Enumerating objects: 10, done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 10 (from 1)[K
Receiving objects: 100% (10/10), 4.26 MiB | 10.58 MiB/s, done.
Resolving deltas: 100% (2/2), done.
mkdir: cannot create directory ‘./data/USELEC’: File exists


In [None]:
path = './data/USELEC/'
final = 'uselec.csv'
class2label = {True: ARGUMENT, False: NOARGUMENT}


df_uselec = pd.read_csv(path + "sentence_db_candidate.csv")
df_uselec['Tag'] = df_uselec['Tag'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df_uselec[LABEL_COL] = df_uselec['Tag'].apply(lambda x: 'Claim' in x if isinstance(x, dict) else False)

df_uselec[DATASET_COL] = "USELEC"
df_uselec[DOMAIN_COL] = "debates.org"
df_uselec[LABEL_COL] = df_uselec[LABEL_COL].replace(class2label)
df_uselec[DSID_COL] = df_uselec["Document"] + "_" + df_uselec["Sentence"].astype(str) + "_" + df_uselec.index.astype(str)
df_uselec[TOPIC_COL] = df_uselec["Speaker"]
df_uselec[SENTENCE_COL] = df_uselec["Text"]

df_uselec = df_uselec.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_uselec = df_uselec[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_uselec)

df_uselec.to_csv(path + final, index=False)

USELEC size: 29093
Topics: 29


In [None]:
!find ./data/USELEC -mindepth 1 -not -name 'uselec.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create ECHR

In [None]:
!wget http://www.di.uevora.pt/~pq/echr/echr_corpus.zip
!mkdir ./data/ECHR
!unzip ./echr_corpus.zip -d ./data/ECHR
!mv ./data/ECHR/echr_corpus/ ./data/ECHR/data
!rm -r ./data/ECHR/__MACOSX
!rm -r echr_corpus.zip

--2024-12-29 15:41:06--  http://www.di.uevora.pt/~pq/echr/echr_corpus.zip
Resolving www.di.uevora.pt (www.di.uevora.pt)... 193.137.179.82
Connecting to www.di.uevora.pt (www.di.uevora.pt)|193.137.179.82|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 749759 (732K) [application/zip]
Saving to: ‘echr_corpus.zip’


2024-12-29 15:41:08 (483 KB/s) - ‘echr_corpus.zip’ saved [749759/749759]

mkdir: cannot create directory ‘./data/ECHR’: File exists
Archive:  ./echr_corpus.zip
   creating: ./data/ECHR/echr_corpus/
  inflating: ./data/ECHR/echr_corpus/.DS_Store  
   creating: ./data/ECHR/__MACOSX/
   creating: ./data/ECHR/__MACOSX/echr_corpus/
  inflating: ./data/ECHR/__MACOSX/echr_corpus/._.DS_Store  
  inflating: ./data/ECHR/echr_corpus/ECHR_Corpus.json  
  inflating: ./data/ECHR/__MACOSX/echr_corpus/._ECHR_Corpus.json  


In [None]:
path = './data/ECHR/'
final = 'echr.csv'
class2label = {True: ARGUMENT, False: NOARGUMENT}

file_path = path + 'data/ECHR_Corpus.json'
data = read_json_data(file_path)
sentences_with_labels = extract_and_label_sentences(data)

df_echr = pd.DataFrame(sentences_with_labels, columns=[DSID_COL, SENTENCE_COL, LABEL_COL])
df_echr[DATASET_COL] = "ECHR"
df_echr[DOMAIN_COL] = "hudoc.echr.coe.int"
df_echr[LABEL_COL] = df_echr[LABEL_COL].replace(class2label)
df_echr[DSID_COL] = df_echr[DSID_COL].str.split(".txt").str[0] + "_" + df_echr.index.astype(str)
df_echr[TOPIC_COL] = "random"

df_echr = df_echr.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_echr = df_echr[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_echr)

df_echr.to_csv(path + final, index=False)

  0%|          | 0/42 [00:00<?, ?it/s]

ECHR size: 10678
Topics: 1


In [None]:
!find ./data/ECHR -mindepth 1 -not -name 'echr.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create CE

In [None]:
!wget https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_\(R\)_CE-EMNLP-2015.v3.zip
!mkdir ./data/CE/data
!unzip IBM_Debater_\(R\)_CE-EMNLP-2015.v3.zip -d ./data/CE
!mv ./data/CE/IBM_Debater_\(R\)_CE-EMNLP-2015.v3/claims.txt ./data/CE/data/
!mv ./data/CE/IBM_Debater_\(R\)_CE-EMNLP-2015.v3/articles.txt ./data/CE/data/
!unzip ./data/CE/IBM_Debater_\(R\)_CE-EMNLP-2015.v3/articles.zip -d ./data/CE/data
!rm -r ./data/CE/IBM_Debater_\(R\)_CE-EMNLP-2015.v3
!rm -r ./IBM_Debater_\(R\)_CE-EMNLP-2015.v3.zip

--2024-12-29 15:27:16--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_CE-EMNLP-2015.v3.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 13.32.230.22, 13.32.230.29, 13.32.230.125, ...
Connecting to www.research.ibm.com (www.research.ibm.com)|13.32.230.22|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_CE-EMNLP-2015.v3.zip [following]
--2024-12-29 15:27:16--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_CE-EMNLP-2015.v3.zip
Resolving research.ibm.com (research.ibm.com)... 108.138.106.127, 108.138.106.96, 108.138.106.72, ...
Connecting to research.ibm.com (research.ibm.com)|108.138.106.127|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9058276 (8.6M) [application/zip]
Saving to: ‘IBM_Debater_(R)_CE-EMNLP-2015.v3.zip’


2024-12-29 15:27:19 (4.35 MB/s) - ‘IBM_Debater_(R)_CE-EMNLP-2015.v3.zip’ saved [9058276/9058

In [None]:
path = './data/CE/'
final = 'ce.csv'
class2label = {True: ARGUMENT, False: NOARGUMENT}

df_articles = pd.read_csv("./data/CE/data/articles.txt", sep="\t")

ids = sorted(set([int(f.split(".txt")[0].split("_")[-1]) for f in os.listdir(path + "data/articles")]))
df_claims = pd.read_csv("./data/CE/data/claims.txt", sep="\t")

assert df_articles["article Id"].isin(ids).all()
assert df_articles["Topic"].isin(df_claims["Topic"]).all()
assert df_claims["Topic"].isin(df_articles["Topic"]).all()

ids = [id for id in ids if id in df_articles["article Id"].values]

df_ce = pd.DataFrame()

# Initialize a list to store sentences
sentences = []
# Loop through all files in the specified directory
for id in tqdm(ids):
    file_path = path + f"data/articles/clean_{id}.txt"
    text = read_file(file_path)
    # Split the text into sentences and add file ID
    for sentence in split_into_sentences_custom(text):
        sentences.append((sentence, id))
# Read claims file and split it into a list of claims (with each claim as a list of strings)
claims = df_claims.values
annotated_data = []
# Loop through the sentences and claims
for sentence, file_id in tqdm(sentences):
    match_found = False
    for claim in claims:
        if claim[1].lower() in sentence.lower():
            match_found = True
            break  # Exit the loop as we found a match, no need to check further claims for this sentence
    annotated_data.append((file_id, sentence, match_found, file_id))

df_ce = pd.DataFrame(annotated_data, columns=[TOPIC_COL, SENTENCE_COL, LABEL_COL, DSID_COL])
df_ce[DATASET_COL] = "CE"
df_ce[DOMAIN_COL] = "wikipedia.org"
df_ce[LABEL_COL] = df_ce[LABEL_COL].replace(class2label)
df_ce[DSID_COL] = df_ce[DSID_COL].astype(str) + "_" + df_ce.index.astype(str)
df_ce[TOPIC_COL] = df_ce[TOPIC_COL].map(df_articles.set_index("article Id")["Topic"])

df_ce = df_ce.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_ce = df_ce[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_ce)

df_ce.to_csv(path + final, index=False)

  0%|          | 0/547 [00:00<?, ?it/s]

  0%|          | 0/97324 [00:00<?, ?it/s]

CE size: 86964
Topics: 58


In [None]:
!find ./data/CE -mindepth 1 -not -name 'ce.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create ARGUMINSCI

In [None]:
!wget http://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip
!mkdir ./data/ARGUMINSCI
!unzip compiled_corpus.zip -d ./data/ARGUMINSCI
!mv ./data/ARGUMINSCI/compiled_corpus/ ./data/ARGUMINSCI/data
!rm -r ./compiled_corpus.zip

--2024-12-29 15:24:04--  http://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip
Resolving data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)... 134.155.95.56
Connecting to data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)|134.155.95.56|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip [following]
--2024-12-29 15:24:05--  https://data.dws.informatik.uni-mannheim.de/sci-arg/compiled_corpus.zip
Connecting to data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)|134.155.95.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1129621 (1.1M) [application/zip]
Saving to: ‘compiled_corpus.zip’


2024-12-29 15:24:06 (2.06 MB/s) - ‘compiled_corpus.zip’ saved [1129621/1129621]

mkdir: cannot create directory ‘./data/ARGUMINSCI’: File exists
Archive:  compiled_corpus.zip
   creating

In [None]:
path = './data/ARGUMINSCI/'
final = 'arguminsci.csv'

df_arguminsci = pd.DataFrame()

for f in tqdm(set([f.split(".")[0] for f in os.listdir(path + "data")])):
    # Load and process files
    txt_path = path + f"data/{f}.txt"
    ann_path = path + f"data/{f}.ann"
    text_content = read_file(txt_path)
    annotation_lines = read_file(ann_path).splitlines()
    annotation_lines = [line for line in annotation_lines if line.startswith('T')] # Keep text labels
    annotation_lines = [line for line in annotation_lines if not ";" in line.split("\t")[1]] # remove lines with error

    parsed_annotations = parse_annotations(annotation_lines)
    # Match annotations to sentences
    sentence_annotations = match_annotations_to_sentences(text_content, parsed_annotations)
    # Prepare data for display
    output_data = prepare_output(sentence_annotations)
    # Create a DataFrame
    output_df = pd.DataFrame(output_data)
    output_df[DSID_COL] = output_df.apply(lambda row: f + "_" + str(row.name), axis=1)

    df_arguminsci = pd.concat([df_arguminsci, output_df])

df_arguminsci[LABEL_COL] = df_arguminsci[LABEL_COL].apply(lambda row: any("claim" in label.lower() for label in row))
df_arguminsci[LABEL_COL] = df_arguminsci[LABEL_COL].replace({True: ARGUMENT, False: NOARGUMENT})

df_arguminsci[DATASET_COL] = "ARGUMINSCI"
df_arguminsci[DOMAIN_COL] = "Dr. Inventor"
df_arguminsci[TOPIC_COL] = "random"

df_arguminsci = df_arguminsci.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_arguminsci = df_arguminsci[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_arguminsci)

df_arguminsci.to_csv(path + final, index=False)

  0%|          | 0/40 [00:00<?, ?it/s]

ARGUMINSCI size: 16102
Topics: 1


In [None]:
!find ./data/ARGUMINSCI -mindepth 1 -not -name 'arguminsci.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create CMV

In [None]:
!git clone https://github.com/chridey/change-my-view-modes.git
!mkdir ./data/CMV/
!mv change-my-view-modes/v2.0/negative ./data/CMV
!mv change-my-view-modes/v2.0/positive ./data/CMV
!rm -r ./change-my-view-modes

Cloning into 'change-my-view-modes'...
remote: Enumerating objects: 1482, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 1482 (delta 3), reused 3 (delta 0), pack-reused 1458 (from 1)[K
Receiving objects: 100% (1482/1482), 1.27 MiB | 6.87 MiB/s, done.
Resolving deltas: 100% (919/919), done.
Updating files: 100% (739/739), done.
mkdir: cannot create directory ‘./data/CMV/’: File exists


In [None]:
path = './data/CMV/'
final = 'cmv.csv'

df_cmv = pd.DataFrame()

for d in tqdm(["positive", "negative"]):
    for f in tqdm(set([f.split(".")[0] for f in os.listdir(path + d)])):
        # Load and process files
        txt_path = path + f"{d}/{f}.txt"
        ann_path = path + f"{d}/{f}.ann"
        text_content = read_file(txt_path)
        annotation_lines = read_file(ann_path).splitlines()
        # Parse annotations
        parsed_annotations = parse_annotations(annotation_lines)
        # Match annotations to sentences
        sentence_annotations = match_annotations_to_sentences(text_content, parsed_annotations)
        # Prepare data for display
        output_data = prepare_output(sentence_annotations)
        # Create a DataFrame
        output_df = pd.DataFrame(output_data)
        output_df[DSID_COL] = d[0] + "_" + f + "_" + output_df.index.astype(str)
        df_cmv = pd.concat([df_cmv, output_df])

df_cmv[LABEL_COL] = df_cmv[LABEL_COL].apply(lambda row: any("claim" in label.lower() for label in row))
df_cmv[LABEL_COL] = df_cmv[LABEL_COL].replace({True: ARGUMENT, False: NOARGUMENT})

df_cmv[TOPIC_COL] = "random"
df_cmv[DATASET_COL] = "CMV"
df_cmv[DOMAIN_COL] = "reddit.com/r/changemyview/"

df_cmv = df_cmv.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_cmv = df_cmv[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_cmv = df_cmv[~df_cmv[DSID_COL].duplicated()]
show_information(df_cmv)

df_cmv.to_csv(path + final, index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

CMV size: 2572
Topics: 1


In [None]:
!find ./data/CMV -mindepth 1 -not -name 'cmv.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create VACC

In [None]:
!git clone https://github.com/cltl/vaccination-corpus.git
!cp vaccination-corpus/code/conll_data.py ./
!mkdir -p ./data/VACC
!mv ./vaccination-corpus/data/annotations-pickle ./data/VACC/
!mv ./data/VACC/annotations-pickle/ ./data/VACC/data/
!rm -r ./vaccination-corpus

Cloning into 'vaccination-corpus'...
remote: Enumerating objects: 3145, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (346/346), done.[K
remote: Total 3145 (delta 29), reused 314 (delta 7), pack-reused 2792 (from 1)[K
Receiving objects: 100% (3145/3145), 116.84 MiB | 13.21 MiB/s, done.
Resolving deltas: 100% (1622/1622), done.
Updating files: 100% (1824/1824), done.


In [None]:
path = "./data/VACC/"
final = "vacc.csv"

class2label = {True: ARGUMENT, False: NOARGUMENT}
df_vacc = pd.DataFrame()

annotations = list()
for pickle_file in tqdm(os.listdir(path + "data/")):
    with gzip.open(path + "data/" + pickle_file, "rb") as infile:
        topic = pickle_file.split("/")[-1].split("_")[0]
        doc = pickle.load(infile)
        claim_sentence_ids = list()
        for claim in doc.claims:
            for sent_id, token_id in claim.sent_token_ids:
                claim_sentence_ids.append(sent_id)
        for sentence in doc.sentences:
            annotations.append((topic, sentence.id, sentence.text, sentence.sent_id in claim_sentence_ids))

df_vacc = pd.DataFrame(annotations, columns=[TOPIC_COL, DSID_COL, SENTENCE_COL, LABEL_COL])
df_vacc[DATASET_COL] = "VACC"
df_vacc[DOMAIN_COL] = "random"
df_vacc[LABEL_COL] = df_vacc[LABEL_COL].replace(class2label)
df_vacc = df_vacc[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_vacc = df_vacc.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_vacc)
df_vacc.to_csv(path + final, index=False)

  0%|          | 0/294 [00:00<?, ?it/s]

VACC size: 22219
Topics: 177


In [None]:
!find ./data/VACC -mindepth 1 -not -name 'vacc.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)
!rm ./conll_data.py

## Create WTP

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/WTP
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/WTP/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 15.50 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/WTP’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/WTP/data/PE/
  inflating: ./data/WTP/data/PE/002.test  
  inflating: ./data/WTP/data/PE/008.dev  
  inflating: ./data/WTP/data/PE/009.train  
  inflating: ./data/WTP/data/PE/009.dev  
  inflating: ./data/WTP/data/PE/010.train  
  inflating: ./data/WTP/data/PE/003.test  
  inflating: ./data/WTP/data/PE/008.train  
  inflating: ./data/WTP/data/PE/008.test  
  inflating: ./data/WTP/data/PE/004.test  
  inflating: ./data/WTP/data/PE/005.test  
  inflating: ./data/WTP/data/PE/009.test  
  inflating: ./data/WTP/data/PE/006.test  
  inflating:

In [None]:
path = "./data/WTP/"
final = "wtp.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_wtp = pd.DataFrame()

for f in os.listdir(path + "data/WTP"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/WTP/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_wtp = pd.concat([df_wtp, df])

df_wtp[LABEL_COL] = df_wtp[LABEL_COL].replace(class2label)
df_wtp[TOPIC_COL] = "random"
df_wtp[DATASET_COL] = "WTP"
df_wtp[DOMAIN_COL] = "wikipedia.com"
df_wtp[DSID_COL] = df_wtp[DATASET_COL] + "_" + df_wtp[DSID_COL]
df_wtp = df_wtp.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_wtp = df_wtp[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_wtp)
df_wtp.to_csv(path + final, index=False)

WTP size: 8410
Topics: 1


In [None]:
!find ./data/WTP -mindepth 1 -not -name 'wtp.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create OC

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/OC
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/OC/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 11.22 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/OC’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/OC/data/PE/
  inflating: ./data/OC/data/PE/002.test  
  inflating: ./data/OC/data/PE/008.dev  
  inflating: ./data/OC/data/PE/009.train  
  inflating: ./data/OC/data/PE/009.dev  
  inflating: ./data/OC/data/PE/010.train  
  inflating: ./data/OC/data/PE/003.test  
  inflating: ./data/OC/data/PE/008.train  
  inflating: ./data/OC/data/PE/008.test  
  inflating: ./data/OC/data/PE/004.test  
  inflating: ./data/OC/data/PE/005.test  
  inflating: ./data/OC/data/PE/009.test  
  inflating: ./data/OC/data/PE/006.test  
  inflating: ./data/OC/dat

In [None]:
path = "./data/OC/"
final = "oc.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_oc = pd.DataFrame()

for f in os.listdir(path + "data/OC"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/OC/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_oc = pd.concat([df_oc, df])

df_oc[LABEL_COL] = df_oc[LABEL_COL].replace(class2label)
df_oc[TOPIC_COL] = "random"
df_oc[DATASET_COL] = "OC"
df_oc[DOMAIN_COL] = "livejournal.com"
df_oc[DSID_COL] = df_oc[DATASET_COL] + "_" + df_oc[DSID_COL]
df_oc = df_oc.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_oc = df_oc[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_oc = df_oc[df_oc['sentence'] != "\ufeff"]
show_information(df_oc)
df_oc.to_csv(path + final, index=False)

OC size: 8527
Topics: 1


In [None]:
!find ./data/OC -mindepth 1 -not -name 'oc.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create PE

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/PE
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/PE/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 15.41 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/PE’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/PE/data/PE/
  inflating: ./data/PE/data/PE/002.test  
  inflating: ./data/PE/data/PE/008.dev  
  inflating: ./data/PE/data/PE/009.train  
  inflating: ./data/PE/data/PE/009.dev  
  inflating: ./data/PE/data/PE/010.train  
  inflating: ./data/PE/data/PE/003.test  
  inflating: ./data/PE/data/PE/008.train  
  inflating: ./data/PE/data/PE/008.test  
  inflating: ./data/PE/data/PE/004.test  
  inflating: ./data/PE/data/PE/005.test  
  inflating: ./data/PE/data/PE/009.test  
  inflating: ./data/PE/data/PE/006.test  
  inflating: ./data/PE/dat

In [None]:
path = "./data/PE/"
final = "pe.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_pe = pd.DataFrame()

for f in os.listdir(path + "data/PE"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/PE/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_pe = pd.concat([df_pe, df])

df_pe[LABEL_COL] = df_pe[LABEL_COL].replace(class2label)
df_pe[TOPIC_COL] = "random"
df_pe[DATASET_COL] = "PE"
df_pe[DOMAIN_COL] = "essayforum.com"
df_pe[DSID_COL] = df_pe[DATASET_COL] + "_" + df_pe[DSID_COL]
df_pe = df_pe.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_pe = df_pe[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
show_information(df_pe)
df_pe.to_csv(path + final, index=False)

PE size: 7051
Topics: 1


In [None]:
!find ./data/PE -mindepth 1 -not -name 'pe.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create MT

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/MT
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/MT/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 15.35 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/MT’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/MT/data/PE/
  inflating: ./data/MT/data/PE/002.test  
  inflating: ./data/MT/data/PE/008.dev  
  inflating: ./data/MT/data/PE/009.train  
  inflating: ./data/MT/data/PE/009.dev  
  inflating: ./data/MT/data/PE/010.train  
  inflating: ./data/MT/data/PE/003.test  
  inflating: ./data/MT/data/PE/008.train  
  inflating: ./data/MT/data/PE/008.test  
  inflating: ./data/MT/data/PE/004.test  
  inflating: ./data/MT/data/PE/005.test  
  inflating: ./data/MT/data/PE/009.test  
  inflating: ./data/MT/data/PE/006.test  
  inflating: ./data/MT/dat

In [None]:
path = "./data/MT/"
final = "mt.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_mt = pd.DataFrame()

for f in os.listdir(path + "data/MT"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/MT/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_mt = pd.concat([df_mt, df])

df_mt[LABEL_COL] = df_mt[LABEL_COL].replace(class2label)
df_mt[TOPIC_COL] = "random"
df_mt[DATASET_COL] = "MT"
df_mt[DOMAIN_COL] = "random"
df_mt[DSID_COL] = df_mt[DATASET_COL] + "_" + df_mt[DSID_COL]
df_mt = df_mt.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_mt = df_mt[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_mt)
df_mt.to_csv(path + final, index=False)

MT size: 449
Topics: 1


In [None]:
!find ./data/MT -mindepth 1 -not -name 'mt.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create WD

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/WD
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/WD/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 8.57 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/WD’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/WD/data/PE/
  inflating: ./data/WD/data/PE/002.test  
  inflating: ./data/WD/data/PE/008.dev  
  inflating: ./data/WD/data/PE/009.train  
  inflating: ./data/WD/data/PE/009.dev  
  inflating: ./data/WD/data/PE/010.train  
  inflating: ./data/WD/data/PE/003.test  
  inflating: ./data/WD/data/PE/008.train  
  inflating: ./data/WD/data/PE/008.test  
  inflating: ./data/WD/data/PE/004.test  
  inflating: ./data/WD/data/PE/005.test  
  inflating: ./data/WD/data/PE/009.test  
  inflating: ./data/WD/data/PE/006.test  
  inflating: ./data/WD/data

In [None]:
path = "./data/WD/"
final = "wd.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_wd = pd.DataFrame()

for f in os.listdir(path + "data/WD"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/WD/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_wd = pd.concat([df_wd, df])

df_wd[LABEL_COL] = df_wd[LABEL_COL].replace(class2label)
df_wd[TOPIC_COL] = "random"
df_wd[DATASET_COL] = "WD"
df_wd[DOMAIN_COL] = "random"
df_wd[DSID_COL] = df_wd[DATASET_COL] + "_" + df_wd[DSID_COL]
df_wd = df_wd.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_wd = df_wd[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_wd)
df_wd.to_csv(path + final, index=False)

WD size: 3872
Topics: 1


In [None]:
!find ./data/WD -mindepth 1 -not -name 'wd.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create VG

In [None]:
! git clone https://github.com/UKPLab/emnlp2017-claim-identification.git
! mkdir ./data/VG
! unzip ./emnlp2017-claim-identification/src/main/python/data.zip -d ./data/VG/data
! rm -r ./emnlp2017-claim-identification

Cloning into 'emnlp2017-claim-identification'...
remote: Enumerating objects: 328, done.[K
remote: Total 328 (delta 0), reused 0 (delta 0), pack-reused 328 (from 1)[K
Receiving objects: 100% (328/328), 12.56 MiB | 14.80 MiB/s, done.
Resolving deltas: 100% (121/121), done.
Updating files: 100% (138/138), done.
mkdir: cannot create directory ‘./data/VG’: File exists
Archive:  ./emnlp2017-claim-identification/src/main/python/data.zip
   creating: ./data/VG/data/PE/
  inflating: ./data/VG/data/PE/002.test  
  inflating: ./data/VG/data/PE/008.dev  
  inflating: ./data/VG/data/PE/009.train  
  inflating: ./data/VG/data/PE/009.dev  
  inflating: ./data/VG/data/PE/010.train  
  inflating: ./data/VG/data/PE/003.test  
  inflating: ./data/VG/data/PE/008.train  
  inflating: ./data/VG/data/PE/008.test  
  inflating: ./data/VG/data/PE/004.test  
  inflating: ./data/VG/data/PE/005.test  
  inflating: ./data/VG/data/PE/009.test  
  inflating: ./data/VG/data/PE/006.test  
  inflating: ./data/VG/dat

In [None]:
path = "./data/VG/"
final = "vg.csv"

class2label = {1: ARGUMENT, 0: NOARGUMENT}
df_vg = pd.DataFrame()

for f in os.listdir(path + "data/VG"):
    if not (f.endswith(".train") or f.endswith(".dev") or f.endswith(".test")):
        continue
    data = []
    content = read_file(path + f"data/VG/{f}")
    for line in content.splitlines():
        sentence, label = line.split("\t")
        data.append((sentence.strip(), int(label)))

    df = pd.DataFrame(data, columns=[SENTENCE_COL, LABEL_COL])
    df[DSID_COL] = df.apply(lambda row: f.replace('.', '_') + "_" + str(row.name), axis=1)
    df_vg = pd.concat([df_vg, df])

df_vg[LABEL_COL] = df_vg[LABEL_COL].replace(class2label)
df_vg[TOPIC_COL] = "random"
df_vg[DATASET_COL] = "VG"
df_vg[DOMAIN_COL] = "www.aifdb.org"
df_vg[DSID_COL] = df_vg[DATASET_COL] + "_" + df_vg[DSID_COL]
df_vg = df_vg.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_vg = df_vg[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]

show_information(df_vg)
df_vg.to_csv(path + final, index=False)

VG size: 2576
Topics: 1


In [None]:
!find ./data/VG -mindepth 1 -not -name 'vg.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create TACO

In [None]:
! wget https://zenodo.org/records/8230057/files/TomatenMarc/TACO-Public-Data.zip
! mkdir -p ./data/TACO
! unzip -o TACO-Public-Data.zip -d ./data/TACO
! mv -f ./data/TACO/TomatenMarc-TACO-aa2c0cc/* ./data/TACO
! rm -r TACO-Public-Data.zip
! rm -r ./data/TACO/TomatenMarc-TACO-aa2c0cc/

--2024-12-29 14:40:10--  https://zenodo.org/records/8230057/files/TomatenMarc/TACO-Public-Data.zip
Resolving zenodo.org (zenodo.org)... 188.185.48.194, 188.185.45.92, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7453512 (7.1M) [application/octet-stream]
Saving to: ‘TACO-Public-Data.zip’


2024-12-29 14:40:12 (7.27 MB/s) - ‘TACO-Public-Data.zip’ saved [7453512/7453512]

Archive:  TACO-Public-Data.zip
aa2c0cc58a10a9833c2b8ded865205cd7d359156
   creating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/
  inflating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/.gitignore  
  inflating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/README.md  
   creating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/data/
  inflating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/data/README.md  
  inflating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/data/annotation_framework.pdf  
  inflating: ./data/TACO/TomatenMarc-TACO-aa2c0cc/data/conversations.csv  

In [None]:
path = './data/TACO/'
final = 'taco.csv'
class2label = {"Reason": ARGUMENT, "Statement": ARGUMENT, "Notification": NOARGUMENT, "None": NOARGUMENT}

df_majority_votes = pd.read_csv(path + 'data/majority_votes.csv')
df_backup = pd.read_csv('./assets/backup_tweets.csv', lineterminator='\n')

df_taco = df_majority_votes.merge(df_backup, on='tweet_id')
df_taco['class'] = df_taco['class'].astype(str)
df_taco['class'] = df_taco['class'].replace({'nan': 'None'})
df_taco = df_taco[df_taco['class'] != "Undecided"]
df_taco['class'] = df_taco['class'].replace(class2label)

df_taco[DOMAIN_COL] = 'https://twitter.com'
df_taco = df_taco[['tweet_id', 'domain', 'topic', 'class', 'text']]
df_taco[DATASET_COL] = 'TACO'

df_taco.rename(columns={
    "tweet_id": DSID_COL,
    "topic": TOPIC_COL,
    "class": LABEL_COL,
    "text": SENTENCE_COL
}, inplace=True)

df_taco = df_taco[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_taco = df_taco.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_taco)
df_taco.to_csv(path + final, index=False)

TACO size: 1732
Topics: 6


In [None]:
!find ./data/TACO -mindepth 1 -not -name 'taco.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create AEC

In [None]:
! wget http://nldslab.soe.ucsc.edu/arg-extraction/sigdial2015/annotated-phrases-by-topic.tar.xz
! mkdir -p ./data/AEC
! tar -xf annotated-phrases-by-topic.tar.xz -C ./data/AEC
! rm -r annotated-phrases-by-topic.tar.xz

--2024-12-29 14:33:48--  http://nldslab.soe.ucsc.edu/arg-extraction/sigdial2015/annotated-phrases-by-topic.tar.xz
Resolving nldslab.soe.ucsc.edu (nldslab.soe.ucsc.edu)... 128.114.48.41
Connecting to nldslab.soe.ucsc.edu (nldslab.soe.ucsc.edu)|128.114.48.41|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 353008 (345K) [application/x-xz]
Saving to: ‘annotated-phrases-by-topic.tar.xz’


2024-12-29 14:33:48 (1011 KB/s) - ‘annotated-phrases-by-topic.tar.xz’ saved [353008/353008]



In [None]:
path = './data/AEC/'
final = 'aec.csv'

topic_mapping = {
    'gm': "gay marriage",
    'evo': "evoluation",
    'gc': "gun control",
    'dp': "death penalty"
}

class2label = {'no_connective': NOARGUMENT, 'so': ARGUMENT, 'if': ARGUMENT, 'but': ARGUMENT, 'first': ARGUMENT, "i agree that": ARGUMENT}

dataframes = []
for file in os.listdir(path):
    file_path = os.path.join(path, file)
    if file != final:
        dataframe = pd.read_csv(file_path)
        topic_prefix = file.split('-')[0]
        topic = topic_mapping.get(topic_prefix, None)
        dataframe['topic'] = topic
        dataframe[DOMAIN_COL] = "http://www.createdebate.com/"
        dataframes.append(dataframe)

df_aec = pd.concat(dataframes)

df_aec['Connective.x'] = df_aec['Connective.x'].replace(class2label)

df_aec = df_aec[['ItemId', DOMAIN_COL, 'topic', 'Connective.x', 'Phrase.x']]
df_aec[DATASET_COL] = 'AEC'
df_aec.rename(columns={
    "ItemId": DSID_COL,
    "topic": TOPIC_COL,
    "Connective.x": LABEL_COL,
    "Phrase.x": SENTENCE_COL
}, inplace=True)

df_aec = df_aec[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_aec.sort_values(by=DSID_COL)
df_aec = df_aec.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_aec)
df_aec.to_csv(path + final, index=False)

AEC size: 5375
Topics: 4


In [None]:
!find ./data/AEC -mindepth 1 -not -name 'aec.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create UKP

In [None]:
! mkdir -p ./data/UKP
! unzip -o ./assets/UKP_sentential_argument_mining.zip -d ./data/UKP

Archive:  ./assets/UKP_sentential_argument_mining.zip
   creating: ./data/UKP/data/
  inflating: ./data/UKP/data/abortion.tsv  
  inflating: ./data/UKP/data/cloning.tsv  
  inflating: ./data/UKP/data/death_penalty.tsv  
  inflating: ./data/UKP/data/gun_control.tsv  
  inflating: ./data/UKP/data/marijuana_legalization.tsv  
  inflating: ./data/UKP/data/minimum_wage.tsv  
  inflating: ./data/UKP/data/nuclear_energy.tsv  
  inflating: ./data/UKP/data/school_uniforms.tsv  
  inflating: ./data/UKP/README.txt   


In [None]:
path = './data/UKP/'
final = 'ukp.csv'

ARGUMENT_FOR = 'Argument_for'
ARGUMENT_AGAINST = 'Argument_against'
NO_ARGUMENT = 'NoArgument'

class2label = {ARGUMENT_FOR: ARGUMENT, ARGUMENT_AGAINST: ARGUMENT, NO_ARGUMENT: NOARGUMENT}
# school_uniforms.tsv contains an error in the last line.
dataframes = [pd.read_csv(os.path.join(path+"data/", file), delimiter='\\t', engine="python") for file in os.listdir(path+"data/") if file != final]

df_ukp = pd.concat(dataframes)
df_ukp['annotation'] = df_ukp['annotation'].replace(class2label)

df_ukp = df_ukp[['sentenceHash', 'retrievedUrl', 'topic', 'annotation', 'sentence']]
df_ukp[DATASET_COL] = "UKP"

df_ukp.rename(columns={
    "sentenceHash": DSID_COL,
    "retrievedUrl": DOMAIN_COL,
    "topic": TOPIC_COL,
    "annotation": LABEL_COL,
    "sentence": SENTENCE_COL
}, inplace=True)

df_ukp = df_ukp[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_ukp = df_ukp.loc[~df_ukp[DSID_COL].duplicated()]
df_ukp = df_ukp.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])
df_ukp.sort_values(by=DSID_COL)

show_information(df_ukp)
df_ukp.to_csv(path+final, index=False)

UKP size: 25104
Topics: 8


In [None]:
!find ./data/UKP -mindepth 1 -not -name 'ukp.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create ASRD

In [None]:
! wget https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_\(R\)_ArgsInASR_Findings-2020.v1.zip
! mkdir -p ./data/ASRD
! unzip -o IBM_Debater_\(R\)_ArgsInASR_Findings-2020.v1.zip -d ./data/ASRD
! mv -f ./data/ASRD/IBM_Debater_\(R\)_ArgsInASR_Findings-2020.v1/* ./data/ASRD
! rm IBM_Debater_\(R\)_ArgsInASR_Findings-2020.v1.zip
! rm -r ./data/ASRD/__MACOSX
! rm -r ./data/ASRD/IBM_Debater_\(R\)_ArgsInASR_Findings-2020.v1

--2024-12-30 17:36:07--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_ArgsInASR_Findings-2020.v1.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 13.32.230.22, 13.32.230.125, 13.32.230.11, ...
Connecting to www.research.ibm.com (www.research.ibm.com)|13.32.230.22|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_ArgsInASR_Findings-2020.v1.zip [following]
--2024-12-30 17:36:07--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_ArgsInASR_Findings-2020.v1.zip
Resolving research.ibm.com (research.ibm.com)... 108.138.106.92, 108.138.106.96, 108.138.106.127, ...
Connecting to research.ibm.com (research.ibm.com)|108.138.106.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 130274 (127K) [application/zip]
Saving to: ‘IBM_Debater_(R)_ArgsInASR_Findings-2020.v1.zip’


2024-12-30 17:36:08 (540 KB/s) - ‘IBM_Debater_(R)_Args

In [None]:
path = './data/ASRD/'
final = 'asrd.csv'

class2label = {0: NOARGUMENT, 1: ARGUMENT}

df_asrd = pd.read_csv(path+'argumentative_sentences_in_spoken_language_with split.csv')
df_asrd['label'] = df_asrd['label'].replace(class2label)
df_asrd['sentence_id'] = "ASRD-" + df_asrd['sentence_id'].astype(str)
df_asrd["sentence"] = df_asrd["sentence"] + "."
df_asrd[DOMAIN_COL] = "ASR Debate Speeches"
df_asrd[DATASET_COL] = "ASRD"
df_asrd.rename(columns={"sentence_id": DSID_COL}, inplace=True)
df_asrd = df_asrd[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_asrd = df_asrd.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_asrd)
df_asrd.to_csv(path+final, index=False)

ASRD size: 700
Topics: 20


In [None]:
!find ./data/ASRD -mindepth 1 -not -name 'asrd.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create QMC

In [None]:
! wget https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_\(R\)_claim_sentences_search.zip
! mkdir -p ./data/QMC
! unzip -o IBM_Debater_\(R\)_claim_sentences_search.zip -d ./data/QMC
! rm -r IBM_Debater_\(R\)_claim_sentences_search.zip

--2024-12-29 14:10:09--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_claim_sentences_search.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 13.32.230.11, 13.32.230.125, 13.32.230.22, ...
Connecting to www.research.ibm.com (www.research.ibm.com)|13.32.230.11|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_claim_sentences_search.zip [following]
--2024-12-29 14:10:09--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_claim_sentences_search.zip
Resolving research.ibm.com (research.ibm.com)... 108.138.106.96, 108.138.106.92, 108.138.106.72, ...
Connecting to research.ibm.com (research.ibm.com)|108.138.106.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 121145170 (116M) [application/zip]
Saving to: ‘IBM_Debater_(R)_claim_sentences_search.zip’


2024-12-29 14:10:14 (25.0 MB/s) - ‘IBM_Debater_(R)_claim_sentences_s

In [None]:
path = './data/QMC/'
final = 'qmc.csv'

column_names = ['id', 'topic', 'mc', 'sentence', 'query_pattern', 'score', 'label', 'url']
df_qmc = pd.read_csv(path+'data_sets/test_set.csv', header=None, names=column_names)
df_qmc['label'] = df_qmc['label'].replace(class2label)
df_qmc['id'] = "QMC-" + df_qmc.index.astype(str)
df_qmc[DATASET_COL] = 'QMC'
df_qmc.rename(columns={"id": DSID_COL, "url": DOMAIN_COL}, inplace=True)
df_qmc = df_qmc[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_qmc = df_qmc.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_qmc)
df_qmc.to_csv(path+final, index=False)

QMC size: 2499
Topics: 50


In [None]:
!find ./data/QMC -mindepth 1 -not -name 'qmc.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create IAM

In [None]:
! wget https://github.com/LiyingCheng95/IAM/archive/refs/heads/main.zip
! mkdir -p ./data/IAM
! unzip -o main.zip -d ./data/IAM
! mv -f ./data/IAM/IAM-main/* ./data/IAM
! rm -r main.zip
! rm -r ./data/IAM/IAM-main/

--2024-12-29 14:49:17--  https://github.com/LiyingCheng95/IAM/archive/refs/heads/main.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/LiyingCheng95/IAM/zip/refs/heads/main [following]
--2024-12-29 14:49:18--  https://codeload.github.com/LiyingCheng95/IAM/zip/refs/heads/main
Resolving codeload.github.com (codeload.github.com)... 140.82.113.9
Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘main.zip’

main.zip                [        <=>         ]  14.55M  5.14MB/s    in 2.8s    

2024-12-29 14:49:21 (5.14 MB/s) - ‘main.zip’ saved [15253078]

Archive:  main.zip
5497013e36a6af07a798b42965e6a3c321aa56ae
   creating: ./data/IAM/IAM-main/
   creating: ./data/IAM/IAM-main/CEPE/
  inflating: ./data/

In [None]:
path = './data/IAM/'
final = 'iam.csv'
class2label = {"C": ARGUMENT, "O": NOARGUMENT}
column_names = ['claim_label', 'topic_sentence', 'claim_candidate_sentence', 'article_id', 'stance_label']
df_iam = pd.read_csv(path + 'claims/all_claims.txt', header=None, delimiter='\t', names=column_names)
df_iam['claim_label'] = df_iam['claim_label'].replace(class2label)
df_iam[DSID_COL] = "IAM-" + df_iam.index.astype(str)
df_iam[DATASET_COL] = 'IAM'
df_iam[DOMAIN_COL] = 'www.wikipedia.com/' + df_iam.article_id.astype(str)

df_iam.rename(columns={"topic_sentence": TOPIC_COL,
                       "claim_label": LABEL_COL,
                       "claim_candidate_sentence": SENTENCE_COL},
              inplace=True)
df_iam = df_iam[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_iam = df_iam.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_iam)
df_iam.to_csv(path+final, index=False)

IAM size: 66524
Topics: 100


In [None]:
!find ./data/IAM -mindepth 1 -not -name 'iam.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create AMSR

In [None]:
! wget https://zenodo.org/records/4314390/files/AMSR.zip
! mkdir -p ./data/AMSR
! unzip -o AMSR.zip data/conferences_annotated/sentence_level/* -d ./data/AMSR
! mv -f ./data/AMSR/data/conferences_annotated/sentence_level/* ./data/AMSR
! rm -r AMSR.zip
! rm -r ./data/AMSR/data

--2024-12-29 14:26:43--  https://zenodo.org/records/4314390/files/AMSR.zip
Resolving zenodo.org (zenodo.org)... 188.185.48.194, 188.185.43.25, 188.185.45.92, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58882809 (56M) [application/octet-stream]
Saving to: ‘AMSR.zip’


2024-12-29 14:26:47 (16.7 MB/s) - ‘AMSR.zip’ saved [58882809/58882809]

Archive:  AMSR.zip
   creating: ./data/AMSR/data/conferences_annotated/sentence_level/
  inflating: ./data/AMSR/data/conferences_annotated/sentence_level/all_reviews_by_sentences.csv  
  inflating: ./data/AMSR/data/conferences_annotated/sentence_level/test.csv  
  inflating: ./data/AMSR/data/conferences_annotated/sentence_level/train.csv  
  inflating: ./data/AMSR/data/conferences_annotated/sentence_level/val.csv  


In [None]:
path = './data/AMSR/'
final = 'amsr.csv'
class2label = {"POS": ARGUMENT, "NEG": ARGUMENT, "NA": NOARGUMENT}

df_amsr = [pd.read_csv(os.path.join(path, file), delimiter='\t', engine="python") for file in ["train.csv", "val.csv", "test.csv"]]
df_amsr = pd.concat(df_amsr)
df_amsr['position'] = df_amsr['position'].apply(lambda text: text.strip())
df_amsr['position'] = df_amsr['position'].replace(class2label)
df_amsr[DATASET_COL] = 'AMSR'
df_amsr[DOMAIN_COL] = 'www.openreview.com/' + df_amsr.sentence_id.astype(str)
df_amsr[TOPIC_COL] = df_amsr["sentence_id"].apply(lambda text: text.split('_')[0])
df_amsr.rename(columns={"sentence_id": DSID_COL,
                       "position": LABEL_COL,
                       "text": SENTENCE_COL},
              inplace=True)
df_amsr = df_amsr[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_amsr = df_amsr.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_amsr)
df_amsr.to_csv(path+final, index=False)

AMSR size: 1400
Topics: 6


In [None]:
!find ./data/AMSR -mindepth 1 -not -name 'amsr.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create AMPERE

In [None]:
! wget http://xinyuhua.github.io/resources/naacl2019/naacl19_dataset.zip
! unzip -o naacl19_dataset.zip -d ./data/AMPERE
! mv -f ./data/AMPERE/dataset ./data/AMPERE/data
! rm -r naacl19_dataset.zip

--2024-12-29 14:01:37--  http://xinyuhua.github.io/resources/naacl2019/naacl19_dataset.zip
Resolving xinyuhua.github.io (xinyuhua.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to xinyuhua.github.io (xinyuhua.github.io)|185.199.108.153|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15682546 (15M) [application/zip]
Saving to: ‘naacl19_dataset.zip’


2024-12-29 14:01:38 (46.4 MB/s) - ‘naacl19_dataset.zip’ saved [15682546/15682546]

Archive:  naacl19_dataset.zip
   creating: ./data/AMPERE/dataset/
  inflating: ./data/AMPERE/dataset/README  
  inflating: ./data/AMPERE/dataset/annotation_guideline.txt  
  inflating: ./data/AMPERE/dataset/iclr2017.jsonlist  
  inflating: ./data/AMPERE/dataset/iclr2018.jsonlist  
   creating: ./data/AMPERE/dataset/iclr_anno_final/
  inflating: ./data/AMPERE/dataset/iclr_anno_final/B104VQCgM_rating_6.txt  
  inflating: ./data/AMPERE/dataset/iclr_anno_final/B10Nn-jlf_rating_7.txt  
  inflating: ./data/A

In [None]:
path = './data/AMPERE/'
final = 'ampere.csv'
class2label = {"fact": ARGUMENT, "evaluation": ARGUMENT, "request": ARGUMENT, "reference": ARGUMENT, "quote": ARGUMENT, "non-arg": NOARGUMENT}

df_ampere = pd.DataFrame()

for file_name in tqdm(os.listdir(path + "data/iclr_anno_final")):
    if file_name.endswith('tmp'):
        continue
    file_path = path + "data/iclr_anno_final/" + file_name
    # Parse labeled data and generate label positions
    labeled_data = parse_labeled_file(file_path)
    combined_text, label_positions = generate_label_positions(labeled_data)
    # Match annotations to sentences
    sentence_annotations = match_annotations_to_sentences(combined_text, label_positions)
    # Prepare output and create DataFrame
    output_data = prepare_output(sentence_annotations)
    df_output = pd.DataFrame(output_data)
    df_output[DSID_COL] = df_output.apply(lambda row: file_name.split(".txt")[0] + "_" + str(row.name), axis=1)
    df_ampere = pd.concat([df_ampere, df_output])

# Display or save the final DataFrame
df_ampere = df_ampere[df_ampere[LABEL_COL] != set()] # exclude those without labels.
df_ampere[LABEL_COL] = df_ampere[LABEL_COL].apply(lambda row: row != set(["non-arg"]))
df_ampere[LABEL_COL] = df_ampere[LABEL_COL].replace({True: ARGUMENT, False: NOARGUMENT})
df_ampere[DOMAIN_COL] = 'www.openreview.com/'
df_ampere[DATASET_COL] = 'AMPERE'
df_ampere[TOPIC_COL] = "random"
df_ampere = df_ampere[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_ampere = df_ampere.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_ampere)
df_ampere.to_csv(path+final, index=False)

  0%|          | 0/401 [00:00<?, ?it/s]

AMPERE size: 6971
Topics: 1


In [None]:
!find ./data/AMPERE -mindepth 1 -not -name 'ampere.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create FINARG

In [None]:
!git clone https://github.com/Alaa-Ah/The-FinArg-Dataset-Argument-Mining-in-Financial-Earnings-Calls
! unzip -o ./The-FinArg-Dataset-Argument-Mining-in-Financial-Earnings-Calls/FinArg.zip -d ./data/FINARG
! rm -r ./The-FinArg-Dataset-Argument-Mining-in-Financial-Earnings-Calls/
! mv ./data/FINARG/argument\ mining ./data/FINARG/data

Cloning into 'The-FinArg-Dataset-Argument-Mining-in-Financial-Earnings-Calls'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 45 (delta 23), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 5.23 MiB | 10.21 MiB/s, done.
Resolving deltas: 100% (23/23), done.
Archive:  ./The-FinArg-Dataset-Argument-Mining-in-Financial-Earnings-Calls/FinArg.zip
   creating: ./data/FINARG/argument mining/
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17544.json  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17544_9.ann  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17545.json  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17545_9.ann  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17546.json  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_2015_17546_5.ann  
  inflating: ./data/FINARG/argument mining/AAPL_Q1_20

In [None]:
path = './data/FINARG/'
final = 'finarg.csv'

df_finarg = pd.DataFrame()
for f in tqdm(os.listdir(path + "data")):
    topic = f.split('_')[0].replace('AAPL', 'Apple').replace('AMZN', 'Amazon').replace('FB', 'Facebook').replace('MSFT', 'Microsoft')
    if f.endswith(".ann"):
        df = read_and_merge_annotations(path + "data/" + f)
        df[TOPIC_COL] = topic
        df[DSID_COL] = df.apply(lambda row: f.split(".")[0] + "_" + str(row.name), axis=1)
        df_finarg = pd.concat([df_finarg, df])

df_finarg[DATASET_COL] = "FINARG"
df_finarg[DOMAIN_COL] = "https://site.financialmodelingprep.com"
df_finarg[LABEL_COL] = df_finarg[LABEL_COL].apply(lambda row: any("claim" in label.lower() for label in row))
df_finarg[LABEL_COL] = df_finarg[LABEL_COL].replace({True: ARGUMENT, False: NOARGUMENT})
df_finarg = df_finarg[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_finarg = df_finarg.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_finarg)
df_finarg.to_csv(path + final, index=False)

  0%|          | 0/1672 [00:00<?, ?it/s]

FINARG size: 12917
Topics: 4


In [None]:
!find ./data/FINARG -mindepth 1 -not -name 'finarg.csv' \( -type f -exec rm -f {} + -o -type d -exec rm -r {} + \)

## Create ABSTRCT

In [None]:
! wget https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip
! mkdir -p ./data/ABSTRCT
! unzip -o abstrct-master.zip -d ./data/ABSTRCT
! rm -r abstrct-master.zip

--2024-12-29 14:16:06--  https://gitlab.com/tomaye/abstrct/-/archive/master/abstrct-master.zip
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘abstrct-master.zip’

abstrct-master.zip      [ <=>                ]   1.42M  --.-KB/s    in 0.09s   

2024-12-29 14:16:06 (15.5 MB/s) - ‘abstrct-master.zip’ saved [1485530]

Archive:  abstrct-master.zip
f856f1ca7514caa4094194b5623e84c159d9bf1d
   creating: ./data/ABSTRCT/abstrct-master/
   creating: ./data/ABSTRCT/abstrct-master/AbstRCT_corpus/
  inflating: ./data/ABSTRCT/abstrct-master/AbstRCT_corpus/AnnotationGuidelines.pdf  
   creating: ./data/ABSTRCT/abstrct-master/AbstRCT_corpus/data/
  inflating: ./data/ABSTRCT/abstrct-master/AbstRCT_corpus/data/annotation.conf  
   creating: ./data/ABSTRCT/abstrct-master/AbstRCT_corpus/data/dev/
   cr

In [None]:
%%bash
#!/bin/bash

# Define source and destination directories
src_dir="./data/ABSTRCT/abstrct-master/AbstRCT_corpus/data"
dest_dir="./data/ABSTRCT/data"

# Create destination directory if it doesn't exist
mkdir -p "$dest_dir"

# Iterate over all .ann files in the source directory
find "$src_dir" -type f -name "*.ann" | while read -r ann_file; do
    # Derive corresponding .txt file
    txt_file="${ann_file%.ann}.txt"

    # Check if the .txt file exists
    if [[ -f "$txt_file" ]]; then
        # Extract parent directory name
        parent_dir=$(basename "$(dirname "$ann_file")")

        # Get base filenames
        ann_filename=$(basename "$ann_file")
        txt_filename=$(basename "$txt_file")

        # Define new filenames with parent directory prefix
        new_ann="${parent_dir}_${ann_filename}"
        new_txt="${parent_dir}_${txt_filename}"

        # Copy .ann file if not already present
        if [[ ! -e "$dest_dir/$new_ann" ]]; then
            cp "$ann_file" "$dest_dir/$new_ann"
            echo "Copied: $new_ann"
        else
            echo "Skipped duplicate: $new_ann"
        fi

        # Copy .txt file if not already present
        if [[ ! -e "$dest_dir/$new_txt" ]]; then
            cp "$txt_file" "$dest_dir/$new_txt"
            echo "Copied: $new_txt"
        else
            echo "Skipped duplicate: $new_txt"
        fi
    else
        echo "Missing .txt for: $ann_file"
    fi
done

Copied: neoplasm_dev_10735891.ann
Copied: neoplasm_dev_10735891.txt
Copied: neoplasm_dev_10811675.ann
Copied: neoplasm_dev_10811675.txt
Copied: neoplasm_dev_11766999.ann
Copied: neoplasm_dev_11766999.txt
Copied: neoplasm_dev_11786563.ann
Copied: neoplasm_dev_11786563.txt
Copied: neoplasm_dev_11788911.ann
Copied: neoplasm_dev_11788911.txt
Copied: neoplasm_dev_11790211.ann
Copied: neoplasm_dev_11790211.txt
Copied: neoplasm_dev_11821453.ann
Copied: neoplasm_dev_11821453.txt
Copied: neoplasm_dev_11830607.ann
Copied: neoplasm_dev_11830607.txt
Copied: neoplasm_dev_11843249.ann
Copied: neoplasm_dev_11843249.txt
Copied: neoplasm_dev_11896110.ann
Copied: neoplasm_dev_11896110.txt
Copied: neoplasm_dev_16476840.ann
Copied: neoplasm_dev_16476840.txt
Copied: neoplasm_dev_16476841.ann
Copied: neoplasm_dev_16476841.txt
Copied: neoplasm_dev_16483488.ann
Copied: neoplasm_dev_16483488.txt
Copied: neoplasm_dev_16487438.ann
Copied: neoplasm_dev_16487438.txt
Copied: neoplasm_dev_17664467.ann
Copied: neopla

In [None]:
path = './data/ABSTRCT/'
final = 'abstrct.csv'

df_abstrct = pd.DataFrame()

for f in tqdm(set([f.split(".")[0] for f in os.listdir(path + "data")])):
    # Load and process files
    txt_path = path + f"data/{f}.txt"
    ann_path = path + f"data/{f}.ann"
    topic = f.split("_")[0]
    text_content = read_file(txt_path)
    annotation_lines = read_file(ann_path).splitlines()
    # Parse annotations
    parsed_annotations = parse_annotations(annotation_lines)
    # Match annotations to sentences
    sentence_annotations = match_annotations_to_sentences(text_content, parsed_annotations)
    # Prepare data for display
    output_data = prepare_output(sentence_annotations)
    # Create a DataFrame
    output_df = pd.DataFrame(output_data)
    output_df[TOPIC_COL] = topic
    output_df[DSID_COL] = output_df.apply(lambda row: f + "_" + str(row.name), axis=1)

    df_abstrct = pd.concat([df_abstrct, output_df])

df_abstrct[LABEL_COL] = df_abstrct[LABEL_COL].apply(lambda row: any("claim" in label.lower() for label in row))
df_abstrct[LABEL_COL] = df_abstrct[LABEL_COL].replace({True: ARGUMENT, False: NOARGUMENT})

df_abstrct[DATASET_COL] = "ABSTRCT"
df_abstrct[DOMAIN_COL] = "https://www.nlm.nih.gov"
df_abstrct = df_abstrct[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_abstrct = df_abstrct[~df_abstrct[DSID_COL].duplicated()]
df_abstrct = df_abstrct.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_abstrct)
df_abstrct = df_abstrct.sort_values(by=DSID_COL)
df_abstrct.to_csv(path + final, index=False)

  0%|          | 0/700 [00:00<?, ?it/s]

ABSTRCT size: 8632
Topics: 3


In [None]:
!find ./data/ABSTRCT -mindepth 1 ! -path "./data/ABSTRCT/abstrct.csv" -exec rm -rf {} +

## Create ACQUA

In [None]:
! wget https://zenodo.org/records/3237552/files/ACQuA\ -\ CompArg\ -\ release\ v1\ -\ dataset\ of\ comparative\ sentences\ -\ all-data.csv

! mkdir -p ./data/ACQUA
! mv -f ACQuA\ -\ CompArg\ -\ release\ v1\ -\ dataset\ of\ comparative\ sentences\ -\ all-data.csv ./data/ACQUA

--2024-12-29 13:53:02--  https://zenodo.org/records/3237552/files/ACQuA%20-%20CompArg%20-%20release%20v1%20-%20dataset%20of%20comparative%20sentences%20-%20all-data.csv
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3231554 (3.1M) [text/plain]
Saving to: ‘ACQuA - CompArg - release v1 - dataset of comparative sentences - all-data.csv’


2024-12-29 13:53:03 (3.85 MB/s) - ‘ACQuA - CompArg - release v1 - dataset of comparative sentences - all-data.csv’ saved [3231554/3231554]



In [None]:
path = './data/ACQUA/'
final = 'acqua.csv'

df_acqua = pd.read_csv(path + "ACQuA - CompArg - release v1 - dataset of comparative sentences - all-data.csv")
class2label = {"WORSE": ARGUMENT, "BETTER": ARGUMENT, "NONE": NOARGUMENT}

df_acqua.rename(columns={"id": DSID_COL,
                         "domain": TOPIC_COL,
                         "most_frequent_label": LABEL_COL,
                         "sentence": SENTENCE_COL},
                inplace=True)

df_acqua[DATASET_COL] = "ACQUA"
df_acqua[DOMAIN_COL] = "https://commoncrawl.org"
df_acqua = df_acqua[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_acqua[LABEL_COL] = df_acqua[LABEL_COL].replace(class2label)
df_acqua[TOPIC_COL] = df_acqua[TOPIC_COL].replace({"jbt": "random"})
df_acqua = df_acqua[~df_acqua[DSID_COL].duplicated(keep=False)]
df_acqua = df_acqua.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_acqua)
df_acqua.to_csv(path + final, index=False)

ACQUA size: 7185
Topics: 3


In [None]:
! rm data/ACQUA/ACQuA\ -\ CompArg\ -\ release\ v1\ -\ dataset\ of\ comparative\ sentences\ -\ all-data.csv

## Create WEBIS

In [None]:
! wget https://zenodo.org/records/3251804/files/webis-debate-16.tar.gz
! mkdir -p ./data/WEBIS
! tar -xf webis-debate-16.tar.gz -C ./data/WEBIS
! rm -r webis-debate-16.tar.gz

--2024-12-29 14:24:58--  https://zenodo.org/records/3251804/files/webis-debate-16.tar.gz
Resolving zenodo.org (zenodo.org)... 188.185.48.194, 188.185.45.92, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.48.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 928453 (907K) [application/octet-stream]
Saving to: ‘webis-debate-16.tar.gz’


2024-12-29 14:24:59 (1.59 MB/s) - ‘webis-debate-16.tar.gz’ saved [928453/928453]



In [None]:
path = './data/WEBIS/'
final = 'webis.csv'
class2label = {"Argumentative": ARGUMENT, "Non-Argumentative": NOARGUMENT}

df_webis = pd.DataFrame()
for f in tqdm(os.listdir(path + 'webis-debate-16/')):
    pattern = r"([a-zA-Z]+)(\d+)"
    fit = re.search(pattern, f)
    topic = fit.group(1)
    file_id = int(fit.group(2))

    column_names = [LABEL_COL, SENTENCE_COL]
    df = pd.read_csv(path + f'webis-debate-16/{f}', header=None, delimiter='\\t', engine='python', names=column_names)
    df[DSID_COL] = str(topic) + "-" + str(file_id) + "-" + df.index.astype(str)
    df[TOPIC_COL] = topic
    df[DOMAIN_COL] = "https://idebate.net"
    df[DATASET_COL] = "WEBIS"
    df_webis = pd.concat([df_webis, df])

df_webis = df_webis[[DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL]]
df_webis[LABEL_COL] = df_webis[LABEL_COL].replace(class2label)
df_webis = df_webis.drop_duplicates(subset=[SENTENCE_COL, LABEL_COL])

show_information(df_webis)
df_webis.to_csv(path + final, index=False)

  0%|          | 0/445 [00:00<?, ?it/s]

WEBIS size: 16347
Topics: 14


In [None]:
! rm -r ./data/WEBIS/webis-debate-16/

## Combined data

In [29]:
df_combined = pd.DataFrame()
for f in os.listdir("./data"):
    path = "./data/" + f
    if os.path.isdir(path):
        df = pd.read_csv(path + "/" + f.lower() + ".csv")
        df = df[~df[SENTENCE_COL].isna()]
        assert not df[[SENTENCE_COL, LABEL_COL]].duplicated().any()
        assert all(df.columns == [DATASET_COL, DSID_COL, DOMAIN_COL, TOPIC_COL, LABEL_COL, SENTENCE_COL])
        df_combined = pd.concat([df_combined, df])

for column in df_combined.columns:
    assert not df_combined[column].isna().any()

assert df_combined[DSID_COL].nunique() == df_combined.shape[0]

df_combined.to_csv('./data/all_data.csv', index=False)