In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
)

import json, os
import pandas as pd

# Data preprocessing

In [2]:
def preprocess_liar_raw(path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load and preprocess the LIAR-RAW dataset.

    For each of ``train.json``, ``val.json`` and ``test.json`` inside ``path``:
    - Read the file as a records-oriented JSON DataFrame.
    - Remove the "reports" column.
    - Rename the labels "mostly-true" to "true", and "barely-true" and
      "pants-fire" to "false". Any other label value is left unchanged.

    Args:
        path (str): Directory containing the LIAR-RAW JSON files. A trailing
            path separator is optional.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: Train, validation and test datasets.

    Raises:
        KeyError: If a split has no "reports" or "label" column.
    """
    label_map = {"mostly-true": "true", "barely-true": "false", "pants-fire": "false"}

    def _load_split(file_name: str) -> pd.DataFrame:
        # os.path.join keeps the path valid whether or not `path` ends with a
        # separator; plain string concatenation would silently build
        # ".../LIAR-RAWtrain.json" when the trailing slash is missing.
        frame = pd.read_json(os.path.join(path, file_name), orient="records")

        # Non in-place drop: returns a new frame instead of mutating the one
        # that was just read.
        frame = frame.drop(columns=["reports"])
        frame["label"] = frame["label"].replace(label_map)
        return frame

    return _load_split("train.json"), _load_split("val.json"), _load_split("test.json")


In [3]:
def preprocess_rawfc(path: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load and preprocess the RAWFC dataset.

    For each split folder ("train", "val", "test") inside ``path``:
    - List the ``.json`` files in the folder (sorted by file name so the row
      order is reproducible; ``os.listdir`` order is arbitrary).
    - Read "event_id", "claim", "label" and "explain" from every file.
      ".json" is appended to the event id.
    - Build one DataFrame per split from the collected rows.
    - Rename the label "half" to "half-true".

    Args:
        path (str): Directory containing the "train", "val" and "test"
            folders. A trailing path separator is optional.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: Train, validation and test datasets,
        each with columns "event_id", "claim", "label", "explain" and a
        0..n-1 index.

    Raises:
        KeyError: If a JSON file lacks one of the four fields that are read.
    """
    columns = ["event_id", "claim", "label", "explain"]
    frames = []

    for split in ["train", "val", "test"]:
        split_dir = os.path.join(path, split)
        json_files = sorted(name for name in os.listdir(split_dir) if name.endswith(".json"))

        # Collect plain rows and build the frame once; assigning rows one at a
        # time with .loc re-allocates the frame on every insert.
        rows = []
        for file_name in json_files:
            # The context manager closes the handle, which a bare
            # json.load(open(...)) does not.
            with open(os.path.join(split_dir, file_name), encoding="utf-8") as handle:
                json_data = json.load(handle)

            # "original_label" is deliberately not read: it was never used, and
            # reading it made files without that key fail.
            rows.append(
                [
                    json_data["event_id"] + ".json",
                    json_data["claim"],
                    json_data["label"],
                    json_data["explain"],
                ]
            )

        frame = pd.DataFrame(rows, columns=columns)
        frame["label"] = frame["label"].replace({"half": "half-true"})
        frames.append(frame)

    train, valid, test = frames
    return train, valid, test

In [4]:
%%time

# liar-raw dataset
path = "../dataset/LIAR-RAW/"
train_liar_raw, valid_liar_raw, test_liar_raw = preprocess_liar_raw(path)

# rawfc dataset
path = "../dataset/RAWFC/"
train_rawfc, valid_rawfc, test_rawfc = preprocess_rawfc(path)

# merge the datasets
# ignore_index=True renumbers the rows 0..n-1. Without it the merged frame
# keeps each source's own row labels (so labels repeat), and
# Dataset.from_pandas would store that non-Range index as an extra
# "__index_level_0__" column in every split.
train_df = pd.concat([train_liar_raw, train_rawfc], ignore_index=True)
valid_df = pd.concat([valid_liar_raw, valid_rawfc], ignore_index=True)
test_df = pd.concat([test_liar_raw, test_rawfc], ignore_index=True)

# convert df to dataset
# preserve_index=False makes explicit that the pandas index is not data.
train_ds = Dataset.from_pandas(train_df, split="train", preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, split="validation", preserve_index=False)
test_ds = Dataset.from_pandas(test_df, split="test", preserve_index=False)



CPU times: user 5.7 s, sys: 1.32 s, total: 7.02 s
Wall time: 7.95 s


In [5]:
from datasets import DatasetDict

# Bundle the three splits under their split names, then write the bundle
# to a local directory.
final_dataset = DatasetDict(
    train=train_ds,
    validation=valid_ds,
    test=test_ds,
)
final_dataset.save_to_disk("../dataset/LIAR_RAW_RAWFC_Merged/")


Saving the dataset (0/1 shards):   0%|          | 0/11677 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1474 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1451 [00:00<?, ? examples/s]

In [8]:
# Upload the merged DatasetDict (all splits) to a private Hub repository.
final_dataset.push_to_hub(
    repo_id="NoAtmosphere0/LIAR_RAW_RAWFC_Merged",
    private=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/NoAtmosphere0/LIAR_RAW_RAWFC_Merged/commit/391f2f64feb989f823484ea4325f48a6c1c6b2fe', commit_message='Upload dataset', commit_description='', oid='391f2f64feb989f823484ea4325f48a6c1c6b2fe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/NoAtmosphere0/LIAR_RAW_RAWFC_Merged', endpoint='https://huggingface.co', repo_type='dataset', repo_id='NoAtmosphere0/LIAR_RAW_RAWFC_Merged'), pr_revision=None, pr_num=None)

In [None]:
# push individual splits
# Dataset.push_to_hub has no `name` parameter; the target split is selected
# with `split=`. Passing name=... raises TypeError before anything is uploaded.
train_ds.push_to_hub(repo_id="NoAtmosphere0/LIAR_RAW_RAWFC_Merged", split="train", private=True)
valid_ds.push_to_hub(repo_id="NoAtmosphere0/LIAR_RAW_RAWFC_Merged", split="validation", private=True)
test_ds.push_to_hub(repo_id="NoAtmosphere0/LIAR_RAW_RAWFC_Merged", split="test", private=True)