In [2]:
# The wildcard import stays first so that the explicit imports below take precedence over any same-named exports
from data_utils import *

import csv
import os

import pandas as pd
from tqdm.notebook import trange

# Collect Different Datasets Into One Place

In [3]:
# Folder that every processed dataset file in this notebook is written into
COMPILED_DATA_FOLDER = "compiled_data"

# Create the folder up front: writing a CSV into a directory that does not exist
# raises an OSError, which would break a run on a fresh checkout
os.makedirs(COMPILED_DATA_FOLDER, exist_ok=True)

## Covid-19 Fake News Competition Dataset

In [4]:
# Folder and file names of the original competition splits, grouped by labeled / unlabeled
FAKENEWS_FOLDER = "covid_fake_news/data"
LABELED_FAKENEWS_DATASET_FILE_NAMES = ["Constraint_Train.csv", "Constraint_Val.csv"]
UNLABELED_FAKENEWS_DATASET_FILE_NAMES = ["Constraint_Test.csv"]

# Base file name shared by every compiled variant of this dataset
# (the variants differ only by the raw/preprocessed and labeled/unlabeled prefix)
COMPILED_FAKENEWS_DATASET_FILE_NAME = "fakenews.csv"

# Full paths to the original split files
ORIG_LABELED_FAKENEWS_DATASET_PATHS = [
    FAKENEWS_FOLDER + "/" + file_name for file_name in LABELED_FAKENEWS_DATASET_FILE_NAMES
]
ORIG_UNLABELED_FAKENEWS_DATASET_PATHS = [
    FAKENEWS_FOLDER + "/" + file_name for file_name in UNLABELED_FAKENEWS_DATASET_FILE_NAMES
]

# Combined splits, with no further processing applied
RAW_LABELED_FAKENEWS_DATASET_PATH = COMPILED_DATA_FOLDER + "/raw_labeled_" + COMPILED_FAKENEWS_DATASET_FILE_NAME
RAW_UNLABELED_FAKENEWS_DATASET_PATH = COMPILED_DATA_FOLDER + "/raw_unlabeled_" + COMPILED_FAKENEWS_DATASET_FILE_NAME

# Combined splits after text and label preprocessing
PREPROCESSED_LABELED_FAKENEWS_DATASET_PATH = COMPILED_DATA_FOLDER + "/preprocessed_labeled_" + COMPILED_FAKENEWS_DATASET_FILE_NAME
PREPROCESSED_UNLABELED_FAKENEWS_DATASET_PATH = COMPILED_DATA_FOLDER + "/preprocessed_unlabeled_" + COMPILED_FAKENEWS_DATASET_FILE_NAME

### Covid-19 Fake News: Retrieval

In [5]:
def collect_fakenews_dataset(source_paths, dest_path):
    """Merge several fake-news CSV splits into one CSV file.

    Every file in ``source_paths`` is read with pandas and its ``id`` column is
    dropped. The splits are then concatenated in the order given, the ``tweet``
    and ``label`` columns are renamed to ``tweet_text`` and ``tweet_label``, and
    the result is written to ``dest_path`` without the index.

    Args:
        source_paths: Paths of the CSV files to combine.
        dest_path: Path of the CSV file to write.
    """
    # Read every split first, then strip the per-file id column from each one
    split_frames = [pd.read_csv(path) for path in source_paths]
    split_frames = [frame.drop(columns="id") for frame in split_frames]

    # Stack the splits and switch to the column names shared across datasets
    shared_names = {"tweet": "tweet_text", "label": "tweet_label"}
    combined = pd.concat(split_frames).rename(columns=shared_names)

    combined.to_csv(dest_path, index=False)



# Combine the labeled splits into the raw labeled file
collect_fakenews_dataset(
    source_paths=ORIG_LABELED_FAKENEWS_DATASET_PATHS,
    dest_path=RAW_LABELED_FAKENEWS_DATASET_PATH,
)

# Combine the unlabeled split(s) into the raw unlabeled file
collect_fakenews_dataset(
    source_paths=ORIG_UNLABELED_FAKENEWS_DATASET_PATHS,
    dest_path=RAW_UNLABELED_FAKENEWS_DATASET_PATH,
)

### Covid-19 Fake News: Preprocessing

In [6]:
# Labeled data: load the raw combined file, preprocess it, and save the result
# NOTE(review): the return value of preprocess() is discarded, so this relies on
# preprocess() (presumably from data_utils) modifying the DataFrame in place — confirm
labeled_dataset = pd.read_csv(RAW_LABELED_FAKENEWS_DATASET_PATH)
preprocess(labeled_dataset)
labeled_dataset.to_csv(PREPROCESSED_LABELED_FAKENEWS_DATASET_PATH, index=False)

# Unlabeled data: same three steps, written to the unlabeled preprocessed path
unlabeled_dataset = pd.read_csv(RAW_UNLABELED_FAKENEWS_DATASET_PATH)
preprocess(unlabeled_dataset)
unlabeled_dataset.to_csv(PREPROCESSED_UNLABELED_FAKENEWS_DATASET_PATH, index=False)

## ANTiVax Dataset
#### Requires loading tweet info from the Twitter API

In [7]:
# Location of the original ANTiVax file
ANTIVAX_FOLDER = "ANTiVax/Labeled"
ANTIVAX_DATASET_FILE_NAME = "VaxMisinfoData.csv"
ORIG_ANTIVAX_DATASET_PATH = "/".join((ANTIVAX_FOLDER, ANTIVAX_DATASET_FILE_NAME))

# Retrieved dataset, with no further processing applied
RAW_ANTIVAX_DATASET_PATH = COMPILED_DATA_FOLDER + "/raw_" + ANTIVAX_DATASET_FILE_NAME

# Dataset after text and label preprocessing (make sure labels are 0 (real) and 1 (fake))
PREPROCESSED_ANTIVAX_DATASET_PATH = COMPILED_DATA_FOLDER + "/preprocessed_" + ANTIVAX_DATASET_FILE_NAME

### ANTiVax: Retrieval

In [8]:
# I have a limited number of Twitter API requests available, so make sure this cell isn't run accidentally
# Even though it auto-skips tweets it already stores, it would still request the many tweet ids which were inaccessible and didn't end up in the dataset
if False:
    # Retrieved rows keyed by tweet id, so that a partially completed run can be
    # resumed without re-requesting tweets that are already stored
    new_data = {}
    new_data_headers = ["tweet_id", "tweet_text", "tweet_label", "profile_id", "profile_name", "profile_username", "profile_description", "profile_image_url"]

    # Resume from a partially complete file, keeping only rows that have non-empty tweet text
    # NOTE: rows loaded here are kept as stored, so a file written before the
    # profile_image_url fix below still holds the description in that column
    if os.path.exists(RAW_ANTIVAX_DATASET_PATH):
        new_data_df = pd.read_csv(RAW_ANTIVAX_DATASET_PATH)
        new_data = {
            row["tweet_id"]: [row[col] for col in new_data_headers]
            for _, row in new_data_df.iterrows()
            if isinstance(row["tweet_text"], str) and row["tweet_text"] != ""
        }

    # Load datasets
    twitter = TwitterRetriever()
    orig_data = pd.read_csv(ORIG_ANTIVAX_DATASET_PATH)

    def save_new_data():
        """Overwrite the raw ANTiVax CSV with the header row plus every row collected so far."""
        with open(RAW_ANTIVAX_DATASET_PATH, "w", newline="", encoding="utf-8") as fp:
            writer = csv.writer(fp)
            writer.writerow(new_data_headers)
            writer.writerows(new_data.values())

    pbar = trange(len(orig_data.index))
    for i in pbar:
        row = orig_data.iloc[i]
        tweet_id = row["id"]
        tweet_label = row["is_misinfo"]

        # Already stored by a previous (partial) run
        if tweet_id in new_data:
            continue

        tweet_info = twitter.get_tweet_info(tweet_id)
        if tweet_info is None: # Ignorable error
            continue

        # Values must follow the column order of new_data_headers.
        # The last entry previously repeated "profile_description", which stored the
        # description under the profile_image_url column.
        # NOTE(review): assumes get_tweet_info() returns a "profile_image_url" entry — confirm
        new_data[tweet_id] = [
            tweet_id,
            tweet_info["tweet_text"],
            tweet_label,
            tweet_info["profile_id"],
            tweet_info["profile_name"],
            tweet_info["profile_username"],
            tweet_info["profile_description"],
            tweet_info["profile_image_url"],
        ]

        pbar.set_postfix({"tweets": len(new_data)})
        # Periodic checkpoint; only reached on iterations that stored a new tweet
        if i % 1000 == 0:
            save_new_data()

    # Save data as csv
    save_new_data()

### ANTiVax: Preprocessing

In [9]:
# Load the retrieved ANTiVax data, preprocess it, and save the result
# NOTE(review): the return value of preprocess() is discarded, so this relies on
# preprocess() (presumably from data_utils) modifying the DataFrame in place — confirm
dataset = pd.read_csv(RAW_ANTIVAX_DATASET_PATH)
preprocess(dataset)
dataset.to_csv(PREPROCESSED_ANTIVAX_DATASET_PATH, index=False)

# Combine All Datasets (Labeled and Unlabeled Separately)
#### Only Tweet text and labels are shared among all datasets

In [10]:
# Preprocessed input files that carry labels
PREPROCESSED_LABELED_DATASET_PATHS = [PREPROCESSED_LABELED_FAKENEWS_DATASET_PATH, PREPROCESSED_ANTIVAX_DATASET_PATH]

# Preprocessed input files without labels
PREPROCESSED_UNLABELED_DATASET_PATHS = [PREPROCESSED_UNLABELED_FAKENEWS_DATASET_PATH]

# File names of the combined outputs
COMBINED_LABELED_DATASET_FILE_NAME = "combined_labeled_data.csv"
COMBINED_UNLABELED_DATASET_FILE_NAME = "combined_unlabeled_data.csv"

# Full paths of the combined outputs
COMBINED_LABELED_DATASET_PATH = COMPILED_DATA_FOLDER + "/" + COMBINED_LABELED_DATASET_FILE_NAME
COMBINED_UNLABELED_DATASET_PATH = COMPILED_DATA_FOLDER + "/" + COMBINED_UNLABELED_DATASET_FILE_NAME

In [11]:
# Set to True to rebuild the combined files even when they already exist on disk
FORCE_RECOMPUTE = False

if not os.path.exists(COMBINED_LABELED_DATASET_PATH) or FORCE_RECOMPUTE:
    # Read every preprocessed labeled dataset and keep only the columns shared by all of them
    labeled_datasets = [
        pd.read_csv(dataset).filter(["tweet_text", "tweet_label"])
        for dataset in PREPROCESSED_LABELED_DATASET_PATHS
    ]

    # Combine and save. ignore_index gives the in-memory frame a fresh 0..n-1 index
    # instead of repeating each input's own row labels (the CSV is written without the index)
    combined_labeled_dataset = pd.concat(labeled_datasets, ignore_index=True)
    combined_labeled_dataset.to_csv(COMBINED_LABELED_DATASET_PATH, index=False)
else:
    combined_labeled_dataset = pd.read_csv(COMBINED_LABELED_DATASET_PATH)

if not os.path.exists(COMBINED_UNLABELED_DATASET_PATH) or FORCE_RECOMPUTE:
    # Read the datasets that have no labels, restricted to the text column so that
    # they line up with the label-stripped labeled data they are combined with
    unlabeled_datasets = [
        pd.read_csv(dataset).filter(["tweet_text"])
        for dataset in PREPROCESSED_UNLABELED_DATASET_PATHS
    ]

    # The unlabeled set is all labeled text (label column dropped) plus the unlabeled text
    combined_unlabeled_dataset = pd.concat(
        [combined_labeled_dataset.filter(["tweet_text"])] + unlabeled_datasets,
        ignore_index=True,
    )
    combined_unlabeled_dataset.to_csv(COMBINED_UNLABELED_DATASET_PATH, index=False)
else:
    combined_unlabeled_dataset = pd.read_csv(COMBINED_UNLABELED_DATASET_PATH)

#### Split Combined Labeled Dataset into Train and Test Sets

In [12]:
# Fraction of the combined labeled data assigned to the train set
TRAIN_SET_PROPORTION = 0.5

# File names of the two split outputs
COMBINED_LABELED_TRAIN_DATASET_FILE_NAME = "combined_labeled_data_train.csv"
COMBINED_LABELED_TEST_DATASET_FILE_NAME = "combined_labeled_data_test.csv"

# Full paths of the two split outputs
COMBINED_LABELED_TRAIN_DATASET_PATH = COMPILED_DATA_FOLDER + "/" + COMBINED_LABELED_TRAIN_DATASET_FILE_NAME
COMBINED_LABELED_TEST_DATASET_PATH = COMPILED_DATA_FOLDER + "/" + COMBINED_LABELED_TEST_DATASET_FILE_NAME

In [13]:
# Set to True to redo the split even when both split files already exist on disk
FORCE_RECOMPUTE = False

# Fixed seed so that recomputing the split on the same data gives the same shuffle
TRAIN_TEST_SPLIT_SEED = 42

if FORCE_RECOMPUTE or not (os.path.exists(COMBINED_LABELED_TRAIN_DATASET_PATH) and os.path.exists(COMBINED_LABELED_TEST_DATASET_PATH)):
    # Shuffle labeled dataset (seeded, so the split is reproducible)
    combined_labeled_dataset = combined_labeled_dataset.sample(frac=1, random_state=TRAIN_TEST_SPLIT_SEED).reset_index(drop=True)

    # The first TRAIN_SET_PROPORTION of the shuffled rows is the train set, the rest is the test set
    train_set_end_index = int(len(combined_labeled_dataset) * TRAIN_SET_PROPORTION)
    combined_labeled_train_dataset = combined_labeled_dataset.iloc[:train_set_end_index]
    combined_labeled_test_dataset = combined_labeled_dataset.iloc[train_set_end_index:]

    combined_labeled_train_dataset.to_csv(COMBINED_LABELED_TRAIN_DATASET_PATH, index=False)
    combined_labeled_test_dataset.to_csv(COMBINED_LABELED_TEST_DATASET_PATH, index=False)
else:
    # Load the existing split so both frames are defined whichever branch runs
    combined_labeled_train_dataset = pd.read_csv(COMBINED_LABELED_TRAIN_DATASET_PATH)
    combined_labeled_test_dataset = pd.read_csv(COMBINED_LABELED_TEST_DATASET_PATH)