In [1]:
"""
modified from the script found here:
    https://github.com/bmurauer/authbench/blob/main/scripts/unify_c50.py

dataset access:
    https://archive.ics.uci.edu/ml/datasets/Reuter_50_50
"""
from sklearn.model_selection import train_test_split
import logging
from tqdm import tqdm
import os
import argparse
import random
import numpy as np
from glob import glob
from typing import List, Dict
import re

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

def process_c50(pth, seed=0):
    """Load the Reuters C50 corpus and split it into train/val/test dictionaries.

    ``pth`` must contain the ``C50train`` and ``C50test`` directories, each
    holding one sub-directory of ``*.txt`` files per author. The ``C50train``
    documents are divided 80/20 into train and validation sets, stratified by
    author; the ``C50test`` documents are returned unchanged as the test set.

    Args:
        pth: Root directory of the raw dataset. A ``processed`` sub-directory
            is created inside it if it does not exist yet.
        seed: Random state forwarded to ``train_test_split``.

    Returns:
        ``(train_dict, val_dict, test_dict)``. Each maps an integer author id
        to the list of that author's document texts.
    """
    processed_dir = os.path.join(pth, "processed")
    os.makedirs(processed_dir, exist_ok=True)

    train = os.path.join(pth, "C50train")
    test = os.path.join(pth, "C50test")

    def read(subdir: str, author_ids: dict) -> tuple:
        """Read every author's documents below ``subdir``.

        ``author_ids`` (author directory name -> integer id) is updated in
        place and also returned, so the same mapping can be threaded through
        several calls to keep ids consistent between splits.

        Returns:
            ``(posts, author_ids)`` where ``posts`` maps author id to a list
            of document texts.
        """
        posts = {}
        # os.listdir/glob return entries in arbitrary order; sorting keeps the
        # id assignment and the document order (and therefore the seeded
        # split below) reproducible across machines.
        for author in sorted(os.listdir(subdir)):
            author_dir = os.path.join(subdir, author)
            files = sorted(glob(os.path.join(author_dir, "*.txt")))
            if not files:
                # Stray files or empty directories contribute nothing.
                continue
            # A new author gets the next unused id. Using the position in the
            # directory listing instead could hand an author that only exists
            # in this split an id already owned by a different author.
            author_id = author_ids.setdefault(author, len(author_ids))
            for f in files:
                with open(f) as i_f:
                    posts.setdefault(author_id, []).append(i_f.read())
        return posts, author_ids

    logging.info('getting train and test sets')
    auth_to_id = {}  # shared so author ids are consistent across train and test
    train_and_eval_dict, auth_to_id = read(train, auth_to_id)
    test_dict, auth_to_id = read(test, auth_to_id)

    # Flatten to [author_id, text] pairs so the training data can be split.
    train_and_eval_data = [
        [auth, text]
        for auth, texts in train_and_eval_dict.items()
        for text in texts
    ]

    logging.info('splitting the training data into train/eval sets')
    # Stratified 80% train / 20% validation split of the C50train documents.
    train_set, eval_set = train_test_split(
        train_and_eval_data,
        test_size=0.2,
        shuffle=True,
        random_state=seed,
        stratify=[lbl for lbl, _ in train_and_eval_data],
    )

    # Regroup the flat pairs by author id.
    train_dict = {}
    for auth, text in train_set:
        train_dict.setdefault(auth, []).append(text)

    val_dict = {}
    for auth, text in eval_set:
        val_dict.setdefault(auth, []).append(text)

    return train_dict, val_dict, test_dict


# Run settings for this notebook.
seed = 0  # used for the RNGs and the train/val split
dataset_path = "./ccat50/"  # root folder passed to process_c50
# Folder the pickled splits are written to; lives under the dataset root.
output_path = os.path.join(dataset_path, "processed")

class Namespace:
    """Minimal attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Copy the keyword arguments straight into the instance's attribute dict.
        vars(self).update(kwargs)

# Bundle the run settings into a single attribute-style object.
args = Namespace(
    dataset_path=dataset_path,
    seed=seed,
)

# Seed NumPy's and Python's global RNGs before any data processing happens.
np.random.seed(args.seed)
random.seed(args.seed)

# Load the corpus and obtain the per-author train / validation / test dictionaries.
train_data, val_data, test_data = process_c50(pth=args.dataset_path, seed=args.seed)



In [2]:
import re
import pickle

# Shared Punkt tokenizer, built on first use by _get_sentence_tokenizer().
_SENTENCE_TOKENIZER = None


def _get_sentence_tokenizer():
    """Return the shared Punkt tokenizer, constructing it on the first call."""
    global _SENTENCE_TOKENIZER
    if _SENTENCE_TOKENIZER is None:
        punkt_param = PunktParameters()
        # Tokens registered as abbreviations, so a period after them is not
        # treated as a sentence boundary.
        punkt_param.abbrev_types = {'corp', 'co', "u.s", "inc", "cos", "u.k", "st"}
        _SENTENCE_TOKENIZER = PunktSentenceTokenizer(punkt_param)
    return _SENTENCE_TOKENIZER


def split_sentence(sentence):
    """Split ``sentence`` into its first sentence and the remaining text.

    The tokenizer configuration never changes between calls, so it is built
    once and reused instead of being reconstructed for every document.

    Args:
        sentence: Text to split (may contain several sentences).

    Returns:
        ``(first_part, second_part)``. When the tokenizer finds more than one
        sentence, ``first_part`` is the first one and ``second_part`` is the
        rest joined with single spaces. Otherwise the input is returned
        unchanged together with an empty string.
    """
    sentences = _get_sentence_tokenizer().tokenize(sentence)

    if len(sentences) > 1:
        return sentences[0], ' '.join(sentences[1:])
    return sentence, ''

# Characters deleted outright (not replaced by a space) before whitespace is
# collapsed: tab, carriage return and non-breaking space.
_DROPPED_CHARS = str.maketrans('', '', '\t\r\xa0')


def _normalize_whitespace(text):
    """Delete tab/CR/NBSP characters, then collapse whitespace runs to single spaces."""
    # NOTE(review): deleting these characters can glue neighbouring words
    # together ("a\tb" -> "ab"). Kept as-is so generated outputs do not change.
    return ' '.join(text.translate(_DROPPED_CHARS).split())


def promptify(dset_dict):
    """Turn each document into a ``{"prompt", "output"}`` training example.

    The prompt asks for an article starting with the document's first
    sentence; the output is the whole document. Both go through the same
    whitespace normalisation, so the sentence quoted in the prompt is
    spelled exactly as it appears at the start of the output.

    Args:
        dset_dict: Mapping of author id -> list of document texts.

    Returns:
        Mapping of author id -> list of dicts with the keys ``"prompt"`` and
        ``"output"``, in the same order as the input documents.
    """
    all_outs = {}
    for author in tqdm(dset_dict):
        examples = []
        for text in dset_dict[author]:
            # Only the leading sentence is needed; the remainder is discarded.
            first_sent, _ = split_sentence(text)
            opening = _normalize_whitespace(first_sent)
            examples.append({
                "prompt": f"Write an article that starts with the following: {opening}",
                "output": _normalize_whitespace(text),
            })
        all_outs[author] = examples

    return all_outs

def write_aa_dataset(data: Dict, file_path: str) -> None:
    """Pickle ``data`` to ``file_path``, creating the parent directory if needed.

    Args:
        data: Object to serialise (the per-author prompt/output dictionary).
        file_path: Destination file; an existing file is overwritten.
    """
    import os  # local import so the function does not rely on another cell's imports

    # Without this the write fails whenever the target folder has not already
    # been created by an earlier step.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
        


In [3]:
# Each split paired with the tag used in its output file name.
dataset_procs = [
    (train_data, "train"),
    (val_data, "val"),
    (test_data, "test"),
]

for dataset, name in dataset_procs:
    curr_dataset = promptify(dataset)

    # The validation and test splits keep only the first three examples per author.
    if name == "val" or name == "test":
        curr_dataset = {k: examples[:3] for k, examples in curr_dataset.items()}

    write_aa_dataset(curr_dataset, f"{output_path}/ccat50_{name}.pkl")

100%|██████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 29.24it/s]
100%|█████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 165.67it/s]
100%|██████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 30.83it/s]
