In [1]:
"""
This script was adapted from:
    https://github.com/bmurauer/authbench/blob/main/scripts/unify_cmcc.py

Dataset Access
    Email the authors of: "Creating and Using a Correlated Corpus to Glean Communicative Commonalities"
    http://www.lrec-conf.org/proceedings/lrec2008/pdf/771_paper.pdf
"""
import os
import re
from glob import glob
from typing import Dict, Any
# from valla.utils.dataset_utils import finalize_cross_dataset, list_dset_to_dict, auth_text_make_unique
from sklearn.model_selection import train_test_split

import pandas as pd
import argparse
import random
import numpy as np
from typing import List, Dict, Union

# Question text for each CMCC topic, keyed by the single-letter topic code that
# appears in the corpus file names (process_cmcc looks the letter up from the
# file name when it builds a sample's "prompt" field).
prompt = {
    "C": "Do you feel the Catholic Church needs to change its ways to adapt to life in the 21st Century?",
    "G": "While some states have legalized gay marriage, others are still opposed to it. Do you think either side is right or wrong?",
    "P": "Recently, school officials prevented a school shooting because one of the shooters posted a myspace bulletin. Do you think this was an invasion of privacy?",
    "M": "The city of Denver has decided to legalize small amounts of marijuana for persons over 21. How do you feel about this?",
    "I": "The controversial war in Iraq has made news headlines almost every day since it began. How do you feel about the war?",
    "S": "Do you feel that gender discrimination is still an issue in the present-day United States?"
}


# Instruction prefix for each corpus category (sub-folder name). Despite the
# variable name, the keys are categories/genres, not topics; the prefix is
# concatenated directly in front of the matching `prompt` question.
# NOTE(review): "to a someone" looks like a typo, but the string is emitted into
# the generated dataset, so it is deliberately left unchanged here.
topic = {
    "Emails": "Write an email response to a someone who asked you the following: ",
    "Essays": "Write an approximately 500 word essay to the following prompt: "
}


def list_dset_to_dict(data: List[List[Union[int, str]]]) -> Dict:
    """Group a flat list of ``[author, text]`` pairs by author.

    Args:
        data: Iterable of two-item sequences ``(author, text)``.

    Returns:
        A dict mapping every author to the list of its texts. Authors appear
        in first-seen order and each author's texts keep their input order.
    """
    grouped = {}
    for pair in data:
        author, sample = pair
        if author in grouped:
            grouped[author].append(sample)
        else:
            grouped[author] = [sample]
    return grouped


def dict_dset_to_list(data: Dict) -> List[List[Union[int, str]]]:
    """Flatten an ``{author: [text, ...]}`` mapping into ``[author, text]`` pairs.

    Args:
        data: Mapping from author to an iterable of that author's texts.

    Returns:
        A list of two-item lists, ordered by the mapping's iteration order and,
        within one author, by the order of that author's texts.
    """
    return [
        [author, sample]
        for author, samples in data.items()
        for sample in samples
    ]


def auth_text_make_unique(data: Dict):
    """Remove duplicate texts per author while keeping first-occurrence order.

    The previous ``list(set(texts))`` returned the surviving texts in arbitrary
    set order, which for strings varies between interpreter runs, so downstream
    splits were not reproducible even with fixed seeds. ``dict.fromkeys`` keeps
    the order in which texts were first seen.

    Args:
        data: Mapping from author to a list of hashable texts.

    Returns:
        A new dict with the same authors, each mapped to a new list containing
        every distinct text once.

    Raises:
        TypeError: If a text is not hashable (unchanged from the set-based
            implementation).
    """
    return {author: list(dict.fromkeys(texts)) for author, texts in data.items()}
        
    
def check_or_fix_dataset_typo(directory: str) -> None:
    """
    Rename the single misnamed corpus file, if it is still present.

    The corpus ships one file, 'S1D113.txt', that does not follow the naming
    convention explained in FileCodingSchemes3.doc: it should be called
    'S1D1I3.txt', with an upper case i instead of the digit one.
    This code was tested on CMCCData.zip with a md5 checksum of:
        157586057cf4ad3dc1876890e94373a5

    NOTE(review): the original description located the file under
    'Discussions/Correlated/', while the code looks in 'Discussion/Correlated/'.
    Confirm the folder name against the unpacked archive.

    Nothing happens when the misnamed file does not exist.
    """
    correlated = os.path.join(directory, "Discussion", "Correlated")
    wrong = os.path.join(correlated, "S1D113.txt")
    right = os.path.join(correlated, "S1D1I3.txt")

    # Already fixed (or folder absent): leave the tree untouched.
    if not os.path.isfile(wrong):
        return

    print(f"renaming {wrong} to {right}")
    os.rename(wrong, right)


def process_cmcc(pth: str) -> Dict[str, Dict[Any, Any]]:
    """Read the correlated CMCC e-mails and essays and group them by author.

    For every ``*.txt`` file under ``<pth>/Emails/Correlated`` and
    ``<pth>/Essays/Correlated`` the author code is parsed from the file name
    and mapped to a consecutive integer id. Each file becomes one sample dict
    with a ``"prompt"`` (category instruction followed by the topic question)
    and an ``"output"`` (the stripped file content).

    Side effects: creates ``<pth>/processed`` if it is missing, runs
    ``check_or_fix_dataset_typo`` on ``pth`` and prints the number of samples.

    Args:
        pth: Root directory of the unpacked corpus.

    Returns:
        ``{'train': {author_id: [sample_dict, ...]}}``.

    Raises:
        ValueError: If a file name does not follow the expected naming scheme.
        KeyError: If a file's topic letter has no entry in ``prompt``.
    """
    os.makedirs(os.path.join(pth, "processed"), exist_ok=True)
    directory = pth

    check_or_fix_dataset_typo(directory)

    train_posts = {}
    auth_to_id = {}
    categories = ["Emails", "Essays"]
    train_categories, train_texts = ["Emails", "Essays"], 0

    # Compiled once: the naming scheme is the same for every category.
    # The dot is escaped so that only a literal ".txt" suffix is accepted.
    pattern = re.compile(
        r"(?P<author>[A-Z]\d+)(?P<genre>[A-Z])\d+(?P<topic>[A-Z])\d+\.txt"
    )

    for category in categories:
        correlated_dir = os.path.join(directory, category, "Correlated")
        # glob returns files in arbitrary, filesystem-dependent order. Author
        # ids are handed out in order of first appearance, so sort to keep the
        # id assignment (and everything derived from it) reproducible.
        files = sorted(glob(os.path.join(correlated_dir, "*.txt")))

        for f in files:
            # the files are windows-1252-encoded.
            with open(f, "rb") as i_f:
                try:
                    text_raw = i_f.read().decode("cp1252")
                except Exception:
                    # Name the offending file before propagating the error.
                    print(f)
                    raise

            name = os.path.basename(f)
            match = pattern.match(name)
            if not match:
                raise ValueError("no match found for file: " + f)

            # First time an author code is seen it receives the next free id.
            a = auth_to_id.setdefault(match.group("author"), len(auth_to_id))

            if category in train_categories:
                # Take the topic letter from the parsed name rather than from a
                # fixed character position, which silently breaks as soon as
                # the trailing number has more than one digit.
                question = prompt[match.group("topic")]
                train_posts.setdefault(a, []).append(
                    {
                        "prompt": f"{topic[category]}{question}",
                        "output": text_raw.strip()
                    }
                )
                train_texts += 1

    print(f'there are {train_texts} iid texts')

    return {
        'train': train_posts
    }


# Root folder of the unpacked CMCC corpus.
dataset_path = "./cmcc/"
# Seed applied to `random`, NumPy and the train/val/test split.
seed = 0
# Folder that receives the pickled splits; it lives inside the corpus root.
output_path = os.path.join(dataset_path, "processed")

class Namespace:
    """Minimal attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance dict, one attribute per keyword.
        for key, value in kwargs.items():
            self.__dict__[key] = value

# Bundle the configuration in an object with attribute access.
args = Namespace(dataset_path=dataset_path, seed=seed)

# Seed both the stdlib and the NumPy global generators.
random.seed(args.seed)
np.random.seed(args.seed)

all_data = process_cmcc(args.dataset_path)

# process_cmcc only returns the iid ('train') portion, so a single problem is
# split here: flatten it into [author, sample] pairs, hold out 40% stratified
# by author, then halve the held-out part into validation and test.
iid_data = [
    [auth, text]
    for auth, texts in all_data['train'].items()
    for text in texts
]
iid_labels = [lbl for lbl, _ in iid_data]
train_set, eval_and_test_set = train_test_split(
    iid_data,
    test_size=0.4,
    shuffle=True,
    random_state=args.seed,
    stratify=iid_labels,
)
held_out_labels = [lbl for lbl, _ in eval_and_test_set]
eval_set, test_set = train_test_split(
    eval_and_test_set,
    test_size=0.5,
    shuffle=True,
    random_state=args.seed,
    stratify=held_out_labels,
)

# Now finalize the dataset: normalise the whitespace of every sample.
# The sample dicts are updated in place. They are the same objects that were
# put into iid_data above, so the split lists presumably see the cleaned text
# as well (verify if train_test_split ever starts copying its inputs).
# Tab, carriage return and non-breaking space are deleted outright, not
# replaced by a space.
_DROPPED_CHARS = {ord('\t'): None, ord('\r'): None, ord('\xa0'): None}

original_data = []
for dset_name, dset in all_data.items():
    for auth, texts in dset.items():
        for text in texts:
            cleaned = re.sub(' +', ' ', text["output"])
            cleaned = cleaned.translate(_DROPPED_CHARS)
            # Collapse every remaining whitespace run (newlines included)
            # into a single space and trim both ends.
            text["output"] = ' '.join(cleaned.split())
            original_data.append([auth, text])

save_path = os.path.join(args.dataset_path, 'processed')

there are 252 iid texts


In [2]:
# Regroup the flat [author, sample] lists into {author: [sample, ...]} dicts.
# `original_data` is rebound from a list to a dict under the same name, so the
# conversion is guarded: re-running this cell would otherwise iterate over the
# dict's integer keys and fail while unpacking them.
if not isinstance(original_data, dict):
    original_data = list_dset_to_dict(original_data)
train_data = list_dset_to_dict(train_set)
val_data = list_dset_to_dict(eval_set)
test_data = list_dset_to_dict(test_set)

In [3]:
import pickle
def write_aa_dataset(data: Dict, file_path: str) -> None:
    """Pickle ``data`` to ``file_path``, replacing any existing file.

    Args:
        data: Object to serialise (here an ``{author: [sample, ...]}`` dict).
        file_path: Destination path; its parent directory must already exist.
    """
    with open(file_path, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)
        
# Each split paired with the tag that goes into its file name.
dataset_procs = list(
    zip(
        (train_data, val_data, test_data),
        ("train", "val", "test"),
    )
)

# One pickle per split: <output_path>/cmcc_<tag>.pkl
for dataset, name in dataset_procs:
    write_aa_dataset(dataset, f"{output_path}/cmcc_{name}.pkl")

In [4]:
# Display the integer author ids present in the training split.
train_data.keys()

dict_keys([20, 16, 17, 7, 15, 10, 9, 14, 6, 8, 13, 19, 18, 4, 1, 5, 0, 11, 3, 2, 12])

In [5]:
# Print the number of training samples held for each author.
for samples in train_data.values():
    print(f"XX: {len(samples)}")

XX: 8
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 8
XX: 8
XX: 7
XX: 7
XX: 7
XX: 8
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7
XX: 7


In [6]:
import pickle

# Sanity check: for every processed dataset, print the size of each entry of
# its pickled test split.
# pickle.load can execute arbitrary code stored in the file, so only point this
# at files produced by these preprocessing notebooks.
for n in ("cmcc", "guardian", "ccat50"):
    print(n)
    with open(f"./{n}/processed/{n}_test.pkl", 'rb') as pickle_file:
        data = pickle.load(pickle_file)

    for author in data:
        print(len(data[author]))

cmcc
3
3
2
3
3
3
2
2
3
2
2
3
2
2
3
2
2
3
2
2
2
guardian
5
4
3
5
6
4
6
4
5
5
4
3
4
ccat50
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
