# Properties of the OP comments

* Investigate two properties: *controversy* and *stupidity*
* Both are rated on a 3-point Likert scale

[Not controversial -- Somehow controversial -- Very controversial]

Is this claim/topic reasonable to discuss or is it just stupid/silly?

[Quite stupid -- Neutral -- Quite reasonable]

Examples: "Harry Potter is sexier than Spiderman", "Burritos are better than sandwiches" -- these are quite stupid
"Nations whose leadership is based upon religion are fundamentally backwards" -- quite reasonable topic

## First pilot

* Sample a few hundred OPs (from all OPs, not only from threads containing ad hominem (AH) arguments) for the pilot

In [1]:
# sample random OPs from Reddit for the pilot (a batch of 100, then a further batch of 900)
from AnnotatedRedditComment import AnnotatedRedditComment
from RedditThread import RedditThread
import os
import random

# seed the global random generator with a fixed value so that repeated runs
# draw the same pseudo-random sequence
random.seed(1234)


def select_random_ops(_batch_indices: tuple,
                      main_dir: str = '/home/user-ukp/data2/cmv-full-2017-09-22/',
                      seed: int = 1234) -> list:
    """
    Samples a random list of original posts (OPs) from the complete CMV data.

    The regular files in ``main_dir`` are sorted by name and then shuffled with a
    private random generator seeded by ``seed``. Every call therefore works on
    the same permutation, so batches requested with non-overlapping
    ``_batch_indices`` never share a file, and the global ``random`` state is
    left untouched.

    :param _batch_indices: a tuple (from index, to index); the half-open slice of
        the shuffled file list that is loaded
    :param main_dir: directory containing the thread files (presumably one
        submission per file -- verify against RedditThread)
    :param seed: seed of the private random generator used for shuffling
    :return: list of AnnotatedRedditComment instances which are the OPs; files
        that yield no clean thread are skipped, so the list may be shorter than
        the requested slice
    """
    # os.listdir() returns entries in arbitrary order; sorting first makes the
    # seeded shuffle produce the same permutation on every run and machine
    all_files = sorted(name for name in os.listdir(main_dir)
                       if os.path.isfile(os.path.join(main_dir, name)))
    # a private generator gives every call the same permutation; reshuffling
    # with the global generator would let different batches overlap
    random.Random(seed).shuffle(all_files)
    files = all_files[_batch_indices[0]:_batch_indices[1]]

    result = []

    for file_name in files:
        comments = RedditThread.load_comments_from_file(os.path.join(main_dir, file_name))
        clean_threads = RedditThread.discard_corrupted_threads(
            RedditThread.reconstruct_threads_from_submission(comments))

        if not clean_threads:
            # nothing usable survived the corruption filter
            continue

        # the OP is taken as the first comment of the first clean thread
        op = clean_threads[0].comments[0]
        assert isinstance(op, AnnotatedRedditComment)
        result.append(op)

    return result
    

# sample the two pilot batches and save each of them to its own file:
# one JSON string followed by a newline per OP
for batch_indices, out_path in (
        ((0, 100), "experimental-data/exported-100-random-ops.json"),
        ((100, 1000), "experimental-data/exported-900-random-ops.json"),
):
    random_ops = select_random_ops(batch_indices)
    # explicit encoding keeps the output independent of the locale; the
    # with-statement closes the file, so no explicit close() is needed
    with open(out_path, "w", encoding="utf-8") as f:
        for op in random_ops:
            assert isinstance(op, AnnotatedRedditComment)
            f.write(op.to_json_string())
            f.write("\n")

For analysis of the first pilot, see `stupidity-controversy-pilot-plots.ipynb`

## Sampling a full batch of 2,000 OPs

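Try to balance the OPs: 1,000 OPs from threads with ad hominem (AH) arguments and 1,000 from threads with a delta (in the end only 900 + 900 could be sampled; see the note below the code)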

In [14]:
# re-seed the global random generator with a fixed value so that the shuffles
# performed with the `random` module below are repeatable
random.seed(1234)


def select_random_ops_balanced(*, max_samples_per_type: int = 900,
                               main_dir: str = '/home/user-ukp/data2/cmv-full-2017-09-22/') -> list:
    """
    Samples a balanced random list of original posts (OPs) from the complete CMV
    data: OPs of submissions with ad hominem (AH) but no delta, and OPs of
    submissions with a delta.

    The files are visited in a shuffled order (global ``random`` generator, so
    seed it beforehand for repeatable results) until both groups hold
    ``max_samples_per_type`` OPs or the files are exhausted. The sizes of both
    groups are printed before returning.

    :param max_samples_per_type: maximum number of OPs collected for each group
    :param main_dir: directory containing the thread files
    :return: shuffled list of AnnotatedRedditComment instances which are the OPs
        of both groups
    """
    # os.listdir() returns entries in arbitrary order; sorting first makes the
    # seeded shuffle produce the same permutation on every run and machine
    all_files = sorted(name for name in os.listdir(main_dir)
                       if os.path.isfile(os.path.join(main_dir, name)))
    random.shuffle(all_files)

    # OPs with AH (and no delta) or with delta
    result_with_ah = []
    result_with_delta = []

    for file_name in all_files:
        # stop reading files as soon as both buffers are full
        if len(result_with_ah) >= max_samples_per_type and len(result_with_delta) >= max_samples_per_type:
            break

        comments = RedditThread.load_comments_from_file(os.path.join(main_dir, file_name))
        clean_threads = RedditThread.discard_corrupted_threads(
            RedditThread.reconstruct_threads_from_submission(comments))

        if not clean_threads:
            continue

        # a submission counts as AH/delta if at least one of its threads does
        has_ad_hominem = any(thread.has_some_ad_hominem() for thread in clean_threads)
        has_delta = any(thread.has_some_delta() for thread in clean_threads)

        # the OP is taken as the first comment of the first clean thread
        op = clean_threads[0].comments[0]
        assert isinstance(op, AnnotatedRedditComment)

        # add only if buffer not full; submissions with both AH and delta
        # fall into the delta group
        if has_ad_hominem and not has_delta and len(result_with_ah) < max_samples_per_type:
            result_with_ah.append(op)
        elif has_delta and len(result_with_delta) < max_samples_per_type:
            result_with_delta.append(op)

    print(len(result_with_ah))
    print(len(result_with_delta))

    # mix both groups so that the output order does not reveal the group
    result = result_with_delta + result_with_ah
    random.shuffle(result)

    return result
        

# export and save data to JSON: one JSON string followed by a newline per OP
sampled_balanced_ops = select_random_ops_balanced()

# explicit encoding keeps the output independent of the locale; the
# with-statement closes the file, so no explicit close() is needed
with open("experimental-data/exported-1800-sampled-balanced-ops.json", "w", encoding="utf-8") as f:
    for op in sampled_balanced_ops:
        assert isinstance(op, AnnotatedRedditComment)
        f.write(op.to_json_string())
        f.write("\n")

900
900


* We found that only 967 submissions contain some AHs but no delta at all (aka "very bad discussions")
	* Therefore we sampled 900 + 900 = 1,800 OPs here instead of the planned 2,000

## Exporting data for machine learning experiments

* Let's export annotated data (1,800 instances) and all unlabeled OPs for machine learning experiments
* The output format will be (per line):
	* `op_id  text  gold_label` for labeled data and `op_id  text` for unlabeled data; the fields are tab-delimited and `text` is the title of the OP
	* `gold_label` is a `double` value

In [14]:
import json
import os
from AnnotatedRedditComment import AnnotatedRedditComment
from RedditThread import RedditThread


def _tsv_field(value) -> str:
    """Renders a value as a single TSV cell: tabs and line breaks become spaces."""
    return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


def export_labeled_op(gold_json_file: str, output_tsv_gold: str, output_tsv_unlabeled: str,
                      main_dir: str = '/home/user-ukp/data2/cmv-full-2017-09-22/') -> None:
    """
    Exports the OP of every submission in ``main_dir`` into one of two
    tab-delimited files, depending on whether a gold label exists for it.

    Labeled OPs are written as ``name <TAB> title <TAB> gold_label``, all the
    others as ``name <TAB> title``. Tabs and line breaks inside a title are
    replaced by spaces so that each OP occupies exactly one well-formed row.

    :param gold_json_file: JSON file with a mapping from OP name to gold label
    :param output_tsv_gold: output file for the labeled OPs (overwritten)
    :param output_tsv_unlabeled: output file for the unlabeled OPs (overwritten)
    :param main_dir: directory containing the thread files
    :return: None
    """
    # load gold annotations first
    with open(gold_json_file, encoding='utf-8') as f_json:
        gold_labels = json.load(f_json)

    # sorted so that the row order does not depend on the arbitrary order in
    # which os.listdir() reports the directory entries
    all_files = sorted(name for name in os.listdir(main_dir)
                       if os.path.isfile(os.path.join(main_dir, name)))

    # open the output files; explicit encoding keeps them independent of the
    # locale and the with-statement closes both of them
    with open(output_tsv_gold, 'w', encoding='utf-8') as f_gold, \
            open(output_tsv_unlabeled, 'w', encoding='utf-8') as f_unlabeled:

        # read all threads
        for file_name in all_files:
            comments = RedditThread.load_comments_from_file(os.path.join(main_dir, file_name))
            clean_threads = RedditThread.discard_corrupted_threads(
                RedditThread.reconstruct_threads_from_submission(comments))

            if not clean_threads:
                continue

            # the OP is taken as the first comment of the first clean thread
            op = clean_threads[0].comments[0]
            assert isinstance(op, AnnotatedRedditComment)

            title = _tsv_field(op.title)

            # now we have the OP -- let's find its gold label
            if op.name in gold_labels:
                # and write to the gold file
                f_gold.write("%s\t%s\t%s\n" % (op.name, title, gold_labels[op.name]))
            else:
                # or as unlabeled
                f_unlabeled.write("%s\t%s\n" % (op.name, title))


# run the export once per annotated property; each entry holds
# (gold annotations JSON, labeled TSV output, unlabeled TSV output)
for export_args in (
        # stupidity
        ('experimental-data/annotated-1800-sampled-balanced-ops-stupidity.json',
         'experimental-data/op-stupidity-controversy-prediction/stupidity-gold.tsv',
         'experimental-data/op-stupidity-controversy-prediction/stupidity-unlabeled.tsv'),
        # controversy
        ('experimental-data/annotated-1800-sampled-balanced-ops-controversy.json',
         'experimental-data/op-stupidity-controversy-prediction/controversy-gold.tsv',
         'experimental-data/op-stupidity-controversy-prediction/controversy-unlabeled.tsv'),
):
    export_labeled_op(*export_args)