# Triggers of AH - quantitative analysis

Terminology: **AH(-1)** is a comment preceding AH ("causing"); **Delta** is a delta-awarded comment

* We sample all AH(-1) and their most similar Delta comments
* As in previous sampling, we filter out
	* extremely short and extremely long comments
	* threads with deleted comments
	* threads longer than 200 comments
	* cases where AH(-1) would be the original post


In [3]:
import random

from AnnotatedRedditComment import AnnotatedRedditComment
from RedditThread import RedditThread
import os
import pickle

# First sample all instances and store them to pickle.
# This file is the cache: sample_validation_instances() writes it, and later runs
# load it instead of sampling again.
pickle_file = "ah-minus1-delta-instances-all.pkl"


def sample_validation_instances(main_dir='/home/user-ukp/data2/cmv-full-2017-09-22/', output_file=None):
    """Sample AH(-1) and Delta instances from every thread file and pickle them.

    Each regular file in ``main_dir`` is loaded via ``RedditThread``, threads with
    more than 200 (or zero) comments are dropped, and only comments whose stripped
    body is longer than 20 and shorter than 2000 characters are considered.

    :param main_dir: directory containing the thread files (one submission per file)
    :param output_file: path of the pickle to write; defaults to the module-level
        ``pickle_file`` when ``None``
    :return: tuple ``(ah_minus1_instances, delta_instances)`` of two shuffled lists;
        the same tuple is pickled to ``output_file``
    """
    if output_file is None:
        output_file = pickle_file

    file_names = [name for name in os.listdir(main_dir) if os.path.isfile(os.path.join(main_dir, name))]

    _ah_minus1_instances = set()
    _delta_instances = set()

    for file_name in file_names:
        comments = RedditThread.load_comments_from_file(os.path.join(main_dir, file_name))
        clean_threads = RedditThread.discard_corrupted_threads(
            RedditThread.reconstruct_threads_from_submission(comments))

        # sanity check: no thread that survived the clean-up may contain deleted comments
        for clean_thread in clean_threads:
            assert isinstance(clean_thread, RedditThread)

            has_deleted_comments = clean_thread.has_deleted_comments()
            assert not has_deleted_comments, ["(%d) '%s'" % (len(comment.body), comment.body) for comment in clean_thread.comments]

        # remove outliers (threads longer than 200 comments)
        clean_threads = [thread for thread in clean_threads if 200 >= len(thread.comments) > 0]

        # comments by ids; extremely short and extremely long comments are ignored here
        comments_by_id = {comment.name: comment for comment in RedditThread.collect_all_comments(clean_threads)
                          if 20 < len(comment.body.strip()) < 2000}

        for comment in comments_by_id.values():
            assert isinstance(comment, AnnotatedRedditComment)
            assert len(comment.body.strip()) > 0

            if comment.delta:
                _delta_instances.add(comment)
            elif comment.violated_rule == 2:
                # the previous comment must exist and must not have been filtered out
                parent_comment = comments_by_id.get(comment.parent_id)
                if parent_comment is None:
                    continue
                assert isinstance(parent_comment, AnnotatedRedditComment)

                # and only if the parent is not OP (to avoid first-level AHs)
                if parent_comment.parent_id:
                    # NOTE(review): the stored instance is the comment with
                    # violated_rule == 2 itself, not `parent_comment`, although the
                    # notebook text defines AH(-1) as the comment preceding AH.
                    # Behaviour kept as-is; confirm which comment is intended.
                    _ah_minus1_instances.add(comment)

    # convert to lists and shuffle (no seed is set in this function)
    _ah_minus1_instances = list(_ah_minus1_instances)
    _delta_instances = list(_delta_instances)
    random.shuffle(_ah_minus1_instances)
    random.shuffle(_delta_instances)

    # the context manager closes the file; no explicit close() is needed
    with open(output_file, "wb") as out:
        pickle.dump((_ah_minus1_instances, _delta_instances), out)

    return _ah_minus1_instances, _delta_instances
        

# Sample from scratch only when no cached pickle is available; otherwise reuse
# the previously stored (AH(-1), Delta) instance lists.
if not os.path.isfile(pickle_file):
    ah_minus1_instances, delta_instances = sample_validation_instances()
else:
    with open(pickle_file, "rb") as cache_file:
        cached_instances = pickle.load(cache_file)
    ah_minus1_instances, delta_instances = cached_instances


print("AH(-1) instances", len(ah_minus1_instances))
print("Delta instances", len(delta_instances))

AH(-1) instances 2994
Delta instances 16559


* Now we have to run the parallelized script on the cluster to compute all distances

`python3 semantic-similarity-computation-parallel-step1.py` (modify the main method)

* It will produce 5 files with distances

In [6]:
import pickle
import random

random.seed(12345)

# Merge the five partial distance files into a single mapping:
# positive instance ID -> {negative instance ID -> distance}
positive_to_negative_distances = {}

for part in range(5):
    file_name = "ah-minus1-delta-distances_%d.pkl" % part

    with open(file_name, "rb") as f:
        current_dict = pickle.load(f)

    assert isinstance(current_dict, dict)
    print("Loaded %d keys from file %s" % (len(current_dict), file_name))
    # add to the final mapping
    positive_to_negative_distances.update(current_dict)
    print("Total size now is %d" % len(positive_to_negative_distances))

print(len(positive_to_negative_distances))

# Pool of negative instance IDs, taken from the first positive's distance table.
# IDs are removed from the pool as soon as they are drawn.
negative_instances_ids = set(next(iter(positive_to_negative_distances.values())).keys())

print("Negative instances", len(negative_instances_ids))

# An earlier version drew a random subset of 2,400 positives here;
# now all positive instances are kept.
random_positive_instance_ids = positive_to_negative_distances

# list of tuples (positive_instance_id, negative_instance_id)
samples = []

for positive_instance_id in random_positive_instance_ids:
    distances = positive_to_negative_distances[positive_instance_id]
    assert isinstance(distances, dict)
    assert all(isinstance(value, float) for value in distances.values())

    # restrict the candidates to negatives that have not been drawn yet
    remaining = {negative_id: float(distances[negative_id]) for negative_id in negative_instances_ids}

    # greedy choice: the closest negative that is still available
    negative_instance_id = min(remaining, key=remaining.get)

    # each negative may be paired only once
    negative_instances_ids.remove(negative_instance_id)

    samples.append((positive_instance_id, negative_instance_id))

print(len(samples))


Loaded 605 keys from file ah-minus1-delta-distances_0.pkl
Total size now is 605


Loaded 605 keys from file ah-minus1-delta-distances_1.pkl
Total size now is 1210


Loaded 605 keys from file ah-minus1-delta-distances_2.pkl
Total size now is 1815


Loaded 605 keys from file ah-minus1-delta-distances_3.pkl
Total size now is 2420


Loaded 574 keys from file ah-minus1-delta-distances_4.pkl
Total size now is 2994
2994
Negative instances 16559
2994


In [8]:
# The positive and negative instance lists are still in memory from the first
# cell, so there is no need to read the full CMV data again.
assert isinstance(ah_minus1_instances[0], AnnotatedRedditComment)
assert isinstance(delta_instances[0], AnnotatedRedditComment)

# group by ID (the comment name is the key used in `samples`)
positive_instances_by_id = {_.name: _ for _ in ah_minus1_instances}
negative_instances_by_id = {_.name: _ for _ in delta_instances}


# flatten the pairs: every positive instance is directly followed by its negative
list_for_export = []

for (positive_id, negative_id) in samples:
    list_for_export.append(positive_instances_by_id[positive_id])
    list_for_export.append(negative_instances_by_id[negative_id])


print(len(list_for_export))

# Write one to_json_string() result per line. The encoding is fixed to UTF-8 so
# the output does not depend on the platform's locale; the context manager
# closes the file.
with open("experimental-data/exported-2994-sampled-pairs-ah-minus1-and-delta-context.json", "w",
          encoding="utf-8") as f:
    for comment in list_for_export:
        assert isinstance(comment, AnnotatedRedditComment)
        f.write(comment.to_json_string())
        f.write("\n")

5988
