* Find threads with at least one AH
* Different context sizes - the larger the context, the fewer instances (e.g. context 4 gives ~1000 instances)

In [35]:
from RedditThread import RedditThread
import os
import pickle

def filter_duplicate_threads(threads: list) -> list:
    """
    Removes duplicate threads; two threads are duplicates if they end with the same comment
    :param threads: list of thread objects providing get_last_comment_name(), unchanged
    :return: new list with exactly one thread per distinct last-comment name
             (if several threads share the name, the one occurring last in the input is kept)
    """
    unique_threads = {}
    for thread in threads:
        # a later thread with the same last comment replaces the earlier one
        unique_threads[thread.get_last_comment_name()] = thread
    return [*unique_threads.values()]


# How many comments of history a sampled thread must have; this value is used for slicing the threads
# and is also part of the names of the cache, distance and output files.
context_size = 2
# Number of AH threads obtained with different settings:
# context 3 gives 1291 threads -> it's ok
# context 2 gives 1698 threads -> but that's too small, isn't it?
# context 1 gives 3559 threads -> but that's too short a context (only the last argument / or delta)


def filter_only_two_person_threads(threads: list) -> list:
    """
    Keeps only those threads whose comments were written by at most two distinct authors
    :param threads: list of RedditThread instances, unchanged
    :return: new list of RedditThread instances, in the same relative order as in the input
    """

    def count_distinct_authors(thread) -> int:
        # authors are identified by the author_name of each comment
        return len({comment.author_name for comment in thread.comments})

    return [thread for thread in threads if count_distinct_authors(thread) <= 2]


def sample_ah_threads_instances(main_dir: str = '/home/user-ukp/data2/cmv-full-2017-09-22/', history_size=None):
    """
    Extracts short thread excerpts that end with the first ad hominem (AH) comment or the first delta comment

    Every regular file in main_dir is loaded, its threads are reconstructed and corrupted ones discarded.
    For each remaining thread:
      * if it contains an AH comment and the first one is preceded by at least history_size comments, the
        excerpt consists of these history_size comments plus the AH comment itself;
      * otherwise, if it contains a delta comment and the first one is preceded by at least
        (history_size - 1) comments, the excerpt consists of these (history_size - 1) comments plus the
        delta comment itself.
    A thread containing both AH and delta comments is only considered as an AH thread.

    :param main_dir: directory with the input files (defaults to the originally hard-coded location)
    :param history_size: required length of the history; None (default) means the module-level context_size
    :return: tuple (threads_with_ah, threads_with_delta); both are lists of RedditThread instances without
             duplicates (by last comment) and with at most two authors each
    """
    if history_size is None:
        history_size = context_size

    # os.listdir returns entries in arbitrary order; sorting makes the order of the result reproducible
    files = sorted(f for f in os.listdir(main_dir) if os.path.isfile(os.path.join(main_dir, f)))

    # now extract all AH threads
    threads_with_ah = []
    threads_with_delta = []

    for f in files:
        comments = RedditThread.load_comments_from_file(os.path.join(main_dir, f))
        clean_threads = RedditThread.discard_corrupted_threads(
            RedditThread.reconstruct_threads_from_submission(comments))

        # sanity check: no thread with deleted comments may be left after cleaning
        for clean_thread in clean_threads:
            assert isinstance(clean_thread, RedditThread)

            has_deleted_comments = clean_thread.has_deleted_comments()
            assert not has_deleted_comments, ["(%d) '%s'" % (len(comment.body), comment.body) for comment in
                                              clean_thread.comments]

        for thread in clean_threads:
            assert isinstance(thread, RedditThread)
            if thread.has_some_ad_hominem():
                first_ah_index = thread.get_positions_of_ad_hominem_comments()[0]

                if first_ah_index >= history_size:
                    # extract comment history (plus the AH comment) and create a new thread
                    shortened_thread = RedditThread()
                    shortened_thread.comments = thread.comments[first_ah_index - history_size:first_ah_index + 1]

                    threads_with_ah.append(shortened_thread)
            elif thread.has_some_delta():
                first_delta_index = thread.get_positions_of_delta_comments()[0]

                # the delta comment is the last of the history_size comments kept here, so only
                # (history_size - 1) comments are required before it
                if first_delta_index >= (history_size - 1):
                    # extract comment history and create a new thread
                    shortened_thread = RedditThread()
                    shortened_thread.comments = thread.comments[
                                                first_delta_index - (history_size - 1):first_delta_index + 1]

                    threads_with_delta.append(shortened_thread)

    # there are duplicates!!! - filter by the last comment
    threads_with_ah = filter_only_two_person_threads(filter_duplicate_threads(threads_with_ah))
    threads_with_delta = filter_only_two_person_threads(filter_duplicate_threads(threads_with_delta))

    return threads_with_ah, threads_with_delta


# Cache the sampled threads on disk (one pickle per context size) so that the corpus does not have to be
# processed again on the next run.
pickle_file = "threads-with-ah-threads-with-delta-context%d.pkl" % context_size
if os.path.exists(pickle_file):
    # NOTE: unpickling can execute arbitrary code - only load cache files created by this notebook
    with open(pickle_file, "rb") as f:
        threads_with_ah, threads_with_delta = pickle.load(f)
else:
    threads_with_ah, threads_with_delta = sample_ah_threads_instances()
    # the file is closed by the 'with' statement, no explicit close() is needed
    with open(pickle_file, "wb") as f:
        pickle.dump((threads_with_ah, threads_with_delta), f)

print("All AH threads:", len(threads_with_ah))
print("All delta threads:", len(threads_with_delta))


All AH threads: 1698
All delta threads: 19533


* Now we must compute the semantic similarity using an external script

In [26]:
import pickle
import random

random.seed(12345)

# the distances are stored in several files with suffixes _0 ... _3
distance_file_count = 4

# positive (AH) instance ID -> dict {negative (delta) instance ID -> distance}
positive_to_negative_distances = dict()

for i in range(distance_file_count):
    file_name = "threads-with-ah-threads-with-delta-context%d-distances_%d.pkl" % (context_size, i)

    # NOTE: unpickling can execute arbitrary code - only load files produced by the trusted external script
    with open(file_name, "rb") as f:
        current_dict = pickle.load(f)
    assert isinstance(current_dict, dict)
    print("Loaded %d keys from file %s" % (len(current_dict), file_name))
    # add to the final
    positive_to_negative_distances.update(current_dict)
    print("Total size now is %d" % len(positive_to_negative_distances))

print(len(positive_to_negative_distances))

# get a set of all negative instances so we can discard them after draw
# (taken from the first positive instance; all other positive instances must provide a distance
# to each of these negative instances, otherwise the lookup below raises KeyError)
first_key = next(iter(positive_to_negative_distances))
negative_instances_ids = set(positive_to_negative_distances[first_key].keys())

print("Negative instances", len(negative_instances_ids))

# a list of tuples (positive_id, negative_id)
samples = []

# Greedy matching: each positive instance gets the closest negative instance that has not been drawn yet,
# so every negative instance is used at most once.
for positive_instance_id, distances in positive_to_negative_distances.items():
    assert isinstance(distances, dict)
    assert all(isinstance(_, float) for _ in distances.values())

    if not negative_instances_ids:
        raise ValueError("No unused negative instance left for positive instance %s" % (positive_instance_id,))

    # find the closest negative instance among those which have not been drawn yet
    # (searching the remaining IDs directly avoids building a new dict in every iteration)
    negative_instance_id = min(negative_instances_ids, key=distances.__getitem__)

    # delete sampled item
    negative_instances_ids.remove(negative_instance_id)

    # and add the new sampled tuple
    samples.append((positive_instance_id, negative_instance_id))

print(len(samples))

Loaded 425 keys from file threads-with-ah-threads-with-delta-context2-distances_0.pkl
Total size now is 425


Loaded 425 keys from file threads-with-ah-threads-with-delta-context2-distances_1.pkl
Total size now is 850


Loaded 425 keys from file threads-with-ah-threads-with-delta-context2-distances_2.pkl
Total size now is 1275


Loaded 423 keys from file threads-with-ah-threads-with-delta-context2-distances_3.pkl
Total size now is 1698
1698
Negative instances 19533


1698


* Sample the threads and save them to files
* Each thread is written to a separate JSON file consisting of its comments, one comment per line

In [27]:
import json

from AnnotatedRedditComment import AnnotatedRedditComment

# Write every sampled (AH thread, delta thread) pair to the output folder; both files of a pair share
# the same numeric prefix.
output_folder = "/tmp/sampled-threads-ah-delta-context%d" % context_size

# exist_ok avoids the race between checking for the folder and creating it
os.makedirs(output_folder, exist_ok=True)

assert isinstance(samples[0], tuple)
assert isinstance(threads_with_ah[0], RedditThread)
assert isinstance(threads_with_delta[0], RedditThread)

# threads are identified by the name of their last comment
threads_ah_by_id = {t.get_last_comment_name(): t for t in threads_with_ah}
threads_delta_by_id = {t.get_last_comment_name(): t for t in threads_with_delta}

for counter, (ah_id, delta_id) in enumerate(samples):
    # these are the two similar instances (both are looked up before anything is written)
    thread_ah = threads_ah_by_id[ah_id]
    thread_delta = threads_delta_by_id[delta_id]

    for label, thread_id, thread in (("ah", ah_id, thread_ah), ("delta", delta_id, thread_delta)):
        output_file = os.path.join(output_folder, "%d_%s_%s.json" % (counter, label, thread_id))
        # explicit UTF-8 so the output does not depend on the locale of the machine;
        # the file is closed by the 'with' statement
        with open(output_file, "w", encoding="utf-8") as f:
            for c in thread.comments:
                assert isinstance(c, AnnotatedRedditComment)
                # one comment per line
                f.write(c.to_json_string() + "\n")