# Verification of AH labels
* How trustworthy are they?
* Re-label some of the AH comments (MTurk); also sample some negative instances

## Negative instances sampling criteria
* The sampled distribution must be the same in terms of length (mean + std. dev.)
* Sample delta comments (= these are high quality and manually verified)
* Sample other removed comments that carry a different label:
	* 5: 'Low effort post'
	* 1: 'Direct responses must challenge OP'

In [3]:
from AnnotatedRedditComment import AnnotatedRedditComment
from RedditThread import RedditThread
import os
import matplotlib.pyplot as plt
% matplotlib inline
import pickle

# First sample all instances and store them to pickle.
# This is the cache file: sample_validation_instances() dumps the
# (positive, negative) instance sets into it, and the code below loads the
# instances from it when the file already exists.
pickle_file = "ah-positive-negative-instances-all.pkl"


def sample_validation_instances(main_dir='/home/user-ukp/data2/cmv-full-2017-09-22/', output_file=None):
    """Collect positive and negative comment instances from a CMV dump and cache them.

    Every regular file directly inside ``main_dir`` is loaded with
    ``RedditThread.load_comments_from_file``; threads are reconstructed and
    corrupted ones discarded. Threads with more than 200 comments (or none)
    are dropped, and only comments whose stripped body is longer than 20 and
    shorter than 2000 characters are considered.

    * A comment is *negative* if ``violated_rule`` is 1 or 5, or if it has a
      truthy ``delta`` attribute.
    * Otherwise it is *positive* if ``violated_rule`` is 2 (presumably the
      ad hominem rule -- confirm against the label scheme).

    :param main_dir: directory containing the dump files (sub-directories
        are ignored). Defaults to the original hard-coded location.
    :param output_file: path of the pickle to write; when ``None`` the
        module-level ``pickle_file`` is used, as before.
    :return: tuple ``(positive, negative)`` of sorted lists of comments.
    :raises AssertionError: if a reconstructed thread is not a
        ``RedditThread``, still contains deleted comments, or a selected
        comment is not an ``AnnotatedRedditComment``.

    NOTE: the pickle stores the two (unordered) sets, whereas the return
    value is sorted; sorting relies on the comment objects being orderable.
    """
    if output_file is None:
        # resolved at call time so the module-level default keeps working
        output_file = pickle_file

    file_names = [name for name in os.listdir(main_dir)
                  if os.path.isfile(os.path.join(main_dir, name))]

    _positive_instances = set()
    _negative_instances = set()

    for file_name in file_names:
        comments = RedditThread.load_comments_from_file(os.path.join(main_dir, file_name))
        clean_threads = RedditThread.discard_corrupted_threads(
            RedditThread.reconstruct_threads_from_submission(comments))

        # sanity check: cleaned threads must not contain deleted comments
        for clean_thread in clean_threads:
            assert isinstance(clean_thread, RedditThread)

            has_deleted_comments = clean_thread.has_deleted_comments()
            assert not has_deleted_comments, [
                "(%d) '%s'" % (len(comment.body), comment.body) for comment in clean_thread.comments]

        # remove outliers (threads longer than 200 comments)
        clean_threads = [thread for thread in clean_threads if 200 >= len(thread.comments) > 0]

        for comment in RedditThread.collect_all_comments(clean_threads):
            # let's ignore extremely short and extremely long comments
            if 20 < len(comment.body.strip()) < 2000:
                assert isinstance(comment, AnnotatedRedditComment)

                label = comment.violated_rule

                if label in (1, 5) or comment.delta:
                    _negative_instances.add(comment)
                elif label == 2:
                    _positive_instances.add(comment)

    # the context manager closes the file; no explicit close() is needed
    with open(output_file, "wb") as pickle_out:
        pickle.dump((_positive_instances, _negative_instances), pickle_out)

    # sort them as lists to ensure the non-random ordering if they were sets
    return sorted(_positive_instances), sorted(_negative_instances)
        

# Lists rather than sets, because Python sets have no defined order.
positive_instances, negative_instances = [], []

# Sample from the raw dump only when no cached pickle exists yet
# (sample_validation_instances() writes the pickle as a side effect);
# otherwise read the cached instances back.
if not os.path.isfile(pickle_file):
    positive_instances, negative_instances = sample_validation_instances()
else:
    with open(pickle_file, "rb") as cache_file:
        positive_instances, negative_instances = pickle.load(cache_file)

# The pickle holds sets while the sampler returns lists; normalise both to lists.
positive_instances, negative_instances = list(positive_instances), list(negative_instances)

print("Positive AH instances", len(positive_instances))
print("Negative AH instances", len(negative_instances))

Positive AH instances 3621
Negative AH instances 22710


* Now look at the length distributions
	* The lengths look exponentially distributed, so let's estimate the distribution's parameters for future sampling


In [2]:
# import scipy.stats
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn
# 
# positive_instances_lengths = [len(_.body) for _ in positive_instances]
# negative_instances_lengths = [len(_.body) for _ in negative_instances]
# 
# print("Positive instances length stats:", scipy.stats.describe(positive_instances_lengths))
# print("Negative instances length stats:", scipy.stats.describe(negative_instances_lengths))
# 
# # estimate parameters of exponential distribution 
# loc_positive, scale_positive = scipy.stats.expon.fit(positive_instances_lengths)
# 
# generated = scipy.stats.expon.rvs(loc_positive, scale_positive, len(positive_instances_lengths))
# print(scipy.stats.describe(generated))
# 
# # print the original and generated distribution... these are not same but close enough
# seaborn.set(color_codes=True)
# seaborn.distplot(positive_instances_lengths, kde=False)
# seaborn.distplot(generated, kde=False)
# plt.show()

* Now let's sample from both the positive and the negative set, drawing in each step a sample with the same length

In [8]:
import pickle
import random

# Pair every positive instance with its closest, not yet used, negative instance.
#
# Input:  distance_dict_0.pkl ... distance_dict_5.pkl, each a dict mapping
#         positive_instance_id -> {negative_instance_id: distance (float)}.
# Output: `samples`, a list of (positive_instance_id, negative_instance_id) tuples.

# NOTE: nothing below draws random numbers any more (the random.sample call is
# disabled); the seed is kept so that re-enabling it stays reproducible.
random.seed(12345)

positive_to_negative_distances = dict()

# read only first 6 files due to their extreme size
for i in range(0, 6):
    file_name = "distance_dict_%d.pkl" % i

    # NOTE(review): pickle.load can execute arbitrary code; only load distance
    # files produced by our own pipeline.
    with open(file_name, "rb") as f:
        current_dict = pickle.load(f)
    assert isinstance(current_dict, dict)
    print("Loaded %d keys from file %s" % (len(current_dict), file_name))
    # add to the final
    positive_to_negative_distances.update(current_dict)
    print("Total size now is %d" % len(positive_to_negative_distances))

print(len(positive_to_negative_distances))

# get a set of all negative instances so we can discard them after draw
# (taken from the first entry; the lookups below require every positive
# instance to have a distance for each remaining negative id)
first_key = next(iter(positive_to_negative_distances))
negative_instances_ids = set(positive_to_negative_distances[first_key].keys())

print("Negative instances", len(negative_instances_ids))

# random sampling = 2,400 pairs in total
# random_positive_instance_ids = random.sample(list(positive_to_negative_distances), 2400)
# keep all of them now!
random_positive_instance_ids = positive_to_negative_distances

# list of tuples (positive_instance_id, negative_instance_id)
samples = []

for positive_instance_id in random_positive_instance_ids:
    distances = positive_to_negative_distances[positive_instance_id]
    assert isinstance(distances, dict)
    assert all(isinstance(_, float) for _ in distances.values())

    # Find the closest negative instance among those which have not yet been drawn.
    # Looking the distance up directly avoids copying all remaining candidates into
    # a fresh dict on every iteration. Ties go to the first candidate in the set's
    # iteration order; min() raises ValueError once no negatives are left.
    negative_instance_id = min(negative_instances_ids, key=distances.__getitem__)

    # delete sampled item so that each negative instance is used at most once
    negative_instances_ids.remove(negative_instance_id)

    # and add the new sampled tuple
    samples.append((positive_instance_id, negative_instance_id))

print(len(samples))


Loaded 605 keys from file distance_dict_0.pkl
Total size now is 605


Loaded 605 keys from file distance_dict_1.pkl
Total size now is 1210


Loaded 605 keys from file distance_dict_2.pkl
Total size now is 1815


Loaded 605 keys from file distance_dict_3.pkl
Total size now is 2420


Loaded 605 keys from file distance_dict_4.pkl
Total size now is 3025


Loaded 596 keys from file distance_dict_5.pkl
Total size now is 3621
3621
Negative instances 22710


3621


* Now we have _n_ samples of pairs (positive_id, negative_id) -> let's export them to JSON





In [10]:
# we have still two variables: positive instances and negative instances, no need to read full CMV again
assert isinstance(positive_instances[0], AnnotatedRedditComment)
assert isinstance(negative_instances[0], AnnotatedRedditComment)

# group by ID
positive_instances_by_id = {_.name: _ for _ in positive_instances}
negative_instances_by_id = {_.name: _ for _ in negative_instances}


list_for_export = []

for (positive_id, negative_id) in samples:
    list_for_export.append(positive_instances_by_id[positive_id])
    list_for_export.append(negative_instances_by_id[negative_id])


print(len(list_for_export))

# write them to JSON files
with open("experimental-data/exported-3621-sampled-positive-negative-ah-no-context.json", "w") as f:
    for comment in list_for_export:
        assert isinstance(comment, AnnotatedRedditComment)
        f.write(comment.to_json_string())
        f.write("\n")
    f.close()

7242


## Testing "underperforming recall" of moderators

* Our previous experiment showed that 5% of AHs were missing a label (negative samples were identified as positive). The negative sampling took into account only arguments with some existing label (such as "low-effort post", "delta-awarded", or similar). In the following experiment, we want to estimate the number of AHs among all arguments in CMV that have no manual label at all. Therefore, we sample another set of negative instances (arguments) without labels such that they are similar to the positive instances (AHs).

* How to do it?

	* Sample 223 AHs
	* Sample 20000 non-labeled instances
	* Compute similarity matrix (as in the previous experiment)
	* Find the most similar negative instances, resulting in 222 candidates
	* Annotate the candidates by 6 MTurkers with 0.9 MACE threshold, resulting in 200 labeled instances
	* See whether some of them are AH (or not)