In [1]:
import os
import json
import numpy
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

from outer import convertlabeltostr
from preprocessing import load_dataset, load_true_labels

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

In [2]:
def load_test_depth_pred_true():

    # Read the predictions of the model
    submission_file = os.path.join("output", "predictions.txt")
    submission = json.load(open(submission_file, 'r'))

    # And then the corresponding test data
    test_truevals = load_true_labels("test")

    # Load the full dataset and get the list of test tweets and their properties
    train_dev_split = load_dataset()
    alltestinfo = train_dev_split['test']

    alltestbranches = []
    # get all test branches out of it
    for indx, element in enumerate(alltestinfo):
        alltestbranches.extend(element['branches'])
    # loop over each tweet in testing set and find its depth to create id: depth dictionary
    depthinfo = {}
    for tweetid in submission.keys():
        for branch in alltestbranches:
            if tweetid in branch:
                depthinfo[tweetid] = branch.index(tweetid)

    return depthinfo, submission, test_truevals

In [3]:
def get_true_and_predicted_classes(true_tweet_classes, predicted_tweet_classes):

    # Sometimes this function may be called with empty predicted classes (e.g. if trials is not available).
    # In that case, return None for the true and predicted lists.
    if predicted_tweet_classes is None:
        true = None
        pred = None
        return true, pred

    true = []
    pred = []

    # Generate lists of true and predicted classes for all tweets in this set
    for k in true_tweet_classes.keys():
        true.append(true_tweet_classes[k])
        pred.append(predicted_tweet_classes[k])

    return true, pred

In [4]:
def calculate_results_at_each_depth(depthinfo, submission, test_truevals):

    # Group true labels and predictions according to their depth
    # depthinfo id: depth
    # submission id: prediction
    # test_truvals id: label

    depth_groups = {}
    depth_groups['0'] = []
    depth_groups['1'] = []
    depth_groups['2'] = []
    depth_groups['3'] = []
    depth_groups['4'] = []
    depth_groups['5'] = []
    depth_groups['6+'] = []


    # Find all keys in that depth group
    for tweetid, tweetdepth in depthinfo.iteritems():
        if tweetdepth == 0:
            depth_groups['0'].append(tweetid)
        elif tweetdepth == 1:
            depth_groups['1'].append(tweetid)
        elif tweetdepth == 2:
            depth_groups['2'].append(tweetid)
        elif tweetdepth == 3:
            depth_groups['3'].append(tweetid)
        elif tweetdepth == 4:
            depth_groups['4'].append(tweetid)
        elif tweetdepth == 5:
            depth_groups['5'].append(tweetid)
        elif tweetdepth >5:
            depth_groups['6+'].append(tweetid)

    # make a list

    depth_predictions = {}
    depth_predictions['0'] = []
    depth_predictions['1'] = []
    depth_predictions['2'] = []
    depth_predictions['3'] = []
    depth_predictions['4'] = []
    depth_predictions['5'] = []
    depth_predictions['6+'] = []

    depth_labels = {}
    depth_labels['0'] = []
    depth_labels['1'] = []
    depth_labels['2'] = []
    depth_labels['3'] = []
    depth_labels['4'] = []
    depth_labels['5'] = []
    depth_labels['6+'] = []

    depth_result = {}

    for depthgr in depth_groups.keys():
        depth_predictions[depthgr] = [submission[x] for x in depth_groups[depthgr]]
        depth_labels[depthgr] = [test_truevals[x] for x in depth_groups[depthgr]]

        _, _, mactest_F, _ = precision_recall_fscore_support(depth_labels[depthgr],
                                                             depth_predictions[depthgr],
                                                             average='macro')
        _, _, mictest_F, _ = precision_recall_fscore_support(depth_labels[depthgr],
                                                             depth_predictions[depthgr],
                                                             average='micro')
        _, _, test_F, _ = precision_recall_fscore_support(depth_labels[depthgr],
                                                          depth_predictions[depthgr])

        depth_result[depthgr] = [mactest_F, mictest_F, test_F]

    return depth_labels, depth_result

In [5]:
def print_table_three(true, pred):

#     print "\n\n--- Table 3 ---"

    # Prepare headers for the version of Table 3 from the paper (we'll print some additional details too)
    table_three_headers = tuple(["", "Accuracy", "Macro-F"] + sorted(class_labels))
    results_headers = ("Precision", "Recall", "F-score", "Support")

    print "\nResults on testing set"

    test_accuracy = accuracy_score(true, pred)
    print "\nAccuracy =", test_accuracy

    print "\nMacro-average:"
    macroavg_prfs = precision_recall_fscore_support(true, pred, average='macro')
    for lab, val in zip(results_headers, macroavg_prfs):
        if val is not None:
            print "%-12s%-12.3f" % (lab, val)
        else:
            print "%-12s%-12s" % (lab, "--")

    print "\nPer-class:"
    perclass_prfs = precision_recall_fscore_support(true, pred)
    print "%-12s%-12s%-12s%-12s%-12s" % tuple([""] + sorted(class_labels))
    for lab, vals in zip(results_headers, perclass_prfs):
        if lab is "Support":
            print "%-12s%-12i%-12i%-12i%-12i" % (lab, vals[0], vals[1], vals[2], vals[3])
        else:
            print "%-12s%-12.3f%-12.3f%-12.3f%-12.3f" % (lab, vals[0], vals[1], vals[2], vals[3])


In [6]:
def print_extra_details(best_trial_id):

    trials = pickle.load(open(os.path.join("output", "trials.txt"), "rb"))

    # Print out the best combination of hyperparameters
    print "\n--- New Table ---\n"
    print "The best combination of hyperparameters, found in trial " + str(best_trial_id) + ", was:"
    for param, param_value in trials.best_trial["result"]["Params"].iteritems():
        print "\t", param, "=", param_value

    # Let's examine the loss function at each iteration of the hyperparameter tuning process
    print "\n--- New Figure ---"

    # Extract the loss values from the full list of results, and calculate the running minimum value
    loss = numpy.asarray([r["loss"] for r in trials.results])
    running_min_loss = numpy.minimum.accumulate(loss)
    lowest_loss = loss[best_trial_id]
    all_best_ids = numpy.where(loss == lowest_loss)[0]

    # Plot the loss and running loss values against the iteration number, and save to the output folder
    plt.plot(range(0, len(loss)), loss, label="loss")
    plt.plot(range(0, len(running_min_loss)), running_min_loss, label="running min(loss)")
    plt.plot(best_trial_id, lowest_loss, "ro", label="min(loss)")
    if len(all_best_ids) > 1:
        plt.plot(all_best_ids, lowest_loss*numpy.ones(all_best_ids.shape), "rx", label="repeated min(loss)")
    plt.legend()
    plt.title("Hyperparameter optimisation")
    plt.xlabel("Iteration")
    plt.ylabel("Loss")
    plt.savefig(os.path.join("output", "hyperparameter_loss_values.pdf"))

    # Give details of other hyperparameter combinations that also achieved this loss
    if len(all_best_ids) > 1:
        print "\nWARNING: multiple hyperparameter combinations achieved the same lowest loss value as trial", best_trial_id
        print "ID               ",
        for id in all_best_ids:
            print "%-17d" % id,
        print ""
        for param in trials.results[all_best_ids[0]]["Params"]:
            print "%-17s" % param,
            for id in all_best_ids:
                print "%-17.5g" % trials.results[id]["Params"][param],
            print ""

    print "\nFigure showing hyperparameter optimisation progress can be found in the output folder.\n"

In [7]:
def print_table_four(depth_labels, depth_result):

#     print "\n\n--- Table 4 ---"
    print "\nNumber of tweets per depth and performance at each of the depths\n"

    # Print the column headers
    table_four_headers = ("Depth", "# tweets", "# Support", "# Deny", "# Query", "# Comment", "Accuracy", "MacroF") + class_labels
    for col in table_four_headers:
        print "%-11s" % col,
    print ""

    #  Print results in depth level order
    for depth in sorted(depth_result):

        # Work out which class the accuracy values refer to (precision_recall_fscore_support() outputs values in the
        # sorted order of the unique classes of tweets at that depth)
        depth_class_accuracy = depth_result[depth][2]
        depth_class_labels = sorted(set(depth_labels[depth]))

        # Print the depth and classes of tweets at that depth
        print "%-12s%-11i" % (depth, len(depth_labels[depth])),
        for lab in class_labels:
            print "%-11i" % depth_labels[depth].count(lab.lower()),

        # Print the accuracy, macro-F and class-specific performance at each depth
        print "%-12.3f%-11.3f" % \
              (depth_result[depth][1], depth_result[depth][0]),
        for lab in class_labels:
            if lab.lower() in depth_class_labels:
                class_ind = depth_class_labels.index(lab.lower())
                print "%-11.3f" % depth_class_accuracy[class_ind],
            else:
                print "%-11.3f" % 0.0,
        print ""

In [8]:
def print_table_five(true, pred):

#     print "\n\n--- Table 5 ---"
    print "\nConfusion matrix\n"

    # Generate the confusion matrix and the list of labels (as above, in sorted class order as long as each class
    # appears once, which they all do).
    conf_mat = confusion_matrix(true, pred)
    class_labels_mat = ("Lab \\ Pred",) + tuple(sorted(class_labels))

    # Print the header and then the confusion matrix
    print "%-12s%-12s%-12s%-12s%-12s" % class_labels_mat
    for lab, conf_row in zip(sorted(class_labels), conf_mat):
        row = (lab,) + tuple(conf_row)
        print "%-12s%-12i%-12i%-12i%-12i" % row


# First load the full set of tweets.
# Then calculate the depth and extract the true and predicted labels for the test set specifically.
tweet_depth, test_predicted_labels, test_labels = load_test_depth_pred_true()

# If it is present, load data from trials file and format in the same way as the submitted files
# (return None if the trials file is not available)
dev_labels = load_true_labels("dev")
# best_trial, best_loss, dev_predicted_labels = load_trials_data()

# Analyse the results separately at each depth
level_for_each_depth, results_for_each_depth = \
    calculate_results_at_each_depth(tweet_depth, test_predicted_labels, test_labels)

# Get lists of the true and predicted classes for the test and, if possible, development sets
true_labels_test, predicted_labels_test = get_true_and_predicted_classes(test_labels, test_predicted_labels)
# true_labels_dev, predicted_labels_dev = get_true_and_predicted_classes(dev_labels, dev_predicted_labels)

# Define some useful labels for table rows/columns
class_labels = ("Support", "Deny", "Query", "Comment")

# Print the tables
print_table_four(level_for_each_depth, results_for_each_depth)
print_table_five(true_labels_test, predicted_labels_test)
print_table_three(true_labels_test, predicted_labels_test)

# If the trials file is available, output more details of the best hyperparameter combinations and prepare a figure
# showing the loss during the hyperparameter choice process


Number of tweets per depth and performance at each of the depths

Depth       # tweets    # Support   # Deny      # Query     # Comment   Accuracy    MacroF      Support     Deny        Query       Comment     
0           56          50          3           3           0           0.893       0.314       0.943       0.000       0.000       0.000       
1           1010        10          5           39          956         0.967       0.566       0.182       0.333       0.767       0.983       
2           0           0           0           0           0           0.000       nan         0.000       0.000       0.000       0.000       
3           0           0           0           0           0           0.000       nan         0.000       0.000       0.000       0.000       
4           0           0           0           0           0           0.000       nan         0.000       0.000       0.000       0.000       
5           0           0           0           0           0  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
