In [1]:
%matplotlib inline

from os import listdir
from os.path import isfile, join

import sys
import subprocess
import glob
import json
import pandas as pd
import numpy as np

from collections import defaultdict
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
from matplotlib import rc

import pprint
pp = pprint.PrettyPrinter(indent=4)


sys.path.insert(0, '/home/cnaik/s/3/ilp')
from  utils import ilp_config
from  utils import ilp_utils



In [2]:
# Grid appearance: thin dotted grid lines, set in a single rc() call.
rc("grid", alpha=0.9, linewidth=0.2, linestyle=":")

In [3]:
# Request the 'pgf' backend.
# NOTE(review): matplotlib.pyplot is already imported in the import cell, and
# the recorded output of this cell warns that matplotlib.use() must be called
# before pyplot is imported for the first time -- so this call may have no
# effect where it currently sits.
mpl.use('pgf')

def figsize(scale, fig_width_pt=469.755):
    r"""Return ``[width, height]`` in inches for a figure scaled to the text width.

    Parameters
    ----------
    scale : float
        Fraction of the document text width the figure should span
        (1 means the full text width).
    fig_width_pt : float, optional
        Text width of the target LaTeX document in points; obtain it in
        LaTeX with ``\the\textwidth``. Defaults to 469.755, the value that
        used to be hard-coded in this function.

    Returns
    -------
    list
        ``[fig_width, fig_height]`` in inches, where the height is the width
        multiplied by ``(sqrt(5) - 1) / 2`` (the golden mean).
    """
    inches_per_pt = 1.0 / 72.27                    # convert pt to inch
    golden_mean = (np.sqrt(5.0) - 1.0) / 2.0       # aesthetic height/width ratio
    fig_width = fig_width_pt * inches_per_pt * scale
    fig_height = fig_width * golden_mean
    return [fig_width, fig_height]

# rcParams that make matplotlib hand text rendering to LaTeX so figures match
# the surrounding document.
pgf_with_latex = {                      # setup matplotlib to use latex for output
    "pgf.texsystem": "pdflatex",        # change this if using xetex or lualatex
    "text.usetex": True,                # use LaTeX to write all text
    "font.family": "serif",
    "font.serif": [],                   # blank entries should cause plots to inherit fonts from the document
    "font.sans-serif": [],
    "font.monospace": [],
    "axes.labelsize": 10,               # LaTeX default is 10pt font.
    "font.size": 10,                    # was "text.fontsize": a deprecated alias of font.size that newer matplotlib rejects
    "legend.fontsize": 8,               # Make the legend/label fonts a little smaller
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "figure.figsize": figsize(1),     # default fig size of 1 textwidth
    # NOTE(review): newer matplotlib releases expect pgf.preamble to be a single
    # string rather than a list -- confirm against the installed version.
    "pgf.preamble": [
        r"\usepackage[utf8x]{inputenc}",    # use utf8 fonts because your computer can handle it :)
        r"\usepackage[T1]{fontenc}",        # plots will be generated using this preamble
        ]
    }
mpl.rcParams.update(pgf_with_latex)

# Seaborn "paper" context with thin lines/axes and LaTeX text rendering.
_paper_context_rc = {
    'lines.linewidth': 0.75,
    'axes.linewidth': 0.75,
    'text.usetex': True,
}
sns.set_context("paper", font_scale=1.0, rc=_paper_context_rc)

# White-grid style using a serif font family, with Palatino listed as the serif face.
_serif_style_rc = {
    'font.family': 'serif',
    'font.serif': ['Palatino'],
}
sns.set_style("whitegrid", _serif_style_rc)

# sns.set(style="white")

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [4]:
# Default figure size in inches; this replaces the "figure.figsize" value
# installed through pgf_with_latex in the previous cell.
plt.rcParams['figure.figsize'] = (18, 12)

In [5]:
def get_gold_data(d_gold):
    """Map every annotated argument span to its gold role.

    Parameters
    ----------
    d_gold : list of dict
        Parsed gold JSON. Each record has a 'sentences' list; each sentence
        has 'sentenceId' and an 'annotatedArgumentSpan' list whose entries
        provide 'startIdx', 'endIdx', 'annotatedRole' and 'annotatedLabel'.
        The same span may appear several times, once per candidate role.

    Returns
    -------
    dict
        ``{(sentence_id, start_idx, end_idx): role}``. A span maps to the
        first candidate role labelled 1. If no candidate is labelled 1 and
        the labels sum to -4, it maps to 'NONE'. Any other span is omitted.
    """
    # Collect all (role, label) candidates recorded for each span.
    candidates = defaultdict(list)
    for process_dict in d_gold:
        for sentence_dict in process_dict['sentences']:
            sent_id = sentence_dict['sentenceId']
            for arg_dict in sentence_dict['annotatedArgumentSpan']:
                span_key = (sent_id,
                            int(arg_dict['startIdx']),
                            int(arg_dict['endIdx']))
                candidates[span_key].append(
                    (arg_dict['annotatedRole'], int(arg_dict['annotatedLabel'])))

    gold_data = {}
    # .items() runs on both Python 2 and Python 3; iteritems() is Python 2 only.
    for span_key, role_labels in candidates.items():
        labels = [label for _, label in role_labels]
        if 1 in labels:
            gold_data[span_key] = role_labels[labels.index(1)][0]
        elif sum(labels) == -4:
            # presumably four candidate roles, all labelled -1 -- TODO confirm
            gold_data[span_key] = 'NONE'
    return gold_data

In [6]:
def get_prediction_data(d_predict):
    """Map every predicted argument span to its predicted role and score.

    Parameters
    ----------
    d_predict : list of dict
        Parsed prediction JSON. Each record has a 'sentences' list; each
        sentence has 'sentenceId' and a 'predictionArgumentSpan' list whose
        entries provide 'startIdx', 'endIdx', 'rolePredicted' and
        'probRoles' (a list of ``{role: probability}`` dicts).

    Returns
    -------
    defaultdict
        ``{(sentence_id, start_idx, end_idx): (role_predicted, probability)}``
        where the probability is the 'probRoles' entry of the predicted role.

    Raises
    ------
    KeyError
        If 'probRoles' holds no entry for the predicted role.
    """
    # No default factory is given, so this behaves exactly like a plain dict;
    # the defaultdict type is kept so the return type does not change.
    srl_data = defaultdict()
    for process_dict in d_predict:
        for sentence_dict in process_dict['sentences']:
            sent_id = sentence_dict['sentenceId']
            for arg_dict in sentence_dict['predictionArgumentSpan']:
                span_key = (sent_id,
                            int(arg_dict['startIdx']),
                            int(arg_dict['endIdx']))
                role_predicted = arg_dict['rolePredicted']
                # Flatten the list of single-purpose dicts into one role -> prob map.
                role_probs = {}
                for role_prob in arg_dict['probRoles']:
                    role_probs.update(role_prob)
                srl_data[span_key] = (role_predicted, role_probs[role_predicted])
    return srl_data

In [7]:
# Lookup from each gold argument span to its source text:
# (sentenceId, startIdx, endIdx) -> (process, sentence text, argument text).
id_text_map = defaultdict()  # no default factory, so it acts like a plain dict

for fold_dir in listdir(ilp_config.cross_val_dir):
    fold_path = join(ilp_config.cross_val_dir, fold_dir)
    d_gold_file = join(fold_path, 'test', 'test.srlout.json')
    # Use a context manager so the file handle is closed right after parsing.
    with open(d_gold_file, "r") as gold_fh:
        d_gold = json.load(gold_fh)

    for process_dict in d_gold:
        process = process_dict['process']
        # list of sentences
        for sentence_dict in process_dict['sentences']:
            sent_id = sentence_dict['sentenceId']
            sentence_text = sentence_dict['text']
            # list of arguments
            for arg_dict in sentence_dict['annotatedArgumentSpan']:
                start_id = int(arg_dict['startIdx'])
                end_id = int(arg_dict['endIdx'])
                role_text = arg_dict['text']
                id_text_map[(sent_id, start_id, end_id)] = (process, sentence_text, role_text)

In [8]:
# Per-fold SRL results: fold number (1-based, in listdir order) ->
# {span key: (gold role, (predicted role, probability))}.
srl_fold_data = {}
for f, fold_dir in enumerate(listdir(ilp_config.cross_val_dir)):
    fold_path = join(ilp_config.cross_val_dir, fold_dir)
    d_gold_file = join(fold_path, 'test', 'test.srlout.json')
    d_predict_file = join(fold_path, 'test', 'test.srlpredict.json')

    # Context managers close both files promptly instead of leaking the handles.
    with open(d_gold_file, "r") as gold_fh:
        d_gold = json.load(gold_fh)
    with open(d_predict_file, "r") as predict_fh:
        d_predict = json.load(predict_fh)

    gold_data = get_gold_data(d_gold)
    srl_data = get_prediction_data(d_predict)

    # Keep only predicted spans that also have a gold role.
    # .items() runs on both Python 2 and Python 3.
    analysis_data = {k: (gold_data[k], v) for k, v in srl_data.items() if k in gold_data}
    srl_fold_data[f+1] = analysis_data

In [9]:
# Per-fold ILP results: fold number (1-based, in listdir order) ->
# {span key: (gold role, (predicted role, probability))}.
ilp_fold_data = {}
for f, fold_dir in enumerate(listdir(ilp_config.cross_val_dir)):
    fold_path = join(ilp_config.cross_val_dir, fold_dir)
    d_gold_file = join(fold_path, 'test', 'test.srlout.json')
    d_ilp_file = join(fold_path, 'test', 'test.ilppredict.json')

    # Context managers close both files promptly instead of leaking the handles.
    with open(d_gold_file, "r") as gold_fh:
        d_gold = json.load(gold_fh)
    with open(d_ilp_file, "r") as ilp_fh:
        d_ilp = json.load(ilp_fh)

    gold_data = get_gold_data(d_gold)
    ilp_data = get_prediction_data(d_ilp)

    # Keep only predicted spans that also have a gold role.
    # .items() runs on both Python 2 and Python 3.
    analysis_data = {k: (gold_data[k], v) for k, v in ilp_data.items() if k in gold_data}
    ilp_fold_data[f+1] = analysis_data

In [10]:
# Merge the per-fold SRL dicts into one; if a span key occurs in several
# folds, the fold visited last wins.
srl_all_data = {}
for f_data in srl_fold_data.values():  # fold number is not needed here
    srl_all_data.update(f_data)

In [11]:
# Merge the per-fold ILP dicts into one; if a span key occurs in several
# folds, the fold visited last wins.
ilp_all_data = {}
for f_data in ilp_fold_data.values():  # fold number is not needed here
    ilp_all_data.update(f_data)

In [12]:
# srl_all_data

In [13]:
# Align SRL and ILP predictions per span:
# span key -> (gold role, SRL role, ILP role).
# Each value in srl_all_data / ilp_all_data is (gold role, (role, probability)).
# A span present in srl_all_data but absent from ilp_all_data raises KeyError.
analysis_data = {}
for k, v in srl_all_data.items():
    gold_role = v[0]
    srl_role = v[1][0]
    srl_score = v[1][1]  # index 1 is the probability (index 0 is the role)
    ilp_role = ilp_all_data[k][1][0]
    ilp_score = ilp_all_data[k][1][1]
    analysis_data[k] = (gold_role, srl_role, ilp_role)

In [14]:
# analysis_data

In [15]:
# Flatten the aligned predictions into rows, adding the readable text for
# each span from id_text_map.
all_data = []
for key, data in analysis_data.items():  # .items() runs on Python 2 and 3
    gold_role, srl_role, ilp_role = data
    process, sentence_text, role_text = id_text_map[key]
    all_data.append([key, process, sentence_text, role_text, gold_role, srl_role, ilp_role])

In [16]:
# Passing the column names to the constructor also works when all_data is
# empty; assigning seven names to a zero-column frame afterwards would raise.
analysis_df = pd.DataFrame(
    all_data,
    columns=['key', 'process', 'sentence_text', 'role_text', 'gold_role', 'srl_role', 'ilp_role'],
)

In [17]:
# Write the table to disk. The encoding is explicit because pandas defaults
# to ASCII on Python 2, which fails on non-ASCII sentence text.
analysis_df.to_csv('analysys.csv', index=False, encoding='utf-8')