In [322]:
import pickle
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, average_precision_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:

# Locations of the pickled perplexities and of the test set.
perplexions_path = '../data/ant_pairs_formated_def_ppls.pickle'
test_dataset_path = '../data/ant_test.txt'

# The pickle holds a mapping of the form (child, parent): perplexity.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(perplexions_path, mode='rb') as ppl_file:
    ppls = pickle.load(ppl_file)


In [4]:

# Normalise the keys of `ppls`: the first element is cut at its first '(' and
# both elements are trimmed of surrounding commas/spaces. If two raw keys
# collapse to the same cleaned key, the one iterated last wins.
ppls_clean = {
    (key[0].split('(')[0].strip(', '), key[1].strip(', ')): ppl
    for key, ppl in ppls.items()
}


In [21]:
# Containers; only `data` and `y_true` are filled by this cell.
pairs = []
y_true = []
non_reversed = []
not_found = []
data = []

# Label attached to each directional category; any other category is ignored.
category_to_label = {
    'directional_entailment': 1,      # (child, parent)
    'directional_non-entailment': 0,  # (parent, child)
}

with open(test_dataset_path, 'r', encoding='utf-8') as f:
    # Stream the file line by line instead of materialising it with readlines().
    for line in f:
        # Blank lines would otherwise fail the three-way tab split below.
        if not line.strip():
            continue

        ex1, ex2, category = line.strip('\n').split('\t')
        # Each example is expected to have exactly three comma-separated fields;
        # only the middle one is used here.
        s11, v1, s12 = ex1.split(',')
        s21, v2, s22 = ex2.split(',')
        v1 = v1.strip(' ')
        v2 = v2.strip(' ')

        if category in category_to_label:
            data.append((v1, v2, category_to_label[category]))

# Gold labels in the same order as `data`.
y_true = [label for _, _, label in data]

In [20]:
# Preview the first 15 parsed (v1, v2, label) triples.
data[:15]

[('sedated', 'was given to', 1),
 ('was given to', 'sedated', 0),
 ('calmed', 'was given to', 1),
 ('was given to', 'calmed', 0),
 ('subdued', 'was given to', 1),
 ('were given to', 'subdued', 0),
 ('stimulated', 'were given to', 1),
 ('were given to', 'stimulated', 0),
 ('energized', 'was given to', 1),
 ('were given to', 'energized', 0),
 ('enlivened', 'was given to', 1),
 ('were given to', 'enlivened', 0),
 ('livened up', 'were given to', 1),
 ('were given to', 'livened up', 0),
 ('invigorated', 'was given to', 1)]

In [155]:
def count_binary(data, ppls_clean, thr=0):
    """Evaluate a hard-threshold classifier on the perplexity gap and print its metrics.

    A pair is predicted as 1 when ``forward_ppl - backward_ppl < thr``, where
    forward is ``ppls_clean[(child, parent)]`` and backward is
    ``ppls_clean[(parent, child)]``; otherwise it is predicted as 0.

    Args:
        data: iterable of ``(child, parent, label)`` triples.
        ppls_clean: mapping from a pair of strings to its perplexity.
        thr: threshold applied to the forward-minus-backward difference.

    Returns:
        None. ROC AUC and average precision are printed.
    """
    labels = []
    y_pred = []

    for child, parent, label in data:
        # Gold labels are taken from `data` itself so the result does not depend
        # on whatever a notebook-level `y_true` currently holds.
        labels.append(label)

        # Both directions are needed for the comparison; if either is missing,
        # fall back to the negative class instead of raising KeyError.
        if (child, parent) not in ppls_clean or (parent, child) not in ppls_clean:
            y_pred.append(0)
            continue

        forward_ppl = ppls_clean[(child, parent)]
        backward_ppl = ppls_clean[(parent, child)]
        y_pred.append(1 if forward_ppl - backward_ppl < thr else 0)

    print('ROC AUC score: ', roc_auc_score(labels, y_pred))
    print('Average precision: ', average_precision_score(labels, y_pred))

def count_diff(data, ppls_clean, low_thr=-100000, high_thr=100000):
    """Rank pairs by the clipped perplexity difference and score the ranking.

    The score of a pair is ``backward_ppl - forward_ppl`` clipped to
    ``[low_thr, high_thr]``, where forward is ``ppls_clean[(child, parent)]``
    and backward is ``ppls_clean[(parent, child)]``. The score vector is then
    passed through ``preprocessing.normalize(..., norm='l1')``.

    Args:
        data: iterable of ``(child, parent, label)`` triples.
        ppls_clean: mapping from a pair of strings to its perplexity.
        low_thr: lower clipping bound for the difference.
        high_thr: upper clipping bound for the difference.

    Returns:
        Tuple ``(roc_auc, average_precision)``.
    """
    labels = []
    y_pred = []

    for child, parent, label in data:
        # Gold labels come from `data`, not from a notebook-level `y_true`
        # that other cells may have rebound.
        labels.append(label)

        # A pair missing in either direction gets the neutral difference 0
        # (previously a missing reverse key raised KeyError).
        if (child, parent) not in ppls_clean or (parent, child) not in ppls_clean:
            y_pred.append(0)
            continue

        forward_ppl = ppls_clean[(child, parent)]
        backward_ppl = ppls_clean[(parent, child)]
        y_pred.append(np.clip(backward_ppl - forward_ppl, low_thr, high_thr))

    y_pred = preprocessing.normalize(np.array([y_pred]), norm='l1')[0]
    roc_auc = roc_auc_score(labels, y_pred)
    ap = average_precision_score(labels, y_pred)
    return roc_auc, ap

def count_frac(data, ppls_clean,low_thr=-100000, high_thr=100000):
    """Rank pairs by the clipped perplexity ratio and score the ranking.

    The score of a pair is ``backward_ppl / forward_ppl`` clipped to
    ``[low_thr, high_thr]``, where forward is ``ppls_clean[(child, parent)]``
    and backward is ``ppls_clean[(parent, child)]``. The score vector is then
    passed through ``preprocessing.normalize(..., norm='max')``.

    Args:
        data: iterable of ``(child, parent, label)`` triples.
        ppls_clean: mapping from a pair of strings to its perplexity.
        low_thr: lower clipping bound for the ratio.
        high_thr: upper clipping bound for the ratio.

    Returns:
        Tuple ``(roc_auc, average_precision)``.
    """
    labels = []
    y_pred = []

    for child, parent, label in data:
        # Gold labels come from `data`, not from a notebook-level `y_true`
        # that other cells may have rebound.
        labels.append(label)

        # A pair missing in either direction gets the neutral ratio 1
        # (previously a missing reverse key raised KeyError).
        if (child, parent) not in ppls_clean or (parent, child) not in ppls_clean:
            y_pred.append(1)
            continue

        forward_ppl = ppls_clean[(child, parent)]
        backward_ppl = ppls_clean[(parent, child)]
        y_pred.append(np.clip(backward_ppl / forward_ppl, low_thr, high_thr))

    y_pred = preprocessing.normalize(np.array([y_pred]), norm='max')[0]
    roc_auc = roc_auc_score(labels, y_pred)
    ap = average_precision_score(labels, y_pred)
    return roc_auc, ap

def count_frac_diff(data, ppls_clean):
    """Rank pairs by the mean of the normalised ratio and difference scores.

    For each pair, with forward ``ppls_clean[(child, parent)]`` and backward
    ``ppls_clean[(parent, child)]``, two scores are computed:
    ``backward / forward`` and ``backward - forward``. Each score vector is
    passed through ``preprocessing.normalize(..., norm='max')`` and the two
    results are averaged element-wise.

    Args:
        data: iterable of ``(child, parent, label)`` triples.
        ppls_clean: mapping from a pair of strings to its perplexity.

    Returns:
        Tuple ``(roc_auc, average_precision)``.
    """
    labels = []
    y_pred_frac = []
    y_pred_diff = []

    for child, parent, label in data:
        # Gold labels come from `data`, not from a notebook-level `y_true`
        # that other cells may have rebound.
        labels.append(label)

        # A pair missing in either direction gets the neutral value of each
        # score: ratio 1 and difference 0 (the same fallbacks count_frac and
        # count_diff use). Previously the difference fallback was 1 and a
        # missing reverse key raised KeyError.
        if (child, parent) not in ppls_clean or (parent, child) not in ppls_clean:
            y_pred_frac.append(1)
            y_pred_diff.append(0)
            continue

        forward_ppl = ppls_clean[(child, parent)]
        backward_ppl = ppls_clean[(parent, child)]
        y_pred_frac.append(backward_ppl / forward_ppl)
        y_pred_diff.append(backward_ppl - forward_ppl)

    y_pred_frac = preprocessing.normalize(np.array([y_pred_frac]), norm='max')[0]
    y_pred_diff = preprocessing.normalize(np.array([y_pred_diff]), norm='max')[0]
    y_pred = (y_pred_frac + y_pred_diff) / 2
    roc_auc = roc_auc_score(labels, y_pred)
    ap = average_precision_score(labels, y_pred)
    return roc_auc, ap

In [107]:
# Hard-threshold baseline: predict 1 when forward - backward perplexity < 1.5.
count_binary(data, ppls_clean, 1.5)

ROC AUC score:  0.6310580204778157
Average precision:  0.5826467925053981


In [113]:
# backward - forward perplexity difference as the score, default clipping bounds.
count_diff(data, ppls_clean)

(0.6313750888187399, 0.6024325798086274)

In [174]:
# backward / forward perplexity ratio as the score, clipped to [0, 10].
count_frac(data, ppls_clean, 0, 10)

(0.677718319374716, 0.6549695283131112)

In [145]:
# Average of the max-normalised ratio and difference scores.
count_frac_diff(data, ppls_clean)

(0.6694652238232245, 0.6563200905058708)

In [86]:
from tqdm.contrib.concurrent import process_map
import itertools


In [95]:
# Candidate clipping bounds: lower bounds in [-1000, 1000) and upper bounds
# in [0, 2000), both in steps of 20.
lthr = np.arange(-1000, 1000, 20)
hthr = np.arange(0, 2000, 20)

# Walk the full grid and keep only the combinations whose lower bound is
# strictly below the upper bound.
all_thrs_iterator = itertools.product(lthr, hthr)
all_thrs = []
for l, h in all_thrs_iterator:
    if l >= h:
        continue
    all_thrs.append((l, h))

In [96]:
# Number of (lower, upper) threshold pairs in the grid.
len(all_thrs)

8725

In [None]:
# Evaluate the threshold grid in consecutive batches, collecting one
# (roc_auc, ap) tuple per threshold pair. The batch bounds are now actually
# applied as a slice; before, they were computed but unused, so the whole grid
# was evaluated ten times and `out` held ten duplicate copies.
# NOTE: get_metric is defined in a later cell and must exist before this runs.
out = []
batch_size = 10000

for cur_min in range(0, len(all_thrs), batch_size):
    cur_max = cur_min + batch_size
    out.extend(process_map(get_metric, all_thrs[cur_min:cur_max], chunksize=1))


In [90]:
def get_metric(thrs):
    """Evaluate count_diff for one pair of clipping bounds.

    Args:
        thrs: ``(low, high)`` pair passed to count_diff as ``low_thr`` and
            ``high_thr``.

    Returns:
        The ``(roc_auc, average_precision)`` tuple returned by count_diff for
        the notebook-level ``data`` and ``ppls_clean``.
    """
    # Use the bounds that were passed in. The old code unpacked them as `l, r`
    # and then read a global `h`, so the upper bound never came from `thrs`.
    low, high = thrs
    roc_auc, ap = count_diff(data, ppls_clean, low_thr=low, high_thr=high)
    return roc_auc, ap

# Evaluate every threshold pair in parallel; `out` holds one result per pair.
out = process_map(get_metric, all_thrs, chunksize=1)

KeyboardInterrupt: 

In [176]:
# Column-wise storage for pairs that have a perplexity in both directions.
df = {'forward': [], 'backward': [], 'target': []}

for child, parent, label in data:
    # Skip pairs that cannot be scored both ways. Checking only the forward key
    # would raise KeyError when the reverse key is absent.
    if (child, parent) not in ppls_clean or (parent, child) not in ppls_clean:
        continue

    forward_ppl = ppls_clean[(child, parent)]
    backward_ppl = ppls_clean[(parent, child)]
    df['forward'].append(forward_ppl)
    df['backward'].append(backward_ppl)
    df['target'].append(label)

In [315]:
# Tabular view of the collected 'forward', 'backward' and 'target' columns.
X = pd.DataFrame(df)

In [316]:
# Rebuild X from the raw perplexities so this cell is idempotent: without this,
# re-running it would derive features from already-normalised columns.
X = pd.DataFrame(df)

# Derived features, computed from the raw (un-normalised) perplexities.
X['frac'] = X['backward'] / X['forward']
X['diff'] = X['backward'] - X['forward']
X['mul'] = X['backward'] * X['forward']
X['sign'] = X['frac'] < 0.5

# NOTE(review): Normalizer rescales each ROW to unit norm across the listed
# columns; it does not scale each column independently. Confirm this is the
# intended preprocessing.
numeric_cols = ['forward', 'backward', 'frac', 'diff', 'mul']
scaler = preprocessing.Normalizer()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [371]:
# Columns excluded from the model input; with this list only 'diff' is left.
dropping = ['target', 'mul', 'sign', 'backward', 'forward', 'frac']
train_features = X.drop(columns=dropping)

logreg = LogisticRegression(
    C=0.01,
    max_iter=1000,
    fit_intercept=False,
    solver='newton-cg',
)
logreg.fit(train_features, X['target'])

# NOTE(review): the metrics below are computed on the same rows the model was
# fitted on; there is no held-out split in this cell.
probas = logreg.predict_proba(train_features)
y_pred = probas[:, 1]  # probability of class 1
y_true = X['target']

roc_auc = roc_auc_score(y_true, y_pred)
ap = average_precision_score(y_true, y_pred)

roc_auc, ap

(0.6920013421334584, 0.6951447181974137)

In [372]:
# Inspect the fitted weight(s) and intercept of the current `logreg` model.
logreg.coef_, logreg.intercept_

(array([[0.27866373]]), array([0.]))

In [373]:
from sklearn.linear_model import SGDClassifier

In [386]:
# Columns excluded from the model input; with this list only 'diff' is left.
dropping = ['target', 'mul', 'sign', 'backward', 'forward', 'frac']
train_features = X.drop(columns=dropping)

# SGD shuffles the training data between epochs, so the seed is fixed to make
# the fitted model reproducible across runs.
logreg = SGDClassifier(loss='log_loss', tol=1e-15, random_state=0)
logreg.fit(train_features, X['target'])

# NOTE(review): the metrics below are computed on the same rows the model was
# fitted on; there is no held-out split in this cell.
probas = logreg.predict_proba(train_features)
y_pred = probas[:, 1]  # probability of class 1
y_true = X['target']

roc_auc = roc_auc_score(y_true, y_pred)
ap = average_precision_score(y_true, y_pred)

roc_auc, ap

(0.6920013421334584, 0.6951447181974137)

In [327]:
# Only the label is excluded here: every other column of X is a feature.
dropping = ['target']
train_features = X.drop(columns=dropping)

# The variable keeps the name `logreg` from the previous cells although it now
# holds a decision tree. The seed is fixed because the tree permutes features
# at each split, so equally good splits can otherwise resolve differently
# between runs.
logreg = DecisionTreeClassifier(max_depth=5, random_state=0)
logreg.fit(train_features, X['target'])

# NOTE(review): the metrics below are computed on the same rows the model was
# fitted on; there is no held-out split in this cell.
probas = logreg.predict_proba(train_features)
y_pred = probas[:, 1]  # probability of class 1
y_true = X['target']

roc_auc = roc_auc_score(y_true, y_pred)
ap = average_precision_score(y_true, y_pred)

roc_auc, ap

(0.717876254913848, 0.6959064805460624)

(0.6556946303223027, 0.655444325087572)

In [None]:

y_true_2 = []
y_pred_2 = []

# Labels used in this cell. Note the polarity: entailment is 0 and
# non-entailment is 1, with forward - backward perplexity as the score.
label_by_category = {
    'directional_entailment': 0,      # (child, parent)
    'directional_non-entailment': 1,  # (parent, child)
}

with open(test_dataset_path, 'r', encoding='utf-8') as f:
    # Stream the file line by line instead of materialising it with readlines().
    for line in f:
        # Blank lines would otherwise fail the three-way tab split below.
        if not line.strip():
            continue

        ex1, ex2, category = line.strip('\n').split('\t')
        s11, v1, s12 = ex1.split(',')
        s21, v2, s22 = ex2.split(',')

        # Keep only example pairs whose first and third comma-separated fields
        # are identical, and only the two directional categories.
        if s11 != s21 or s12 != s22:
            continue
        if category not in label_by_category:
            continue

        v1 = v1.strip(' ')
        v2 = v2.strip(' ')

        # Both directions are required for the difference; pairs missing either
        # one are skipped (a missing reverse key used to raise KeyError).
        if (v1, v2) not in ppls_clean or (v2, v1) not in ppls_clean:
            continue

        y_pred_2.append(ppls_clean[(v1, v2)] - ppls_clean[(v2, v1)])
        y_true_2.append(label_by_category[category])


normalized_y_pred = preprocessing.normalize([np.array(y_pred_2)])
print('ROC AUC score: ', roc_auc_score(y_true_2, normalized_y_pred[0]))
print('Average precision: ', average_precision_score(y_true_2, normalized_y_pred[0]))