### Load TF-IDF model

In [1]:
import pickle
import codecs
import joblib

In [2]:
# Shared vectoriser: every task section below calls `vect.transform(...)` on its texts.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load "ruentfidf/tfidf.pkl" when it comes from a trusted source.
vect = joblib.load("ruentfidf/tfidf.pkl")



### Load data

In [3]:
from pathlib import Path

In [4]:
data_dir = Path("final_data/")

In [5]:
from sklearn.linear_model import LogisticRegression

In [61]:
# Column-oriented accumulator for the final table: each task section appends one
# entry to every list, and the last section turns the dict into a DataFrame.
# Re-running a task's append cell without resetting this dict adds a duplicate row.
all_results = {
    "name": [],
    "train": [],
    "val": [],
    "test": []
}

### COPA

In [62]:
train_path = data_dir / "COPA/train.jsonl"
val_path = data_dir / "COPA/val.jsonl"
test_path = data_dir / "COPA/test.jsonl"

In [63]:
import codecs
import json


def build_feature_copa(row):
    """Turn one COPA record into a ``(text, label)`` pair.

    The text is the stripped premise, a Russian prompt selected by
    ``row["question"]`` (one prompt for ``"cause"``, another for any other
    value) and both choices, separated by single spaces.
    """
    premise = str(row["premise"]).strip()
    first_choice, second_choice = row["choice1"], row["choice2"]
    label = row["label"]
    if row["question"] == "cause":
        question = "Что было ПРИЧИНОЙ этого?"
    else:
        question = "Что случилось в РЕЗУЛЬТАТЕ?"
    text = "{} {} {} {}".format(premise, question, first_choice, second_choice)
    return text, label


def build_features_copa(path):
    """Read a COPA ``.jsonl`` file and return ``(features, labels)``.

    The file is decoded as UTF-8 (a leading BOM is tolerated), split on
    newlines, and every non-empty line is parsed as JSON. The texts produced
    by ``build_feature_copa`` are passed through the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        raw_lines = reader.read().split("\n")
        rows = [json.loads(raw) for raw in raw_lines if raw]
    texts, labels = [], []
    for text, label in map(build_feature_copa, rows):
        texts.append(text)
        labels.append(label)
    return vect.transform(texts), labels


def fit_copa(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the COPA features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_copa(train_path, val_path, test_path):
    """Train on the COPA train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    splits = {
        "train": build_features_copa(train_path),
        "val": build_features_copa(val_path),
        "test": build_features_copa(test_path),
    }
    clf = fit_copa(*splits["train"])
    scores = {
        name: clf.score(features, labels)
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [64]:
clf, copa_scores = eval_copa(train_path, val_path, test_path)



In [65]:
copa_scores

{'train': 0.775, 'val': 0.45, 'test': 0.486}

In [66]:
all_results["name"].append("COPA")
all_results["train"].append(copa_scores["train"])
all_results["val"].append(copa_scores["val"])
all_results["test"].append(copa_scores["test"])

### CommitmentBank

In [67]:
train_path = data_dir / "CommitmentBank/train.jsonl"
val_path = data_dir / "CommitmentBank/val.jsonl"
test_path = data_dir / "CommitmentBank/test.jsonl"

In [68]:
import codecs
import json


def build_feature_commitment_bank(row):
    """Turn one CommitmentBank record into a ``(text, label)`` pair.

    The text is the stripped premise followed by the hypothesis, separated by
    one space.
    """
    stripped_premise = str(row["premise"]).strip()
    hypothesis = row["hypothesis"]
    label = row["label"]
    return "{} {}".format(stripped_premise, hypothesis), label


def build_features_commitment_bank(path):
    """Read a CommitmentBank ``.jsonl`` file and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_commitment_bank`` and the resulting texts
    are transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        content = reader.read()
        rows = [json.loads(raw) for raw in content.split("\n") if raw]
    pairs = [build_feature_commitment_bank(row) for row in rows]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return vect.transform(texts), labels


def fit_commitment_bank(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the CommitmentBank features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_commitment_bank(train_path, val_path, test_path):
    """Train on the CommitmentBank train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    splits = {
        "train": build_features_commitment_bank(train_path),
        "val": build_features_commitment_bank(val_path),
        "test": build_features_commitment_bank(test_path),
    }
    clf = fit_commitment_bank(*splits["train"])
    scores = {
        name: clf.score(features, labels)
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [69]:
clf, commitment_bank_scores = eval_commitment_bank(train_path, val_path, test_path)



In [70]:
commitment_bank_scores

{'train': 0.7420091324200914,
 'val': 0.5227272727272727,
 'test': 0.4520547945205479}

In [71]:
all_results["name"].append("CommitmentBank")
all_results["train"].append(commitment_bank_scores["train"])
all_results["val"].append(commitment_bank_scores["val"])
all_results["test"].append(commitment_bank_scores["test"])

### BoolQ

In [72]:
train_path = data_dir / "BoolQ/train.jsonl"
val_path = data_dir / "BoolQ/val.jsonl"
test_path = data_dir / "BoolQ/test.jsonl"

In [73]:
import codecs
import json


def build_feature_bool_q(row):
    """Turn one BoolQ record into ``(question_text, label)``.

    Only the stripped question is used as text; other fields of the record
    are not read.
    """
    return str(row["question"]).strip(), row["label"]


def build_features_bool_q(path):
    """Read a BoolQ ``.jsonl`` file and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_bool_q`` and the resulting texts are
    transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        raw_lines = reader.read().split("\n")
        rows = [json.loads(raw) for raw in raw_lines if raw]
    texts, labels = [], []
    for text, label in map(build_feature_bool_q, rows):
        texts.append(text)
        labels.append(label)
    return vect.transform(texts), labels


def fit_bool_q(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the BoolQ features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_bool_q(train_path, val_path, test_path):
    """Train on the BoolQ train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    train = build_features_bool_q(train_path)
    val = build_features_bool_q(val_path)
    test = build_features_bool_q(test_path)
    # Use the BoolQ fitter rather than the COPA one, so this section does not
    # depend on the COPA cell having been executed.
    clf = fit_bool_q(*train)
    return clf, {
        "train": clf.score(*train),
        "val": clf.score(*val),
        "test": clf.score(*test)
    }

In [74]:
clf, bool_q_scores = eval_bool_q(train_path, val_path, test_path)

In [75]:
bool_q_scores

{'train': 0.7321428571428571,
 'val': 0.6644067796610169,
 'test': 0.6847457627118644}

In [76]:
all_results["name"].append("BoolQ")
all_results["train"].append(bool_q_scores["train"])
all_results["val"].append(bool_q_scores["val"])
all_results["test"].append(bool_q_scores["test"])

### RTE

In [77]:
train_path = data_dir / "RTE/train.jsonl"
val_path = data_dir / "RTE/val.jsonl"
test_path = data_dir / "RTE/test.jsonl"

In [78]:
import codecs
import json


def build_feature_rte(row):
    """Turn one RTE record into a ``(text, label)`` pair.

    The text is the stripped premise and the hypothesis separated by a single
    space.
    """
    premise = str(row["premise"]).strip()
    hypothesis = row["hypothesis"]
    label = row["label"]
    text = "{} {}".format(premise, hypothesis)
    return text, label


def build_features_rte(path):
    """Read an RTE ``.jsonl`` file and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_rte`` and the resulting texts are
    transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        content = reader.read()
        rows = [json.loads(raw) for raw in content.split("\n") if raw]
    pairs = [build_feature_rte(row) for row in rows]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return vect.transform(texts), labels


def fit_rte(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the RTE features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_rte(train_path, val_path, test_path):
    """Train on the RTE train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    splits = {
        "train": build_features_rte(train_path),
        "val": build_features_rte(val_path),
        "test": build_features_rte(test_path),
    }
    clf = fit_rte(*splits["train"])
    scores = {
        name: clf.score(features, labels)
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [79]:
clf, rte_scores = eval_rte(train_path, val_path, test_path)

In [80]:
rte_scores

{'train': 0.7152140672782875,
 'val': 0.46579804560260585,
 'test': 0.4715447154471545}

In [81]:
all_results["name"].append("RTE")
all_results["train"].append(rte_scores["train"])
all_results["val"].append(rte_scores["val"])
all_results["test"].append(rte_scores["test"])

### WINOGRAD

In [82]:
train_path = data_dir / "WINOGRAD/train.jsonl"
val_path = data_dir / "WINOGRAD/val.jsonl"
test_path = data_dir / "WINOGRAD/test.jsonl"

In [83]:
import codecs
import json


def build_feature_winograd(row):
    """Turn one WINOGRAD record into a ``(text, label)`` pair.

    The text is the stripped ``row["text"]`` followed by the two span texts
    found under ``row["target"]``, separated by single spaces.
    """
    sentence = str(row["text"]).strip()
    target = row["target"]
    first_span = target["span1_text"]
    second_span = target["span2_text"]
    label = row["label"]
    return "{} {} {}".format(sentence, first_span, second_span), label


def build_features_winograd(path):
    """Read a WINOGRAD ``.jsonl`` file and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_winograd`` and the resulting texts are
    transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        raw_lines = reader.read().split("\n")
        rows = [json.loads(raw) for raw in raw_lines if raw]
    texts, labels = [], []
    for text, label in map(build_feature_winograd, rows):
        texts.append(text)
        labels.append(label)
    return vect.transform(texts), labels


def fit_winograd(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the WINOGRAD features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_winograd(train_path, val_path, test_path):
    """Train on the WINOGRAD train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    splits = {
        "train": build_features_winograd(train_path),
        "val": build_features_winograd(val_path),
        "test": build_features_winograd(test_path),
    }
    clf = fit_winograd(*splits["train"])
    scores = {
        name: clf.score(features, labels)
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [84]:
clf, winograd_scores = eval_winograd(train_path, val_path, test_path)

In [85]:
winograd_scores

{'train': 0.5115511551155115,
 'val': 0.553921568627451,
 'test': 0.6623376623376623}

In [86]:
all_results["name"].append("WINOGRAD")
all_results["train"].append(winograd_scores["train"])
all_results["val"].append(winograd_scores["val"])
all_results["test"].append(winograd_scores["test"])

### WiC

In [87]:
train_path = data_dir / "WiC/train.jsonl"
val_path = data_dir / "WiC/val.jsonl"
test_path = data_dir / "WiC/test.jsonl"

In [88]:
import codecs
import json


def build_feature_wic(row):
    """Turn one WiC record into a ``(text, label)`` pair.

    Both sentences and the target word are stripped and joined with single
    spaces in the order sentence1, sentence2, word.
    """
    first_sentence = row["sentence1"].strip()
    second_sentence = row["sentence2"].strip()
    target_word = row["word"].strip()
    label = row["label"]
    return " ".join((first_sentence, second_sentence, target_word)), label


def build_features_wic(path):
    """Read a WiC ``.jsonl`` file and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_wic`` and the resulting texts are
    transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        content = reader.read()
        rows = [json.loads(raw) for raw in content.split("\n") if raw]
    pairs = [build_feature_wic(row) for row in rows]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return vect.transform(texts), labels


def fit_wic(train, labels):
    """Fit a default-configured ``LogisticRegression`` on the WiC features and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_wic(train_path, val_path, test_path):
    """Train on the WiC train split and score all three splits.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to the value of ``clf.score`` on that split.
    """
    splits = {
        "train": build_features_wic(train_path),
        "val": build_features_wic(val_path),
        "test": build_features_wic(test_path),
    }
    clf = fit_wic(*splits["train"])
    scores = {
        name: clf.score(features, labels)
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [89]:
clf, wic_scores = eval_wic(train_path, val_path, test_path)

In [90]:
wic_scores

{'train': 0.7103552532123961,
 'val': 0.6653733098177542,
 'test': 0.6694922228623159}

In [91]:
all_results["name"].append("WiC")
all_results["train"].append(wic_scores["train"])
all_results["val"].append(wic_scores["val"])
all_results["test"].append(wic_scores["test"])

### diagnostics

In [92]:
from sklearn.metrics import matthews_corrcoef

In [93]:
train_path = data_dir / "RTE/train.jsonl"
val_path = data_dir / "RTE/val.jsonl"
test_path = data_dir / "diagnostics/AX-b-edited.jsonl"

In [94]:
import codecs
import json


def build_feature_diagnostics(row):
    """Turn one RTE-style or diagnostics-style record into ``(text, label)``.

    Records that carry a non-``None`` ``"sentence1"`` are read through
    ``sentence1``/``sentence2``; all others through ``premise``/``hypothesis``.
    The first field is stripped and joined to the second with one space.
    """
    if row.get("sentence1") is None:
        first_key, second_key = "premise", "hypothesis"
    else:
        first_key, second_key = "sentence1", "sentence2"
    premise = str(row[first_key]).strip()
    hypothesis = row[second_key]
    label = row["label"]
    return "{} {}".format(premise, hypothesis), label


def build_features_diagnostics(path):
    """Read a ``.jsonl`` file for the diagnostics run and return ``(features, labels)``.

    Non-empty lines of the UTF-8 file (BOM tolerated) are parsed as JSON,
    converted with ``build_feature_diagnostics`` and the resulting texts are
    transformed by the shared ``vect``.
    """
    with codecs.open(path, encoding='utf-8-sig') as reader:
        raw_lines = reader.read().split("\n")
        rows = [json.loads(raw) for raw in raw_lines if raw]
    texts, labels = [], []
    for text, label in map(build_feature_diagnostics, rows):
        texts.append(text)
        labels.append(label)
    return vect.transform(texts), labels


def fit_diagnostics(train, labels):
    """Fit a default-configured ``LogisticRegression`` for the diagnostics run and return it."""
    model = LogisticRegression()
    model.fit(train, labels)  # fit() returns the estimator itself
    return model


def eval_diagnostics(train_path, val_path, test_path):
    """Train on the train split and report Matthews correlation on every split.

    Returns:
        ``(clf, scores)`` where ``scores`` maps ``"train"``, ``"val"`` and
        ``"test"`` to ``matthews_corrcoef(labels, clf.predict(features))``.
    """
    splits = {
        "train": build_features_diagnostics(train_path),
        "val": build_features_diagnostics(val_path),
        "test": build_features_diagnostics(test_path),
    }
    clf = fit_diagnostics(*splits["train"])
    scores = {
        name: matthews_corrcoef(labels, clf.predict(features))
        for name, (features, labels) in splits.items()
    }
    return clf, scores

In [95]:
clf, diagnostics_scores = eval_diagnostics(train_path, val_path, test_path)

In [96]:
diagnostics_scores

{'train': 0.4294719661883857,
 'val': -0.06835232958984723,
 'test': 0.05974021843803689}

In [97]:
all_results["name"].append("diagnostics")
all_results["train"].append(diagnostics_scores["train"])
all_results["val"].append(diagnostics_scores["val"])
all_results["test"].append(diagnostics_scores["test"])

### ReCoRD

In [98]:
train_path = data_dir / "ReCoRD/train.jsonl"
val_path = data_dir / "ReCoRD/dev.jsonl"
test_path = data_dir / "ReCoRD/test.jsonl"

In [99]:
from sklearn.metrics.pairwise import cosine_similarity
import jsonlines
import numpy as np

In [119]:
"""
Official evaluation script for ReCoRD v1.0.
(Some functions are adopted from the SQuAD evaluation script.)
"""

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lowercase ``s``, delete ASCII punctuation and collapse whitespace.

    Only the characters in ``string.punctuation`` are removed; other
    punctuation marks (for example guillemets) are kept. No articles are
    removed.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = s.lower()
    without_punctuation = lowered.translate(punctuation_table)
    return " ".join(without_punctuation.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and ground truth.

    Both strings go through ``normalize_answer`` and are split on whitespace;
    the overlap is the multiset intersection of the tokens. Returns the
    integer ``0`` when no token is shared.
    """
    predicted_tokens = normalize_answer(prediction).split()
    expected_tokens = normalize_answer(ground_truth).split()
    shared = Counter(predicted_tokens) & Counter(expected_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(expected_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return ``True`` when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the best ``metric_fn(prediction, truth)`` over all ground truths.

    Raises:
        ValueError: if ``ground_truths`` is empty (``max`` of an empty list).
    """
    return max([metric_fn(prediction, truth) for truth in ground_truths])

def evaluate(dataset, predictions):
    """Score ReCoRD predictions against their passages.

    ``predictions`` and ``dataset`` are walked in lockstep (``zip`` stops at
    the shorter of the two). Each prediction is a mapping whose ``"label"``
    string is compared with the answer texts of every question in the paired
    passage's ``"qas"`` list; a question receives the best score over its
    answers.

    Returns:
        ``(exact_match, f1)`` averaged over all questions seen, or
        ``(0.0, 0.0)`` when no question was seen at all.
    """
    f1 = exact_match = total = 0
    for prediction, passage in tqdm_notebook(zip(predictions, dataset)):
        predicted_text = prediction["label"]
        # The same predicted string is scored against each question of the passage.
        for qa in passage['qas']:
            total += 1
            ground_truths = [answer['text'] for answer in qa['answers']]
            exact_match += metric_max_over_ground_truths(exact_match_score, predicted_text, ground_truths)
            f1 += metric_max_over_ground_truths(f1_score, predicted_text, ground_truths)

    if total == 0:
        # Nothing to average over; avoid a ZeroDivisionError on empty input.
        return 0.0, 0.0
    return exact_match / total, f1 / total


In [120]:
from tqdm import tqdm_notebook

In [121]:
def eval_record(train_path, val_path, test_path):
    """Run ``eval_part`` on each ReCoRD split.

    Returns:
        ``(None, scores)``: the first element is always ``None`` and
        ``scores`` maps ``"train"``, ``"val"`` and ``"test"`` to the result of
        ``eval_part`` for that file.
    """
    scores = {}
    for split, path in (("train", train_path), ("val", val_path), ("test", test_path)):
        scores[split] = eval_part(path)
    return None, scores


def eval_part(path):
    """Predict an answer for every record of one ReCoRD file and score the result.

    Reads all records with ``jsonlines``, builds one ``{"idx", "label"}``
    prediction per record via ``get_row_pred`` and passes records and
    predictions to ``evaluate``.
    """
    with jsonlines.open(path) as reader:
        records = list(reader)

    def _predict(record):
        label = get_row_pred(record)
        return {"idx": record["idx"], "label": label}

    progress = tqdm_notebook(records, total=len(records), leave=False)
    predictions = [_predict(record) for record in progress]
    return evaluate(records, predictions)


def get_row_pred(row, top_n=5):
    """Pick, for each query of a ReCoRD record, the entity most similar to the passage.

    Every entity span of the passage is substituted for ``@placeholder`` in
    the query; the candidate whose ``vect`` representation has the highest
    cosine similarity to the passage (with the ``@highlight`` separators
    replaced by spaces) wins. The winners of all queries are joined with
    spaces and lowercased. ``top_n`` is accepted but not used.
    """
    passage_text = row["passage"]["text"]
    passage_vec = vect.transform([passage_text.replace("\n@highlight\n", " ")])
    # Entity offsets index into the unmodified passage text.
    entities = [
        passage_text[span["start"]: span["end"]]
        for span in row["passage"]["entities"]
    ]
    chosen = []
    for qa in row["qas"]:
        candidates = [qa["query"].replace("@placeholder", entity) for entity in entities]
        similarity = cosine_similarity(passage_vec, vect.transform(candidates))
        best_index = similarity.argsort()[0][-1]
        chosen.append(np.array(entities)[best_index])
    return " ".join(chosen).lower()

In [122]:
# {"source": "lenta", "passage": {"text": "Против министра здравоохранения Саратовской области Натальи Мазиной возбуждено уголовное дело. Об этом в пятницу, 17 января,   сообщается на сайте управления Следственного комитета России (СКР) по региону. Против Мазиной и других сотрудников регионального ведомства следствие возбудило дело о злоупотреблении должностными полномочиями. По данным СКР, осенью 2018 года министерство здравоохранения региона заключило три госконтракта на поставку 18 аппаратов УЗИ ненадлежащего качества. Мазина знала об этом, однако воспользовалась своим служебным положением дала указание врачам принять медицинские изделия. Позже глава министерства организовала оплату данного оборудования, причинив городу ущерб в 53 миллиона рублей.\n@highlight\nВ Москве врач-анестезиолог изнасиловал пациентку после операции\n@highlight\nСкворцова поспорила с Голиковой об «ужасной» оптимизации здравоохранения\n@highlight\nРастративший два миллиарда рублей российский экс-чиновник пустился в бега", "entities": [{"start": 32, "end": 51}, {"start": 52, "end": 67}, {"start": 189, "end": 192}, {"start": 213, "end": 220}, {"start": 346, "end": 349}, {"start": 485, "end": 491}, {"start": 731, "end": 737}, {"start": 804, "end": 813}, {"start": 826, "end": 835}]}, "qas": [{"query": "Как сообщил «Ленте.ру» источник в правоохранительных органах, @placeholder утверждает, что все расценки и другую информацию она получила от Москвы.", "answers": [{"start": 52, "end": 67, "text": "Натальи Мазиной"}, {"start": 213, "end": 220, "text": "Мазиной"}, {"start": 485, "end": 491, "text": "Мазина"}], "idx": 4369}], "idx": 4369}

In [123]:
clf, record_scores = eval_record(train_path, val_path, test_path)

HBox(children=(IntProgress(value=0, max=72193), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7577), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7256), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [124]:
record_scores

{'train': (0.20824733699943207, 0.2263773335525391),
 'val': (0.22951036030091065, 0.23539879459768612),
 'test': (0.25151598676957, 0.2564189898671709)}

In [125]:
all_results["name"].append("Record")
all_results["train"].append(record_scores["train"])
all_results["val"].append(record_scores["val"])
all_results["test"].append(record_scores["test"])

### MultiRC

In [126]:
train_path = data_dir / "MultiRC/train.jsonl"
val_path = data_dir / "MultiRC/val.jsonl"
test_path = data_dir / "MultiRC/test.jsonl"

In [127]:
import functools

class MultiRCMetrics:
    """Precision/recall/F1 and exact-match metrics over per-question answer vectors.

    In every method ``dataset`` holds the predicted vectors and
    ``output_map`` the reference vectors found at the same positions.
    """

    @staticmethod
    def _f1(precision, recall):
        """Harmonic mean of precision and recall; 0.0 when both are zero."""
        denominator = precision + recall
        if denominator == 0:
            return 0.0
        return 2 * recall * precision / denominator

    @staticmethod
    def per_question_metrics(dataset, output_map):
        """Return ``[precision, recall, f1]`` with precision/recall averaged per question.

        A question with no predicted positives counts as precision 1.0 and a
        question with no correct positives as recall 1.0. F1 is computed from
        the two averages.

        Raises:
            AssertionError: if the predicted scores of a question do not sum
                to a whole number.
        """
        P = []
        R = []
        for n, predictedAns in enumerate(dataset):
            correctAns = output_map[n]
            predictCount = sum(predictedAns)
            correctCount = sum(correctAns)
            # The sum of the predicted scores has to be integral.
            assert float(predictCount).is_integer(), "sum of the scores: " + str(predictCount)
            agreementCount = sum([a * b for (a, b) in zip(correctAns, predictedAns)])
            p1 = (1.0 * agreementCount / predictCount) if predictCount > 0.0 else 1.0
            r1 = (1.0 * agreementCount / correctCount) if correctCount > 0.0 else 1.0
            P.append(p1)
            R.append(r1)

        pAvg = MultiRCMetrics.avg(P)
        rAvg = MultiRCMetrics.avg(R)
        f1Avg = MultiRCMetrics._f1(pAvg, rAvg)
        return [pAvg, rAvg, f1Avg]

    @staticmethod
    def exact_match_metrics_origin(dataset, output_map, delta):
        """Fraction of questions whose summed absolute difference to the reference is ``<= delta``."""
        EM = []
        for n, predictedAns in enumerate(dataset):
            correctAns = output_map[n]
            distance = sum([abs(i - j) for i, j in zip(correctAns, predictedAns)])
            EM.append(1.0 if distance <= delta else 0.0)
        return MultiRCMetrics.avg(EM)

    @staticmethod
    def exact_match_simple(dataset, output_map):
        """Fraction of questions whose predicted vector equals the reference vector."""
        EM = []
        for n, predictedAns in enumerate(dataset):
            correctAns = output_map[n]
            EM.append(1 if predictedAns == correctAns else 0)
        return sum(EM) / len(EM)

    @staticmethod
    def per_dataset_metric(dataset, output_map):
        """Return ``[precision, recall, f1]`` from counts pooled over all questions.

        Example inputs::

            dataset = [[0,1,1], [0,1]]
            output_map = [[0,1,0], [0,1]]

        Precision (resp. recall) is 1.0 when nothing was predicted (resp.
        nothing is correct); F1 is 0.0 when precision and recall are both 0.
        """
        agreementCount = 0
        correctCount = 0
        predictCount = 0
        for n, predictedAns in enumerate(dataset):
            correctAns = output_map[n]
            predictCount += sum(predictedAns)
            correctCount += sum(correctAns)
            agreementCount += sum([a * b for (a, b) in zip(correctAns, predictedAns)])

        p1 = (1.0 * agreementCount / predictCount) if predictCount > 0.0 else 1.0
        r1 = (1.0 * agreementCount / correctCount) if correctCount > 0.0 else 1.0
        return [p1, r1, MultiRCMetrics._f1(p1, r1)]

    @staticmethod
    def avg(l):
        """Arithmetic mean of a non-empty sequence."""
        return functools.reduce(lambda x, y: x + y, l) / len(l)

    
def multiRC_metrics(pred, labels):
    """Return ``(exact_match, f1)`` for MultiRC predictions.

    ``exact_match`` is ``MultiRCMetrics.exact_match_metrics_origin`` with a
    tolerance of 0 and ``f1`` is the last element of
    ``MultiRCMetrics.per_dataset_metric``. ``pred`` and ``labels`` are lists
    of per-question vectors in the same order.
    """
    # The metric helpers are static, so no instance is needed.
    em0 = MultiRCMetrics.exact_match_metrics_origin(pred, labels, 0)
    f1a = MultiRCMetrics.per_dataset_metric(pred, labels)[-1]
    return em0, f1a


# Alias so that code referring to the metrics class as `Measures` resolves to MultiRCMetrics.
Measures = MultiRCMetrics

In [128]:
def eval_multirc(train_path, val_path, test_path):
    """Run ``eval_part_multirc`` on each MultiRC split.

    Returns:
        ``(None, scores)``: the first element is always ``None`` and
        ``scores`` maps ``"train"``, ``"val"`` and ``"test"`` to the result of
        ``eval_part_multirc`` for that file.
    """
    scores = {}
    for split, path in (("train", train_path), ("val", val_path), ("test", test_path)):
        scores[split] = eval_part_multirc(path)
    return None, scores


def eval_part_multirc(path):
    """Predict answers for every record of one MultiRC file and score them.

    The per-question prediction and label vectors returned by
    ``get_row_pred_multirc`` are concatenated over all records and handed to
    ``multiRC_metrics``.
    """
    with jsonlines.open(path) as reader:
        records = list(reader)
    all_predictions, all_labels = [], []
    for record in records:
        record_predictions, record_labels = get_row_pred_multirc(record)
        all_predictions += record_predictions
        all_labels += record_labels
    return multiRC_metrics(all_predictions, all_labels)


def get_row_pred_multirc(row, top_n=5):
    """Mark, for each question of a MultiRC record, the two answers closest to the passage.

    Each candidate is the question text followed by the answer text; the
    candidates are compared to the passage by cosine similarity of their
    ``vect`` representations, and the (at most) two highest-ranked answers
    get a 1, all others a 0. ``top_n`` is accepted but not used.

    Returns:
        ``(predictions, labels)``: two lists with one 0/1 prediction vector
        and one label vector per question.
    """
    passage_vec = vect.transform([row["passage"]["text"]])
    predictions, gold = [], []
    for question in row["passage"]["questions"]:
        answers = question["answers"]
        question_labels = [answer["label"] for answer in answers]
        candidates = [f"{question['question']} {answer['text']}" for answer in answers]
        similarity = cosine_similarity(passage_vec, vect.transform(candidates))
        top_two = similarity.argsort()[0][-2:]
        predictions.append([int(position in top_two) for position in range(len(answers))])
        gold.append(question_labels)
    return predictions, gold

In [129]:
clf, multirc_scores = eval_multirc(train_path, val_path, test_path)

In [130]:
multirc_scores

{'train': (0.2140077821011673, 0.5475732090384031),
 'val': (0.20982986767485823, 0.5207215992198928),
 'test': (0.24434638720353005, 0.5895127875410773)}

In [131]:
all_results["name"].append("MultiRC")
all_results["train"].append(multirc_scores["train"])
all_results["val"].append(multirc_scores["val"])
all_results["test"].append(multirc_scores["test"])

### Overall results

In [132]:
import pandas as pd

In [133]:
results = pd.DataFrame(all_results)

In [134]:
results

Unnamed: 0,name,train,val,test
0,COPA,0.775,0.45,0.486
1,CommitmentBank,0.742009,0.522727,0.452055
2,BoolQ,0.732143,0.664407,0.684746
3,RTE,0.715214,0.465798,0.471545
4,WINOGRAD,0.511551,0.553922,0.662338
5,WiC,0.710355,0.665373,0.669492
6,diagnostics,0.429472,-0.0683523,0.0597402
7,Record,"(0.20824733699943207, 0.2263773335525391)","(0.22951036030091065, 0.23539879459768612)","(0.25151598676957, 0.2564189898671709)"
8,MultiRC,"(0.2140077821011673, 0.5475732090384031)","(0.20982986767485823, 0.5207215992198928)","(0.24434638720353005, 0.5895127875410773)"


In [135]:
results.to_csv("results.csv", sep="\t")