## Binary structure classification used in tree building: Step 3. BiMPM

Prepare data and model-related scripts.

Evaluate models.

Output:
 - ``models/structure_predictor_bimpm/*``

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

### Make a directory

In [None]:
MODEL_PATH = 'models/structure_predictor_bimpm'
! mkdir $MODEL_PATH

TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_test.tsv')

### Prepare train/test sets

In [None]:
# Directory holding the pickled train/dev/test samples.
IN_PATH = 'data_structure'

# Each pickle is read with pandas; the following cells access the result by column
# name (e.g. 'relation') and call DataFrame methods such as reset_index() on it.
train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

In [None]:
# Absolute frequency of every distinct 'relation' value in the training set.
counts = train_samples['relation'].value_counts(normalize=False).values
NUMBER_CLASSES = len(counts)
print("number of classes:", NUMBER_CLASSES)
# Inverse-frequency weights, scaled so that the rarest class gets weight 1.0.
# NOTE(review): the weights are only printed here; the training config hard-codes its
# own "class_weights" — confirm the two stay in sync.
print("class weights:", np.round(counts.min() / counts, decimals=6))

In [None]:
# Columns written to every TSV file, in this order: the label, the two text snippets,
# the two granularity flags and the row index.
TSV_COLUMNS = ['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'same_paragraph', 'index']

# reset_index() moves the current row index into a regular column, selected below as 'index'.
train_samples = train_samples.reset_index()
train_samples[TSV_COLUMNS].to_csv(TRAIN_FILE_PATH, sep='\t', header=False, index=False)

dev_samples = dev_samples.reset_index()
dev_samples[TSV_COLUMNS].to_csv(DEV_FILE_PATH, sep='\t', header=False, index=False)

test_samples = test_samples.reset_index()
test_samples[TSV_COLUMNS].to_csv(TEST_FILE_PATH, sep='\t', header=False, index=False)

### Customize the BiMPM model by adding extra inputs

In [None]:
! rm -r models/bimpm_custom_package
! mkdir models/bimpm_custom_package
! touch models/bimpm_custom_package/__init__.py
! mkdir models/bimpm_custom_package/tokenizers
! mkdir models/bimpm_custom_package/dataset_readers
! mkdir models/bimpm_custom_package/model

In [None]:
%%writefile models/bimpm_custom_package/dataset_readers/__init__.py

# Importing these modules runs their @register decorators, which makes
# "whitespace_tokenizer" and "custom_pairs_reader" available to AllenNLP by name.
# The first form applies when bimpm_custom_package is importable as a top-level package
# (e.g. the working directory is models/); the fallback applies when it has to be reached
# through the enclosing `models` package.
try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from bimpm_custom_package.dataset_readers.custom_reader import CustomDataReader
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from models.bimpm_custom_package.dataset_readers.custom_reader import CustomDataReader

In [None]:
%%writefile models/bimpm_custom_package/tokenizers/whitespace_tokenizer.py

from allennlp.data.tokenizers import Tokenizer
from allennlp.data.tokenizers import Token, Tokenizer, CharacterTokenizer, WordTokenizer
from overrides import overrides
from typing import Dict, List


@Tokenizer.register("whitespace_tokenizer")
class WhitespaceTokenizer(Tokenizer):
    """Tokenizer that only splits the text on whitespace."""

    def __init__(self) -> None:
        super().__init__()

    def _tokenize(self, text):
        # str.split() without arguments splits on any run of whitespace
        # and never produces empty pieces.
        pieces = text.split()
        return list(map(Token, pieces))

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        """Return one ``Token`` per whitespace-separated piece of ``text``."""
        return self._tokenize(text)

In [None]:
%%writefile models/bimpm_custom_package/dataset_readers/custom_reader.py

import csv
import logging
from typing import Dict

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer
from overrides import overrides

try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

import numpy as np

logger = logging.getLogger(__name__)


@DatasetReader.register("custom_pairs_reader")
class CustomDataReader(DatasetReader):
    """
    Reads text pairs with two additional scalar features from a tab-separated file.

    Every valid row has exactly six columns::

        label <TAB> premise <TAB> hypothesis <TAB> same_sentence <TAB> same_paragraph <TAB> index

    The sixth column is only used to validate the row width; it is not turned into a field.
    Rows with a different number of columns are skipped (the number of skipped rows is logged).

    # Parameters
    tokenizer : `Tokenizer`, optional
        Tokenizer to use to split the premise and hypothesis into words or other kinds of tokens.
        Defaults to `WhitespaceTokenizer`.
    token_indexers : `Dict[str, TokenIndexer]`, optional
        Indexers used to define input token representations. Defaults to `{"tokens":
        SingleIdTokenIndexer()}`.
    lazy : `bool`, optional (default = True)
        Forwarded unchanged to `DatasetReader`.
    """

    # Number of tab-separated columns in a valid row.
    _NUM_COLUMNS = 6

    def __init__(
            self, tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None,
            lazy: bool = True) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @staticmethod
    def _feature_array(value) -> np.ndarray:
        """
        Convert one extra feature into a numpy array for an `ArrayField`.

        Strings and plain numbers become a float32 array of shape ``(1, 1)``; this is the shape
        the previous ``zip(*value)`` transposition produced for a one-character string such as
        ``"1"``. Unlike that transposition, values like ``"1.0"``, ``"10"``, ``"True"`` or an
        ``int`` are read as ONE number instead of being split into characters (or failing).
        Any other input (e.g. a nested list) is transposed exactly as before.

        # Raises
        ValueError
            If a string value is neither a number nor ``true``/``false``.
        """
        if isinstance(value, str):
            text = value.strip()
            lowered = text.lower()
            if lowered in ("true", "false"):
                number = float(lowered == "true")
            else:
                number = float(text)
        elif isinstance(value, (bool, int, float, np.bool_, np.integer, np.floating)):
            number = float(value)
        else:
            # Historical behaviour for sequence inputs: swap rows and columns.
            return np.array(list(map(list, zip(*value))))

        return np.array([[number]], dtype=np.float32)

    @overrides
    def _read(self, file_path):
        logger.info("Reading instances from lines in file at: %s", file_path)
        skipped = 0
        # newline="" is what the csv module asks for so that it can handle line endings inside
        # quoted fields itself. UTF-8 is stated explicitly instead of relying on the locale
        # (presumably the files come from pandas' to_csv, which writes UTF-8 by default).
        with open(cached_path(file_path), "r", encoding="utf-8", newline="") as data_file:
            tsv_in = csv.reader(data_file, delimiter="\t")
            for row in tsv_in:
                if len(row) != self._NUM_COLUMNS:
                    skipped += 1
                    continue
                yield self.text_to_instance(premise=row[1], hypothesis=row[2], label=row[0],
                                            same_sentence=row[3], same_paragraph=row[4])

        if skipped:
            # Skipped rows make the predictions shorter than the input file, so report them.
            logger.warning("Skipped %d row(s) without exactly %d columns in %s",
                           skipped, self._NUM_COLUMNS, file_path)

    @overrides
    def text_to_instance(
            self,  # type: ignore
            premise: str,
            hypothesis: str,
            label: str,
            same_sentence: str,
            same_paragraph: str,
    ) -> Instance:
        """
        Build an `Instance` with the fields ``premise``, ``hypothesis``, ``same_sentence``,
        ``same_paragraph`` and, when ``label`` is not None, ``label``.
        """
        fields: Dict[str, Field] = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
        fields["premise"] = TextField(tokenized_premise, self._token_indexers)
        fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)
        fields["same_sentence"] = ArrayField(self._feature_array(same_sentence))
        fields["same_paragraph"] = ArrayField(self._feature_array(same_paragraph))
        if label is not None:
            fields["label"] = LabelField(label)

        return Instance(fields)

In [None]:
%%writefile models/bimpm_custom_package/model/__init__.py

# Importing these modules runs their @register decorators (tokenizer, models, predictor).
# The first form applies when bimpm_custom_package is importable as a top-level package;
# the fallback reaches it through the enclosing `models` package.
# NOTE(review): model/multiclass_bimpm.py is not written by any cell visible in this notebook.
# If that file is missing, both import attempts below end in ModuleNotFoundError — confirm
# it is created elsewhere before this package is imported.
try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from bimpm_custom_package.model.custom_bimpm import BiMpm as CustomBiMpm
    from bimpm_custom_package.model.multiclass_bimpm import BiMpm as MulticlassBiMpm
    from bimpm_custom_package.model.custom_bimpm_predictor import CustomBiMPMPredictor
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from models.bimpm_custom_package.model.custom_bimpm import BiMpm as CustomBiMpm
    from models.bimpm_custom_package.model.multiclass_bimpm import BiMpm as MulticlassBiMpm
    from models.bimpm_custom_package.model.custom_bimpm_predictor import CustomBiMPMPredictor

In [None]:
%%writefile models/bimpm_custom_package/model/custom_bimpm.py

"""
BiMPM (Bilateral Multi-Perspective Matching) model implementation.
"""

from typing import Dict, Optional, List, Any

from overrides import overrides
import torch
import numpy

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure

from allennlp.modules.bimpm_matching import BiMpmMatching

from allennlp.nn.util import get_text_field_mask
import torch.nn.functional as F


@Model.register("custom_bimpm")
class BiMpm(Model):
    """
    This ``Model`` augments with additional features the BiMPM model described in `Bilateral Multi-Perspective
    Matching for Natural Language Sentences <https://arxiv.org/abs/1702.03814>`_ by Zhiguo Wang et al., 2017,
    implemented in `<https://github.com/galsang/BIMPM-pytorch>`_.
    Additional features are added before the feedforward classifier.

    Parameters
    ----------
    vocab : ``Vocabulary``
    text_field_embedder : ``TextFieldEmbedder``
        Embeds the premise and the hypothesis.
    matcher_word : ``BiMpmMatching``
        Matching applied directly to the embedded texts.
    encoder1, encoder2 : ``Seq2SeqEncoder``
        Stacked encoders. The last dimension of each output is split in half in ``forward``:
        the first half goes to the ``matcher_forward*`` layer, the second half to ``matcher_backward*``.
    matcher_forward1, matcher_backward1, matcher_forward2, matcher_backward2 : ``BiMpmMatching``
        Matching layers for the two halves of the two encoder outputs.
    aggregator : ``Seq2VecEncoder``
        Turns the concatenated matching vectors of one text into a single vector.
    classifier_feedforward : ``FeedForward``
        Receives ``[aggregated premise, aggregated hypothesis, same_sentence, same_paragraph]``.
    encode_together : ``bool``
        If True, premise and hypothesis are concatenated along the token dimension and embedded in one pass.
    encode_lstm : ``bool``
        If False, only ``matcher_word`` contributes matching vectors; the encoders and the four
        encoder-level matchers are not applied.
    dropout : ``float``
        Dropout probability used throughout ``forward``.
    class_weights : ``list``
        Per-class weights for the cross-entropy loss; empty means weight 1.0 for every class.
        (The default list is never mutated or stored.)
    initializer : ``InitializerApplicator``
    regularizer : ``RegularizerApplicator``, optional
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 matcher_word: BiMpmMatching,
                 encoder1: Seq2SeqEncoder,
                 matcher_forward1: BiMpmMatching,
                 matcher_backward1: BiMpmMatching,
                 encoder2: Seq2SeqEncoder,
                 matcher_forward2: BiMpmMatching,
                 matcher_backward2: BiMpmMatching,
                 aggregator: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 encode_together: bool = False,
                 encode_lstm: bool = True,
                 dropout: float = 0.1,
                 class_weights: list = [],
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BiMpm, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder

        self.matcher_word = matcher_word

        self.encoder1 = encoder1
        self.matcher_forward1 = matcher_forward1
        self.matcher_backward1 = matcher_backward1

        self.encoder2 = encoder2
        self.matcher_forward2 = matcher_forward2
        self.matcher_backward2 = matcher_backward2

        self.aggregator = aggregator

        self.encode_together = encode_together
        self.encode_lstm = encode_lstm

        # The aggregator consumes the concatenation of all matching vectors that ``forward``
        # produces, so its input size must equal the sum of the active matchers' output sizes.
        matching_dim = self.matcher_word.get_output_dim()

        if self.encode_lstm:
            matching_dim += self.matcher_forward1.get_output_dim(
            ) + self.matcher_backward1.get_output_dim(
            ) + self.matcher_forward2.get_output_dim(
            ) + self.matcher_backward2.get_output_dim(
            )

        check_dimensions_match(matching_dim, self.aggregator.get_input_dim(),
                               "sum of dim of all matching layers", "aggregator input dim")

        self.classifier_feedforward = classifier_feedforward

        self.dropout = torch.nn.Dropout(dropout)

        if class_weights:
            self.class_weights = class_weights
        else:
            # No weights given: every output class counts equally.
            self.class_weights = [1.] * self.classifier_feedforward.get_output_dim()

        # F1 is tracked for the label with index 1.
        self.metrics = {"accuracy": CategoricalAccuracy(),
                        "f1": F1Measure(1)}

        self.loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(self.class_weights))

        initializer(self)

    @overrides
    def forward(self,  # type: ignore
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                same_sentence: torch.Tensor,
                same_paragraph: torch.Tensor,
                label: torch.LongTensor = None,  # pylint:disable=unused-argument
                ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            The premise from a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            The hypothesis from a ``TextField``
        same_sentence : torch.Tensor
            Additional feature of the pair; cast to float, reshaped to ``(batch_size, -1)`` and
            appended to the classifier input.
        same_paragraph : torch.Tensor
            Additional feature of the pair, handled like ``same_sentence``.
        label : torch.LongTensor, optional (default = None)
            The label for the pair of the premise and the hypothesis
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            The output of the feedforward classifier (unnormalised scores, one per label).
        probs : torch.FloatTensor
            Softmax of ``logits`` over the last dimension.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised; only present when ``label`` is given.
        """

        def encode_pair(x1, x2, mask1=None, mask2=None):
            # Concatenate both texts along the token dimension, embed them in a single pass
            # and cut the result back into its premise and hypothesis parts.
            joined_pair: Dict[str, torch.LongTensor] = {}

            for key in x1.keys():
                x1_len, x2_len = x1[key].size(1), x2[key].size(1)
                joined_pair[key] = torch.cat([x1[key], x2[key]], dim=1)

            x_output = self.dropout(self.text_field_embedder(joined_pair))
            return x_output[:, :x1_len], x_output[:, -x2_len:], mask1, mask2

        mask_premise = util.get_text_field_mask(premise)
        mask_hypothesis = util.get_text_field_mask(hypothesis)

        if self.encode_together:
            embedded_premise, embedded_hypothesis, _, _ = encode_pair(premise, hypothesis)
        else:
            embedded_premise = self.dropout(self.text_field_embedder(premise))
            embedded_hypothesis = self.dropout(self.text_field_embedder(hypothesis))

        if self.encode_lstm:
            # encoding of the premise
            encoded_premise1 = self.dropout(self.encoder1(embedded_premise, mask_premise))
            encoded_premise2 = self.dropout(self.encoder2(encoded_premise1, mask_premise))

            # encoding of the hypothesis
            encoded_hypothesis1 = self.dropout(self.encoder1(embedded_hypothesis, mask_hypothesis))
            encoded_hypothesis2 = self.dropout(self.encoder2(encoded_hypothesis1, mask_hypothesis))

        matching_vector_premise: List[torch.Tensor] = []
        matching_vector_hypothesis: List[torch.Tensor] = []

        def add_matching_result(matcher, encoded_premise, encoded_hypothesis):
            # utility function to get matching result and add to the result list
            matching_result = matcher(encoded_premise, mask_premise, encoded_hypothesis, mask_hypothesis)
            matching_vector_premise.extend(matching_result[0])
            matching_vector_hypothesis.extend(matching_result[1])

        # matching vectors from the word embeddings are always computed
        add_matching_result(self.matcher_word, embedded_premise, embedded_hypothesis)

        # The encoder-level matchers are applied only when ``encode_lstm`` is set. This mirrors
        # the constructor, which counts their output sizes towards the aggregator input only in
        # that case; applying them unconditionally made ``encode_lstm=False`` fail at the aggregator.
        if self.encode_lstm:
            half_hidden_size_1 = self.encoder1.get_output_dim() // 2
            add_matching_result(self.matcher_forward1,
                                encoded_premise1[:, :, :half_hidden_size_1],
                                encoded_hypothesis1[:, :, :half_hidden_size_1])
            add_matching_result(self.matcher_backward1,
                                encoded_premise1[:, :, half_hidden_size_1:],
                                encoded_hypothesis1[:, :, half_hidden_size_1:])

            half_hidden_size_2 = self.encoder2.get_output_dim() // 2
            add_matching_result(self.matcher_forward2,
                                encoded_premise2[:, :, :half_hidden_size_2],
                                encoded_hypothesis2[:, :, :half_hidden_size_2])
            add_matching_result(self.matcher_backward2,
                                encoded_premise2[:, :, half_hidden_size_2:],
                                encoded_hypothesis2[:, :, half_hidden_size_2:])

        # concat the matching vectors
        matching_vector_cat_premise = self.dropout(torch.cat(matching_vector_premise, dim=2))
        matching_vector_cat_hypothesis = self.dropout(torch.cat(matching_vector_hypothesis, dim=2))

        # aggregate the matching vectors
        aggregated_premise = self.dropout(self.aggregator(matching_vector_cat_premise, mask_premise))
        aggregated_hypothesis = self.dropout(self.aggregator(matching_vector_cat_hypothesis, mask_hypothesis))

        # encode additional information: flatten each extra feature to (batch_size, -1)
        batch_size, _ = aggregated_premise.size()
        encoded_same_sentence = same_sentence.float().view(batch_size, -1)
        encoded_same_paragraph = same_paragraph.float().view(batch_size, -1)

        # the final forward layer
        logits = self.classifier_feedforward(
            torch.cat([aggregated_premise,
                       aggregated_hypothesis,
                       encoded_same_sentence,
                       encoded_same_paragraph], dim=-1))

        probs = torch.nn.functional.softmax(logits, dim=-1)

        output_dict = {'logits': logits, "probs": probs}

        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Converts indices to string labels, and adds a ``"label"`` key to the result.
        """
        predictions = output_dict["probs"].cpu().data.numpy()
        argmax_indices = numpy.argmax(predictions, axis=-1)
        labels = [self.vocab.get_token_from_index(x, namespace="labels")
                  for x in argmax_indices]
        output_dict['label'] = labels
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Returns the tracked ``"f1"`` and ``"accuracy"`` values.

        ``"f1"`` is element 2 of what ``F1Measure.get_metric`` returns
        (presumably ``(precision, recall, f1)`` — confirm for the installed AllenNLP version).
        """
        return {
            "f1": self.metrics["f1"].get_metric(reset=reset)[2],
            "accuracy": self.metrics["accuracy"].get_metric(reset=reset)
        }


In [None]:
%%writefile models/bimpm_custom_package/model/custom_bimpm_predictor.py

from allennlp.common import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.models import Model
from allennlp.predictors import Predictor
from allennlp.predictors.decomposable_attention import DecomposableAttentionPredictor
from overrides import overrides

from allennlp.data.tokenizers import Tokenizer
from allennlp.data.tokenizers import Token, Tokenizer, CharacterTokenizer, WordTokenizer
from overrides import overrides
from typing import Dict, List, Tuple

try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

# You need to name your predictor and register so that `allennlp` command can recognize it
# Note that you need to use "@Predictor.register", not "@Model.register"!
@Predictor.register("custom_bimpm_predictor")
class CustomBiMPMPredictor(DecomposableAttentionPredictor):
    """Predictor that passes a text pair plus its two extra features to the dataset reader."""

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = WhitespaceTokenizer()

    def predict(self, premise: str, hypothesis: str, same_sentence: str, same_paragraph: str) -> JsonDict:
        """Bundle the four inputs into a JSON dict and run ``predict_json`` on it."""
        payload = {"premise": premise, "hypothesis": hypothesis,
                   "same_sentence": same_sentence, "same_paragraph": same_paragraph}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like
        `{"premise": "...", "hypothesis": "...", "same_sentence": "...", "same_paragraph": "..."}`.
        No label is passed on, so the resulting instance carries no label field from here.
        """
        return self._dataset_reader.text_to_instance(
            json_dict["premise"],
            json_dict["hypothesis"],
            label=None,
            same_sentence=json_dict["same_sentence"],
            same_paragraph=json_dict["same_paragraph"])

###  Generate config file

In [None]:
# Show the three TSV paths (relative to the notebook's working directory); compare them with
# the paths used in the config below, which are given without the leading "models/".
print(TRAIN_FILE_PATH)
print(DEV_FILE_PATH)
print(TEST_FILE_PATH)

In [None]:
%%writefile $MODEL_PATH/config_elmo.json

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.
// (Augmented with additional granularity related features)

local NUM_EPOCHS = 50;
local LR = 1e-3;

{
  "dataset_reader": {
    "type": "custom_pairs_reader",
    "lazy": false,
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters"
     }
    }
  },
  // Data paths have no "models/" prefix: the training script is meant to be run from models/.
  "train_data_path": "structure_predictor_bimpm/structure_cf_train.tsv",
  "validation_data_path": "structure_predictor_bimpm/structure_cf_dev.tsv",
  "model": {
    "type": "custom_bimpm",
    "dropout": 0.5,
    // One loss weight per label index.
    // NOTE(review): the order must follow the label vocabulary built at training time — confirm.
    "class_weights": [0.4, 1.0],
    "encode_together": false,
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": false,
                    "dropout": 0.0
            },
            "token_characters": {
                "type": "character_encoding",
                "dropout": 0.2,
                "embedding": {
                    "embedding_dim": 20,
                    "padding_index": 0
                },
                "encoder": {
                    "type": "gru",
                    "input_size": 20,
                    "hidden_size": 50,
                    "num_layers": 1,
                    "bidirectional": true
              },
            }
      }
    },
    // 100 = bidirectional character GRU (2 x 50); 1024 is presumably the ELMo output size —
    // confirm against rsv_elmo/options.json.
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": 1024+100,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 1024+100,
      "hidden_size": 200,
      "num_layers": 1
    },
    // Each encoder-level matcher sees one half (200) of a bidirectional encoder output (400).
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    // input_size 400 = 2 x 200, the bidirectional output of encoder1.
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 400,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    // input_size must equal the summed output sizes of the five matchers above;
    // the model constructor checks this and fails on a mismatch.
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": 100,
      "num_layers": 1,
    },
    // input_dim = aggregated premise (2 x 100) + aggregated hypothesis (2 x 100)
    //             + same_sentence (1) + same_paragraph (1).
    // The last entry of hidden_dims is the number of output classes.
    "classifier_feedforward": {
      "input_dim": 200+200+1+1,
      "num_layers": 2,
      "hidden_dims": [200, 2],
      "activations": ["relu", "linear"],
      "dropout": [0.5, 0.0]
    },
    "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_normal"}],
      [".*linear_layers.*bias", {"type": "constant", "val": 0}],
      [".*weight_ih.*", {"type": "xavier_normal"}],
      [".*weight_hh.*", {"type": "orthogonal"}],
      [".*bias.*", {"type": "constant", "val": 0}],
      [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
    ]
  },
  "iterator": {
        "type": "bucket",
        "batch_size": 20,
        "padding_noise": 0,
        "sorting_keys": [
            [
                "premise",
                "num_tokens"
            ],
            [
                "hypothesis",
                "num_tokens"
            ]
        ]
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "cuda_device": 0,
    "optimizer": {
      "type": "adam",
      "lr": LR
    },
    "type":"callback",
    "callbacks": [
        {
            "type": "validate"
        },
        {
            "type": "checkpoint",
            "checkpointer":{
                "num_serialized_models_to_keep": 1
            }
        },
        {
            "type": "gradient_norm_and_clip", 
            "grad_norm": 10.0
        },
        // Early stopping on validation F1 (the leading "+" marks a metric to be maximised).
        {
            "type": "track_metrics",
            "patience": 7,
            "validation_metric": "+f1"
        },
        {
            "type": "log_metrics_to_wandb"
        }
    ],
  }
}

In [None]:
! mv ../../../maintenance_rst/models/structure_predictor_bimpm ../../../maintenance_rst/models/structure_predictor_bimpm_OLD

In [None]:
! cp -r models/structure_predictor_bimpm ../../../maintenance_rst/models/structure_predictor_bimpm

### 3. Scripts for training/prediction 

#### Option 1. Directly from the config

Train a model

In [None]:
%%writefile models/train_structure_predictor.sh
# usage:
# $ cd models 
# $ sh train_structure_predictor.sh {bert|elmo} result_directory

# Both arguments are required. With an empty result directory the `rm` below would
# delete the whole structure_predictor_bimpm/ directory, config and data included.
if [ -z "${1}" ] || [ -z "${2}" ]; then
    echo "usage: sh train_structure_predictor.sh {bert|elmo} result_directory" >&2
    exit 1
fi

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="structure_cf_dev.tsv"
export TEST_FILE_PATH="structure_cf_test.tsv"

# Start from a clean serialization directory (-f: no error if it does not exist yet).
rm -rf "structure_predictor_bimpm/${RESULT_DIR}/"
allennlp train -s "structure_predictor_bimpm/${RESULT_DIR}/" "structure_predictor_bimpm/config_${METHOD}.json" \
   --include-package bimpm_custom_package

Predict on dev&test

In [None]:
%%writefile models/eval_structure_predictor.sh
# usage:
# $ cd models 
# $ sh eval_structure_predictor.sh {bert|elmo} result_directory

# The result directory (second argument) is where the model archive is read from and the
# predictions are written to; fail early instead of building paths with an empty component.
if [ -z "${1}" ] || [ -z "${2}" ]; then
    echo "usage: sh eval_structure_predictor.sh {bert|elmo} result_directory" >&2
    exit 1
fi

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="structure_cf_dev.tsv"
export TEST_FILE_PATH="structure_cf_test.tsv"

# Predictions for the dev set: one JSON object per input row.
allennlp predict --use-dataset-reader --silent \
    --output-file "structure_predictor_bimpm/${RESULT_DIR}/predictions_dev.json" \
    "structure_predictor_bimpm/${RESULT_DIR}/model.tar.gz" "structure_predictor_bimpm/${DEV_FILE_PATH}" \
    --include-package bimpm_custom_package \
    --predictor custom_bimpm_predictor

# Predictions for the test set.
allennlp predict --use-dataset-reader --silent \
    --output-file "structure_predictor_bimpm/${RESULT_DIR}/predictions_test.json" \
    "structure_predictor_bimpm/${RESULT_DIR}/model.tar.gz" "structure_predictor_bimpm/${TEST_FILE_PATH}" \
    --include-package bimpm_custom_package \
    --predictor custom_bimpm_predictor

#### Option 2. Using wandb for parameters adjustment

In [None]:
%%writefile models/wandb_structure_predictor.yaml
# usage:
# $ cd models
# wandb sweep wandb_structure_predictor.yaml

name: structure_predictor_stacked
program: wandb_allennlp # this is a wrapper console script around allennlp commands. It is part of wandb-allennlp
method: bayes
## Do not forget to use the command keyword to specify the following command structure
command:
  - ${program} #omit the interpreter as we use allennlp train command directly
  - "--subcommand=train"
  - "--include-package=bimpm_custom_package" # add all packages containing your registered classes here
  - "--config_file=structure_predictor_bimpm/config_elmo.json"
  - ${args}
# Metric the sweep optimises.
metric:
    name: best_f1
    goal: maximize
# Overrides for keys of the config file. Every list below holds a single value, so this sweep
# pins one configuration; add more values to a list to let the sweep explore them.
parameters:
    model.type:
        values: ["custom_bimpm",]
    iterator.batch_size:
        values: [20,]
    model.encode_together:
        values: ["false",]
    trainer.optimizer.lr:
        values: [0.001,]
    model.dropout:
        values: [0.5]


3. Run training

``wandb sweep wandb_structure_predictor.yaml``

(returns %sweepname)

``wandb agent --count 1 %sweepname``

Move the best model into structure_predictor_bimpm

In [None]:
! cp -r models/wandb/run-20200720_203050-84hl3zwy/training_dumps models/structure_predictor_bimpm/snowy-sweep-2

In [None]:
! mv models/wandb/run-20200929_034343-5tmisocu models/structure_predictor_bimpm/colorful-sweep-1

###  Evaluate classifier

In [None]:
def load_predictions(path):
    """Read the predicted labels from a JSON-lines predictions file.

    Every non-blank line of ``path`` must be a JSON object with a "label" entry that
    ``int()`` accepts. Blank lines (e.g. a trailing empty line) are ignored.
    Prints the number of labels read and returns them as a list of ints.
    """
    # Imported here so the function does not depend on a later notebook cell importing json.
    import json

    with open(path, 'r') as file:
        # Iterate the file lazily instead of materialising all lines first.
        result = [int(json.loads(line)["label"]) for line in file if line.strip()]

    print('length of result:', len(result))
    return result

In [None]:
! cp ../../../maintenance_rst/models/structure_predictor_bimpm/colorful-sweep-1-dumps/*.json models/structure_predictor_bimpm/colorful-sweep-1-dumps/

In [None]:
RESULT_DIR = 'colorful-sweep-1-dumps'

On dev set

In [None]:
import pandas as pd
import json

# Gold labels: the first column of the exported dev TSV (the file has no header row).
true = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)[0].values.tolist()
# Predicted labels produced by the evaluation script for the run in RESULT_DIR.
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')
print('length of true labels:', len(true))

In [None]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# The gold labels are cut to the number of predictions so that both sequences have equal length.
# NOTE(review): if the two printed lengths differ, rows were dropped somewhere in between and the
# remaining pairs may be misaligned rather than merely fewer — check before trusting the scores.
print('f1: %.2f'%(f1_score(true[:len(pred)], pred)*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred)*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred)*100))

print(classification_report(true[:len(pred)], pred, digits=4))

On test set

In [None]:
import pandas as pd
import json

# Gold labels: the first column of the exported test TSV (the file has no header row).
true = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)[0].values.tolist()
# Predicted labels produced by the evaluation script for the run in RESULT_DIR.
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')
print('length of true labels:', len(true))

In [None]:
# Import every metric used below, so that this cell does not depend on the dev-set
# metrics cell having been executed first.
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# The gold labels are cut to the number of predictions so that both sequences have equal length.
print('f1: %.2f'%(f1_score(true[:len(pred)], pred)*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred)*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred)*100))

print(classification_report(true[:len(pred)], pred, digits=4))

### Ensemble 

In [None]:
# Class label assigned to each position of a probability vector,
# for the neural model and for the scikit-style baseline respectively.
model_vocab = [0, 1]
catboost_vocab = [0, 1]

def load_neural_predictions(path):
    """Read per-class probabilities from a JSON-lines predictions file.

    Every line of ``path`` must be a JSON object with a 'probs' list. Returns one dict per
    line that maps each label of ``model_vocab`` to the probability at the same position.
    """
    with open(path, 'r') as file:
        per_line_probs = [json.loads(line)['probs'] for line in file.readlines()]

    return [{label: probs[position] for position, label in enumerate(model_vocab)}
            for probs in per_line_probs]

def load_scikit_predictions(model, X):
    """Return per-class probabilities of ``model`` for the rows of ``X``.

    ``model.predict_proba`` is tried first; on AttributeError the private
    ``model._predict_proba_lr`` is called instead. Each row of the result becomes a dict
    mapping the labels of ``catboost_vocab`` to the value at the same position.
    """
    try:
        predictions = model.predict_proba(X)
    except AttributeError:
        predictions = model._predict_proba_lr(X)

    return [{label: row[position] for position, label in enumerate(catboost_vocab)}
            for row in predictions]

def vote_predictions(pred1, pred2, soft=True):
    """Combine two equally long lists of ``{label: score}`` dicts sample by sample.

    With ``soft=True`` the two scores of a label are added; otherwise the larger one is kept.
    The labels are taken from the dicts in ``pred1``; both lists must have the same length.
    """
    assert len(pred1) == len(pred2)

    def combine(first_score, second_score):
        return first_score + second_score if soft else max(first_score, second_score)

    return [{key: combine(first[key], second[key]) for key in first.keys()}
            for first, second in zip(pred1, pred2)]

def probs_to_classes(pred):
    """Pick the best-scoring label for every sample.

    ``pred`` is an iterable of ``{label: score}`` dicts. For each one the label with the
    highest score is returned; on ties the label that comes first in the dict wins.
    Scores that are all zero or negative still yield a real label (previously such a
    sample produced the placeholder ``''``). Only an empty dict maps to ``''``.
    """
    return [max(sample, key=sample.get) if sample else '' for sample in pred]

In [None]:
import pickle


def _load_pickle(path):
    """Unpickle one object from ``path`` and close the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code — only load files you trust.
model = _load_pickle('models/structure_predictor_baseline/model.pkl')
scaler = _load_pickle('models/structure_predictor_baseline/scaler.pkl')
drop_columns = _load_pickle('models/structure_predictor_baseline/drop_columns.pkl')

In [None]:
IN_PATH = 'data_structure'

# Reload the samples from disk: the export cell above replaced these variables with
# reset_index() versions that carry an extra index column.
train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

# Target: the 'relation' column (kept as a one-column frame).
# Features: everything else except the columns listed in drop_columns and 'category_id'.
y_train, X_train = train_samples['relation'].to_frame(), train_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])
y_dev, X_dev = dev_samples['relation'].to_frame(), dev_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])
y_test, X_test = test_samples['relation'].to_frame(), test_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])

In [None]:
# Apply the loaded scaler (transform only, no re-fitting) to the dev and test features.
# The rebuilt frames keep the original row index; no column names are passed, so the
# columns get default integer labels.
X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(X_scaled_np, index=X_dev.index)

X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_np, index=X_test.index)

In [None]:
from sklearn import metrics


# NOTE(review): TARGET is not used by the ensemble code visible in this notebook.
TARGET = 'relation'
# Per-class probabilities of the baseline model and of the neural model for the dev set.
svm_predictions = load_scikit_predictions(model, X_dev)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

# Soft voting: the scores of both models are added per class, then the best class is taken.
tmp = vote_predictions(neural_predictions, svm_predictions, soft=True)
ensemble_pred = probs_to_classes(tmp)

print('f1: %.2f'%(metrics.f1_score(y_dev, ensemble_pred)*100.))
print('pr: %.2f'%(metrics.precision_score(y_dev, ensemble_pred)*100.))
print('re: %.2f'%(metrics.recall_score(y_dev, ensemble_pred)*100.))
print()
print(metrics.classification_report(y_dev, ensemble_pred, digits=4))

In [None]:
# Same ensemble evaluation as for the dev set, now on the test set.
svm_predictions = load_scikit_predictions(model, X_test)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

# Soft voting: the scores of both models are added per class, then the best class is taken.
tmp = vote_predictions(neural_predictions, svm_predictions, soft=True)
ensemble_pred = probs_to_classes(tmp)

print('f1: %.2f'%(metrics.f1_score(y_test, ensemble_pred)*100.))
print('pr: %.2f'%(metrics.precision_score(y_test, ensemble_pred)*100.))
print('re: %.2f'%(metrics.recall_score(y_test, ensemble_pred)*100.))
print()
print(metrics.classification_report(y_test, ensemble_pred, digits=4))