In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from errudite/pandas.
warnings.filterwarnings('ignore')

def import_sys():
    """Make the parent directory importable from this notebook.

    Adds ``'..'`` to ``sys.path`` if it is not already present, so that
    re-running the cell does not keep appending duplicate entries. ``sys`` is
    imported locally to keep it out of the notebook's global namespace.
    """
    import sys
    parent_dir = '..'
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)
import_sys()

import logging
# Configure the root logger at INFO level so INFO records (e.g. the
# "Reading instances ..." messages) appear in the cell output.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger used by the reader classes defined below.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

In [3]:
import errudite
# Sanity check: printing the module shows which errudite installation
# (file location) was actually imported.
print(errudite)

<module 'errudite' from '/Users/gyuhoshim/errudite/errudite/__init__.py'>


In [4]:
import pandas as pd
from overrides import overrides
from errudite.io import DatasetReader
from errudite.utils import normalize_file_path, accuracy_score
from errudite.targets.instance import Instance
from errudite.targets.target import Target
from errudite.targets.label import Label, PredefinedLabel



## 1. DatasetReader

## Define a `DatasetReader`.
---

A ``DatasetReader`` ([documentation here](https://errudite.readthedocs.io/en/latest/api/_extensible_dataset_reader.html)) loads the raw data from a data file, preprocesses the data to include linguistic features, and saves the processed data to a cache folder. All the task-specific readers are *registered* under the base class ``DatasetReader``, so they can be queried via their names:

```python
DatasetReader.by_name("dataset_name")
```

Because the reader also handles dumping the processed instances into cache files, we require you to provide a desired cache path. If not provided, the default path is `./caches/`.

Instead of fetching a pre-implemented reader (such as the one for SNLI), we define and register our own readers for the student essay data in the following cells:

In [5]:
@DatasetReader.register("STE")
class STEReader(DatasetReader):
    """Dataset reader for a tab-separated student essay file.

    Registered under the name ``"STE"`` so it can be retrieved with
    ``DatasetReader.by_name("STE")``.
    """

    def __init__(self, cache_folder_path: str=None) -> None:
        super().__init__(cache_folder_path)
        # overwrite the primary evaluation method and metric name
        Label.set_task_evaluator(accuracy_score, 'accuracy')

    @overrides
    def _read(self, file_path: str, lazy: bool=False, sample_size: int=None):
        """
        Read the rows of the tab-separated file at ``file_path``.

        Parameters
        ----------
        file_path : str
            The path of the input data file.
        lazy : bool, optional
            If ``lazy==True``, no instances are built; only the raw
            ``Essay_ID`` and ``Essay`` column values are collected.
            By default False
        sample_size : int, optional
            If set (and non-zero), at most this many rows are read,
            by default None

        Returns
        -------
        Dict[str, list] or List[Instance]
            In lazy mode, ``{"Essay_ID": [...], "Essay": [...]}``; otherwise
            the list of instances built by ``_text_to_instance``.
        """
        logger.info("Reading instances from lines in file at: %s", file_path)
        df = pd.read_csv(normalize_file_path(file_path), sep='\t')
        # Accumulators must exist before the loop so that an empty file still
        # returns well-formed (empty) results.
        essay_ids, essays, instances = [], [], []
        # Count rows by position rather than by index label, so the limit is
        # exact regardless of how the frame is indexed.
        for position, (idx, row) in enumerate(df.iterrows()):
            if sample_size and position >= sample_size:
                break
            if lazy:
                essay_ids.append(row['Essay_ID'])
                essays.append(row['Essay'])
            else:
                instance = self._text_to_instance(f'q:{idx}', row)
                if instance is not None:
                    instances.append(instance)
        if lazy:
            return {"Essay_ID": essay_ids, "Essay": essays}
        return instances

    @overrides
    def _text_to_instance(self, id: str, row) -> Instance:
        """
        Convert one row of the data file into an ``Instance``.

        Parameters
        ----------
        id : str
            Identifier generated by ``_read``; currently unused because the
            instance is keyed by ``row['pairID']``.
        row
            A row yielded by ``DataFrame.iterrows()``.

        Returns
        -------
        Instance
            The instance returned by ``self.create_instance``.
        """
        # NOTE(review): 'pairID', 'gold_label' and 'label1'..'label5' are
        # SNLI-style column names, while 'Essay_ID'/'Essay' are essay columns.
        # Confirm the STE file really provides all of them; a missing column
        # raises KeyError here.
        premise = Target(qid=row['pairID'], text=row['Essay_ID'], vid=0, metas={'type': 'Essay_ID'})
        hypothesis = Target(qid=row['pairID'], text=row['Essay'], vid=0, metas={'type': 'Essay'})
        # label: keep the individual annotator labels alongside the gold label
        raw_labels = [row[f'label{i}'] for i in range(1, 6)]
        groundtruth = PredefinedLabel(
            model='groundtruth',
            qid=row['pairID'],
            text=row['gold_label'],
            vid=0,
            metas={'raw_labels': raw_labels}
        )
        return self.create_instance(row['pairID'],
            hypothesis=hypothesis,
            premise=premise,
            groundtruth=groundtruth)

In [6]:
import pandas as pd

from overrides import overrides
from errudite.io import DatasetReader
from errudite.targets.instance import Instance
from errudite.targets.target import Target
from errudite.targets.label import Label

@DatasetReader.register("student_essays")
class StudentEssaysReader(DatasetReader):
    """Dataset reader for student essays stored in an Excel sheet.

    Registered under the name ``"student_essays"``. The sheet is expected to
    contain the columns ``Essay_ID`` and ``Essay``, which are the only columns
    this reader accesses.
    """

    def __init__(self, cache_folder_path: str = None):
        super().__init__(cache_folder_path)

    @overrides
    def _read(self, file_path, lazy=False, sample_size=None):
        """
        Read essays from the Excel file at ``file_path``.

        Parameters
        ----------
        file_path : str
            Path of the Excel file to load.
        lazy : bool, optional
            If True, return only the raw essay texts without building
            instances, by default False
        sample_size : int, optional
            If set (and non-zero), at most this many rows are read,
            by default None

        Returns
        -------
        Dict[str, list] or List[Instance]
            ``{"essays": [...]}`` in lazy mode, otherwise the list of
            instances built by ``_text_to_instance``.
        """
        # Load the dataset from the provided file_path (assuming it's in Excel format)
        df = pd.read_excel(file_path)

        # Both accumulators are created up front so an empty sheet returns an
        # empty result instead of failing on an unbound name.
        instances = []
        essays = []

        # Limit by row position, checked before processing, so exactly
        # `sample_size` rows are consumed in both lazy and non-lazy mode.
        for position, (idx, row) in enumerate(df.iterrows()):
            if sample_size and position >= sample_size:
                break
            if lazy:
                # Accumulate every essay instead of overwriting the list.
                essays.append(row['Essay'])
            else:
                instance = self._text_to_instance(f'essay_{idx}', row)
                if instance is not None:
                    instances.append(instance)

        if lazy:
            return {"essays": essays}
        return instances

    @overrides
    def _text_to_instance(self, id: str, row) -> Instance:
        """
        Convert one row of the sheet into an ``Instance``.

        Parameters
        ----------
        id : str
            Identifier generated by ``_read``; currently unused because the
            instance is keyed by ``row['Essay_ID']``.
        row
            A row yielded by ``DataFrame.iterrows()``.

        Returns
        -------
        Instance
            The instance returned by ``self.create_instance``.
        """
        essay = Target(qid=row['Essay_ID'], text=row['Essay'], vid=0)
        return self.create_instance(row['Essay_ID'], essay=essay)


In [7]:
# The reader registered above can be looked up by its name. `by_name` returns
# the reader class, which is then instantiated with the cache folder.

# DatasetReader is imported from errudite.io in an earlier cell.

cache_folder_path = "./data/ste_cache/"
reader=DatasetReader.by_name("student_essays")(cache_folder_path=cache_folder_path)
# Read a sample of the essays from the Excel sheet.
instances=reader.read("./StudentEssays.xlsx", sample_size= 70)

INFO:errudite.utils.file_utils:Errudite cache folder selected: ./data/ste_cache/
INFO:errudite.io.dataset_reader:Reading instances from lines in file at: ./StudentEssays.xlsx


In [8]:
# Display the first loaded instance as a quick check that reading worked.
instances[0]

Instance [InstanceKey(qid='L24-RCFinal proposal, L3b8-23_RCFinal proposal, & L3b8-08_RCFinal proposal', vid=0)]

In [76]:
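# NOTE: `DatasetReader.by_name(...)` returns the reader *class*, not an instance.
# Calling `.read(...)` directly on that class raises
# "TypeError: read() missing 1 required positional argument: 'self'".
# Instantiate the class first, then call `read` on the instance:

# from errudite.io import DatasetReader

# # Create an instance of your custom dataset reader
# reader = DatasetReader.by_name("student_essays")(cache_folder_path="./data/ste_cache/")

# # Use the read method on the reader instance
# instances = reader.read(file_path="./StudentEssays.xlsx", lazy=False, sample_size=None)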


TypeError: read() missing 1 required positional argument: 'self'

## 2. Predictor

In [47]:
from typing import List, Dict
#from errudite.io import Predictor
#from allennlp.predictors import Predictor
from errudite.predictors import Predictor
from errudite.targets.label import Label, PredefinedLabel
from errudite.targets.instance import Instance
from errudite.predictors.predictor_allennlp import PredictorAllennlp # a wrapper for Allennlp classes

In [50]:
@Predictor.register("nli_decompose_att")
class PredictorNLI(PredictorAllennlp,Predictor):
    """
    The wrapper for DecomposableAttention model, as implemented in Allennlp:
    https://allenai.github.io/allennlp-docs/api/allennlp.predictors.html#decomposable-attention
    """
    def __init__(self, name: str, 
        model_path: str=None,
        model_online_path: str=None,
        description: str='') -> None:
        """
        Parameters
        ----------
        name : str
            The name of the predictor.
        model_path : str, optional
            Local path of the model, forwarded to ``PredictorAllennlp``.
        model_online_path : str, optional
            Online path of the model, forwarded to ``PredictorAllennlp``.
        description : str, optional
            Free-text description of the predictor.
        """
        # PredictorAllennlp is responsible for loading the model. The former
        # second call, Predictor.__init__(self, name, description, model,
        # perform_metrics), referenced names that were never defined at that
        # point and therefore always raised, so it has been removed.
        PredictorAllennlp.__init__(self, name, model_path, model_online_path, description)
        # set the perform metrics
        # NOTE(review): assumes the base Predictor keeps its metric names in
        # `perform_metrics` - confirm against errudite.predictors.Predictor.
        self.perform_metrics = ['accuracy', 'confidence']
        # First, define the evaluation function to determine how well a model is doing 
        # on one instance, based on an individual predicted label.
        # An absolute import is required: a relative import has no parent
        # package when the class is defined inside a notebook.
        from errudite.utils import accuracy_score
        # Second, from the metrics above, pick one that's primary, and it will be used 
        # to compute `is_incorrect()` in any label target object: primary metric < 1.
        Label.set_task_evaluator(
            # the evaluation function that accepts pred and groundtruths, 
            # and return a dict of metrics: { metric_name: metric_score }. 
            # This is saved as Label.task_evaluation_func.
            task_evaluation_func=accuracy_score, 
            # The primary task metric name, ideally a key of task_evaluation_func ‘s return.
            task_primary_metric='accuracy')

    # the raw prediction function, returning the output of the model in a json format.
    def predict(self, premise: str, hypothesis: str) -> Dict[str, float]:
        """
        Run the model on one premise/hypothesis pair.

        Returns
        -------
        Dict[str, float]
            ``{'confidence': <highest label probability>,
            'text': <label name with that probability>}``.
        """
        # NOTE(review): assumes the model emits `label_probs` in this label
        # order - confirm against the loaded model.
        labels = ['entailment', 'contradiction', 'neutral']
        # NOTE(review): assumes the wrapped model is exposed as `self.model`;
        # verify the attribute name set by PredictorAllennlp.
        predicted = self.model.predict_json({
            "premise": premise, "hypothesis":hypothesis})
        label_probs = predicted['label_probs']
        # Index of the first maximal probability (same tie-breaking as argmax),
        # computed without requiring numpy.
        best_idx = max(range(len(label_probs)), key=lambda i: label_probs[i])
        return {
            'confidence': label_probs[best_idx],
            'text': labels[best_idx],
        }

    @classmethod
    # the class method that takes `Target` inputs, and output a `Label` object.
    def model_predict(cls, 
        predictor: Predictor, 
        premise: Target, 
        hypothesis: Target, 
        groundtruth: Label) -> 'Label':
        """
        Predict a label for a premise/hypothesis pair and score it.

        Parameters
        ----------
        predictor : Predictor
            The predictor to run; if falsy, no prediction is made.
        premise : Target
            The premise target.
        hypothesis : Target
            The hypothesis target.
        groundtruth : Label
            The groundtruth label used to compute the performance.

        Returns
        -------
        Label
            The predicted label, or None when there is no predictor or the
            predictor returns nothing.
        """
        answer = None
        if not predictor:
            return answer
        predicted = predictor.predict(premise.get_text(), hypothesis.get_text())
        if not predicted:
            return None
        answer = PredefinedLabel(
            model=predictor.name, 
            qid=premise.qid,
            text=predicted['text'], 
            # use the largest version id among the inputs
            vid=max([premise.vid, hypothesis.vid, groundtruth.vid] ))
        answer.compute_perform(groundtruths=groundtruth)
        answer.set_perform(confidence=predicted['confidence'])
        return answer



