In [1]:
!pip install -q errant
!pip install -q spacy
!pip install -q stanza

In [2]:
import subprocess
import sys
import spacy
import stanza
import os
import re
from pathlib import Path

#### Path definitions

In [3]:
# Root folder with the input data and the folder where results are written
# (the current working directory of the kernel).
input_folder_path = Path('/kaggle/input')
output_folder_path = Path(os.getcwd())

# Reference annotations in M2 format and the folder with model generations.
validation_m2_path = input_folder_path.joinpath('valid-target', 'valid.m2')
generated_text_folder_path = input_folder_path.joinpath('generated')

#### Evaluator class with all helper methods

In [4]:
class EvaluateErrant:
    """Evaluate a generated-text file against a reference M2 file with ERRANT.

    The pipeline implemented by :meth:`evaluate` is:

    1. tokenize the generated text with the stanza Ukrainian tokenizer;
    2. extract the source sentences (``S `` lines) from the reference M2 file,
       cropped to the blocks that the generated text covers;
    3. run ``errant_parallel`` to produce an M2 file for the generated text;
    4. crop both M2 files at the same block header so they are comparable;
    5. run ``errant_compare`` twice (default settings and with ``-ds``).

    Block headers are lines whose content starts with ``#`` and contains a
    numeric id. Ids are compared by numeric value, never as raw strings.
    """

    def __init__(self,
                 generated_text_path: Path,
                 valid_m2_path: Path,
                 output_folder_path: Path,
                 last_id = None
                ):
        """Store the paths and build the tokenizer.

        Args:
            generated_text_path: Plain-text file with the model output.
            valid_m2_path: Reference annotations in M2 format.
            output_folder_path: Folder that receives every intermediate file.
            last_id: Optional upper bound for the block id used for cropping
                (typically the value returned by a previous ``evaluate()``).
        """
        self.last_id = last_id
        self.generated_text_path = generated_text_path
        self.valid_m2_path = valid_m2_path
        self.tokenized_generated_path = output_folder_path / "generated.tok"
        self.tokenized_valid_path = output_folder_path / "valid.tok"
        self.validation_m2_cropped_path = output_folder_path / "valid_cropped.m2"
        self.generated_m2_cropped_path = output_folder_path / "generated_updated.m2"
        self.generated_m2_path = output_folder_path / "generated.m2"
        self.nlp = stanza.Pipeline(lang="uk", processors="tokenize")

    @staticmethod
    def _parse_id(text):
        """Return the first run of digits in ``text`` as an int, or None if absent."""
        if text is None:
            return None
        match = re.search(r'\d+', str(text))
        return int(match.group()) if match else None

    @classmethod
    def _smaller_id(cls, first, second):
        """Return whichever id is numerically smaller, in its original form.

        An id that is missing or has no digits is ignored in favour of the
        other one. Comparing the numeric values avoids the lexicographic trap
        of plain string comparison (``'100' < '92'``).
        """
        first_value = cls._parse_id(first)
        second_value = cls._parse_id(second)
        if first_value is None:
            return second
        if second_value is None:
            return first
        return first if first_value <= second_value else second

    @staticmethod
    def _last_header_id(headers, source) -> str:
        """Return the digits of the last header in ``headers``.

        Raises:
            ValueError: if there is no header or the last one has no digits.
        """
        if not headers:
            raise ValueError(f"No '#' block header found in {source}")
        match = re.search(r'\d+', headers[-1])
        if match is None:
            raise ValueError(f"Last block header in {source} has no numeric id: {headers[-1]!r}")
        return match.group()

    def _tokenize(self,
                  text: str) -> str:
        """Tokenize ``text`` and return the tokens joined by single spaces."""
        tokenized = " ".join([t.text for t in self.nlp(text).iter_tokens()])
        return tokenized

    def _tokenize_file(self,
                       input_file: Path,
                       output_file: Path):
        """Tokenize ``input_file`` line by line into ``output_file``."""
        with open(input_file, encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as out:
            for line in f:
                line = line.rstrip("\n")
                tokenized = self._tokenize(line)
                out.write(tokenized + "\n")

    def _tokenize_generated(self):
        """Tokenize the generated text file, logging progress to stderr."""
        print("Tokenizing submission...", file=sys.stderr)
        self._tokenize_file(self.generated_text_path, self.tokenized_generated_path)
        print(f"Tokenized: {self.tokenized_generated_path}", file=sys.stderr)

    def _get_last_id_from_tokenized(self):
        """Return the id (digit string) of the last ``#`` header in the tokenized generated file."""
        generated_ids = []
        with open(self.tokenized_generated_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith('#'):
                    generated_ids.append(line)

        return self._last_header_id(generated_ids, self.tokenized_generated_path)

    def _generate_croped_tokenized_valid_from_m2(self,
                                                 include_last_block = True):
        """Write the ``S `` lines of the reference M2 file (prefix removed) to the tokenized valid file.

        Copying stops at the first block header that follows the block whose
        id equals ``self.last_id``.

        Args:
            include_last_block: When True the ``last_id`` block itself is
                written; when False copying stops right before its header.
        """
        target_id = self._parse_id(self.last_id)
        stop = False
        with open(self.valid_m2_path, encoding="utf-8") as f, open(self.tokenized_valid_path, "w", encoding="utf-8") as out:
            for line in f:
                if line.startswith("S "):
                    line = line[2:]
                    if line.startswith('#'):
                        if stop:
                            break
                        # Exact numeric match: id 2 must not match header "# 12".
                        if target_id is not None and self._parse_id(line) == target_id:
                            if not include_last_block:
                                break
                            stop = True
                    out.write(line)

    def _get_last_id_from_m2(self):
        """Return the id (digit string) of the last ``#`` header among the ``S `` lines of the generated M2 file."""
        generated_ids = []
        with open(self.generated_m2_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("S "):
                    line = line[2:]
                    if line.startswith('#'):
                        generated_ids.append(line)
        return self._last_header_id(generated_ids, self.generated_m2_path)

    def _update_m2_file(self, input_path, output_path):
        """Copy ``input_path`` to ``output_path`` up to, but excluding, the ``last_id`` block header.

        If ``self.last_id`` is unset or never found, the whole file is copied.
        """
        target_id = self._parse_id(self.last_id)
        with open(input_path, encoding="utf-8") as f, open(output_path, "w", encoding="utf-8") as out:
            for line in f:
                header = line[2:]
                if header.startswith('#'):
                    # Exact numeric match instead of a substring test on the raw line.
                    if target_id is not None and self._parse_id(header) == target_id:
                        break
                out.write(line)

    def _print_m2_content_length(self, input_path):
        """Print the number of ``S `` lines and of ``#`` block headers in an M2 file."""
        content_len = 0
        block_count = 0
        with open(input_path, encoding="utf-8") as f:
            for line in f:
                if line.startswith("S "):
                    content_len += 1
                    line = line[2:]
                    if line.startswith('#'):
                        block_count += 1
        print(f'Content length: {content_len}')
        print(f'Blocks count: {block_count}')

    def evaluate(self):
        """Run the full evaluation and return the block id used for cropping.

        Returns:
            The ``last_id`` finally used to crop both M2 files; pass it to
            another evaluator to score a different generation on the same blocks.

        Raises:
            ValueError: if no block header with a numeric id can be found.
            subprocess.CalledProcessError: if an ERRANT command fails.
        """
        # tokenize generated text
        self._tokenize_generated()

        # get validation tokenized from m2 (cropped by the ids from generated text)
        tokenized_last_id = self._get_last_id_from_tokenized()
        self.last_id = self._smaller_id(tokenized_last_id, self.last_id)
        self._generate_croped_tokenized_valid_from_m2(include_last_block=True)

        # generate m2 errant file for generated text
        subprocess.run(["errant_parallel", "-orig", self.tokenized_valid_path, "-cor", self.tokenized_generated_path, "-out", self.generated_m2_path], check=True)
        print(f"Aligned submission: {self.generated_m2_path}", file=sys.stderr)

        # crop m2 files to have the same length
        last_id = self._get_last_id_from_m2()
        self.last_id = self._smaller_id(last_id, self.last_id)
        self._update_m2_file(self.valid_m2_path, self.validation_m2_cropped_path)
        self._update_m2_file(self.generated_m2_path, self.generated_m2_cropped_path)

        # check the length and blocks count
        print('-- Target --')
        self._print_m2_content_length(self.validation_m2_cropped_path)
        print('\n-- Generated --')
        self._print_m2_content_length(self.generated_m2_cropped_path)

        # evaluate: default scoring, then with the `-ds` flag
        # NOTE(review): `-ds` is presumably ERRANT's span-based detection mode - confirm against the ERRANT CLI docs.
        # check=True makes a failing comparison visible, consistent with the errant_parallel call above.
        subprocess.run(["errant_compare", "-hyp", self.generated_m2_cropped_path, "-ref", self.validation_m2_cropped_path], check=True)
        subprocess.run(["errant_compare", "-hyp", self.generated_m2_cropped_path, "-ref", self.validation_m2_cropped_path, "-ds"], check=True)

        return self.last_id
        

### Evaluate raw model

In [5]:
# Score the raw model output; all intermediate files go to ./raw_model.
generated_text_raw_path = generated_text_folder_path.joinpath('raw.txt')
output_raw = output_folder_path.joinpath('raw_model')
output_raw.mkdir(parents=True, exist_ok=True)

evaluator = EvaluateErrant(generated_text_raw_path, validation_m2_path, output_raw)

# Keep the block id used for cropping so later runs can be scored on the same blocks.
last_id = evaluator.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.8.0/models/tokenize/iu.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.8.0/models/mwt/iu.pt:   0%|          | 0.0…

Tokenizing submission...
Tokenized: /kaggle/working/raw_model/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/raw_model/generated.m2


-- Target --
Content length: 1493
Blocks count: 92

-- Generated --
Content length: 1493
Blocks count: 92

TP	FP	FN	Prec	Rec	F0.5
38	8127	1129	0.0047	0.0326	0.0056


TP	FP	FN	Prec	Rec	F0.5
333	7826	1049	0.0408	0.241	0.0489



### Evaluate LoRA fine-tuned model on 500 test samples with configuration r=4

In [6]:
# Score the r=4 / 500-sample model, bounded by the block id from the raw-model run.
generated_text_tuned_path = generated_text_folder_path.joinpath('fine-tuned-r4-500.txt')
output_tuned = output_folder_path.joinpath('fine-tuned-r4-500')
output_tuned.mkdir(parents=True, exist_ok=True)

evaluator2 = EvaluateErrant(
    generated_text_tuned_path,
    validation_m2_path,
    output_tuned,
    last_id=last_id,
)

# The returned id is not needed here; assigning to `_` suppresses the cell's Out[] display.
_ = evaluator2.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Tokenizing submission...
Tokenized: /kaggle/working/fine-tuned-r4-500/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/fine-tuned-r4-500/generated.m2


-- Target --
Content length: 1493
Blocks count: 92

-- Generated --
Content length: 1493
Blocks count: 92

TP	FP	FN	Prec	Rec	F0.5
52	8024	1101	0.0064	0.0451	0.0078


TP	FP	FN	Prec	Rec	F0.5
351	7724	1033	0.0435	0.2536	0.0521



### Evaluate LoRA fine-tuned model on 500 test samples with configuration r=8

In [7]:
# Score the r=8 / 500-sample model, bounded by the block id from the raw-model run.
generated_text_tuned_path = generated_text_folder_path.joinpath('fine-tuned-r8-500.txt')
output_tuned = output_folder_path.joinpath('fine-tuned-r8-500')
output_tuned.mkdir(parents=True, exist_ok=True)

evaluator3 = EvaluateErrant(
    generated_text_tuned_path,
    validation_m2_path,
    output_tuned,
    last_id=last_id,
)

# The returned id is not needed here; assigning to `_` suppresses the cell's Out[] display.
_ = evaluator3.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Tokenizing submission...
Tokenized: /kaggle/working/fine-tuned-r8-500/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/fine-tuned-r8-500/generated.m2


-- Target --
Content length: 1493
Blocks count: 92

-- Generated --
Content length: 1493
Blocks count: 92

TP	FP	FN	Prec	Rec	F0.5
40	8404	1119	0.0047	0.0345	0.0057


TP	FP	FN	Prec	Rec	F0.5
358	8082	1046	0.0424	0.255	0.0509



### Evaluate LoRA fine-tuned model on 1500 test samples with configuration r=4

In [8]:
# Score the r=4 / 1500-sample model, bounded by the block id from the raw-model run.
generated_text_tuned_path = generated_text_folder_path.joinpath('fine-tuned-r4-1500.txt')
output_tuned = output_folder_path.joinpath('fine-tuned-r4-1500')
output_tuned.mkdir(parents=True, exist_ok=True)

evaluator4 = EvaluateErrant(
    generated_text_tuned_path,
    validation_m2_path,
    output_tuned,
    last_id=last_id,
)

# The returned id is not needed here; assigning to `_` suppresses the cell's Out[] display.
_ = evaluator4.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Tokenizing submission...
Tokenized: /kaggle/working/fine-tuned-r4-1500/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/fine-tuned-r4-1500/generated.m2


-- Target --
Content length: 1493
Blocks count: 92

-- Generated --
Content length: 1493
Blocks count: 92

TP	FP	FN	Prec	Rec	F0.5
33	8305	1129	0.004	0.0284	0.0048


TP	FP	FN	Prec	Rec	F0.5
328	8010	1003	0.0393	0.2464	0.0473



### Evaluate LoRA fine-tuned model on 1000 test samples with configuration r=8

In [9]:
# Score the r=8 / 1000-sample model, bounded by the block id from the raw-model run.
generated_text_tuned_path = generated_text_folder_path.joinpath('fine-tuned-r8-1000.txt')
output_tuned = output_folder_path.joinpath('fine-tuned-r8-1000')
output_tuned.mkdir(parents=True, exist_ok=True)

evaluator5 = EvaluateErrant(
    generated_text_tuned_path,
    validation_m2_path,
    output_tuned,
    last_id=last_id,
)

# The returned id is not needed here; assigning to `_` suppresses the cell's Out[] display.
_ = evaluator5.evaluate()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

Tokenizing submission...
Tokenized: /kaggle/working/fine-tuned-r8-1000/generated.tok


Loading resources...
Processing parallel files...


Aligned submission: /kaggle/working/fine-tuned-r8-1000/generated.m2


-- Target --
Content length: 1493
Blocks count: 92

-- Generated --
Content length: 1493
Blocks count: 92

TP	FP	FN	Prec	Rec	F0.5
39	7899	1148	0.0049	0.0329	0.0059


TP	FP	FN	Prec	Rec	F0.5
325	7613	1036	0.0409	0.2388	0.0491

