# Generate a text file with corrected documents for the UA-GEC test set

In [1]:
# Clone the project repository; its requirements file and `src` package are used below.
!git clone https://github.com/Reennon/ua-gec-lora.git
# The `cd` here only applies to this single shell command; the kernel's working
# directory is changed later with `%cd`.
!cd ua-gec-lora && pip install -q -r requirements.txt
!pwd && ls -a
# Install additional libs
# NOTE(review): bitsandbytes, transformers, peft and accelerate are installed unpinned
# (latest / main branch), so a re-run may resolve different versions than the
# recorded outputs were produced with — consider pinning.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# trl is pinned to a specific commit.
!pip install -q git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
!pip install -q ua_gec
!pip install -q datasets==2.16.0
!pip install -q nltk
!pip install -q toolz
# CD into the project directory
%cd ua-gec-lora
# Bring the checkout up to date with the research branch and show the resulting state.
!git pull origin "feature/fine-tuning-research"
!git status

Cloning into 'ua-gec-lora'...
remote: Enumerating objects: 156, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 156 (delta 66), reused 69 (delta 12), pack-reused 0[K
Receiving objects: 100% (156/156), 450.00 KiB | 4.84 MiB/s, done.
Resolving deltas: 100% (66/66), done.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-nlp 0.8.1 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not insta

In [2]:
from transformers import AutoModelForCausalLM, pipeline, Conversation, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from src.packages.constants.error_constants import ErrorConstants
from ua_gec import Corpus
from langchain.prompts import PromptTemplate
from tqdm import tqdm
from difflib import SequenceMatcher
import torch
import nltk
import re
import toolz
import gc
import os
import re

nltk.download('punkt')  # Download the necessary resources for sentence tokenization

from nltk.tokenize import sent_tokenize

2024-03-25 10:35:17.867051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 10:35:17.867149: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 10:35:17.970078: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression so the notebook displays the selected device.
device

'cuda'

In [4]:
# Model id of the base model; also used below to load the tokenizer.
base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

#### Specify fine-tuned model path and output filename

In [5]:
# Adapter id that is passed to PeftModel.from_pretrained on top of the base model.
fine_tuned_model_name = 'rkovalchuk/mistral-7b-ua-gec'
# Used both as the output folder name and as the stem of the generated .txt file.
generated_file_name = 'fine-tuned-1500-samples'

#### Load fine-tuned model

In [6]:
# 4-bit NF4 quantization with double quantization enabled for the base model weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant= True,
)

# Load the quantized base model onto the current CUDA device.
# NOTE(review): `torch.cuda.current_device()` requires CUDA, so this cell does not
# honour the CPU fallback of `device` defined earlier.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16, 
    device_map={'':torch.cuda.current_device()},)

# Wrap the base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(base_model, fine_tuned_model_name)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

#### Load tokenizer

In [7]:
# The tokenizer comes from the base model; padding is applied on the left so that
# generated tokens directly follow the prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    padding_side='left',
    trust_remote_code=True)

# Fix padding token for Mistral and Phi-2 models
# NOTE(review): this only assigns the pad token string; whether "[PAD]" exists in the
# vocabulary is not visible here — confirm which id it resolves to.
tokenizer.pad_token = "[PAD]"

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

#### Set the maximum token length threshold

In [8]:
# Upper bound on the token budget; used as the default `threshold` of get_max_tokens_length.
max_tokens_length_threshold = 3400

#### Load prompt

In [9]:
# Instruction prompt wrapped in [INST] ... [/INST] markers.
# `{error_types}` and `{query}` are the two placeholders filled in per document.
template = """[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task.
Consider the provided set of error types ("ERROR_TYPES"):
{error_types}
When you identify an error ("ERROR") in the text, correct it according to the format:
("ERROR") => ("CORRECTION")
The correction should address the error without providing explicit reasoning for the change.
The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics.
Focus solely on correcting Ukrainian language errors.
Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions.

ORIGINAL_TEXT: {query}
FIXED_TEXT:
[/INST]"""

# Prompt template object used by generate_promt to render the final prompt string.
it_prompt = PromptTemplate(
    template=template,
    input_variables=['query', 'error_types']
)

### Define helper methods for text generation

In [10]:
def generate_promt(source):
    """Render the instruction prompt for a single source text.

    The `query` placeholder of `it_prompt` receives `source` and the
    `error_types` placeholder receives `ErrorConstants.ERROR_TYPES`.

    Args:
        source: Text to be corrected.

    Returns:
        The fully rendered prompt as a string.
    """
    rendered = it_prompt.format_prompt(
        error_types=ErrorConstants.ERROR_TYPES,
        query=source,
    )
    return rendered.to_string()

def get_max_tokens_length(
    prompt,
    original_document,
    threshold=max_tokens_length_threshold,
    print_info=False):
    """Compute the token budget for one document.

    The budget is the token count of the prompt plus the token count of the
    original document, capped at `threshold`.

    Args:
        prompt: Rendered prompt string.
        original_document: Source text of the document.
        threshold: Upper bound for the returned value.
        print_info: When true, print both token counts and the final budget.

    Returns:
        The capped token budget as an int.
    """
    prompt_len, source_len = (
        len(tokenizer.tokenize(text)) for text in (prompt, original_document)
    )
    max_tokens = min(threshold, prompt_len + source_len)
    if print_info:
        print(f"""
Prompt len: {prompt_len}
Source len: {source_len}
Max tokens: {max_tokens}
""")
    return max_tokens
    
def torch_helper():
    """Switch off the memory-efficient and flash scaled-dot-product attention backends.

    Workaround for torch CUDA errors encountered during generation.
    """
    backend_switches = (
        torch.backends.cuda.enable_mem_efficient_sdp,
        torch.backends.cuda.enable_flash_sdp,
    )
    for switch in backend_switches:
        switch(False)
    
def clear_gpu_memory():
    """Release Python garbage and then return cached CUDA memory to the driver.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed before the CUDA caching allocator is emptied;
    in the opposite order their blocks would stay cached until the next call.
    """
    gc.collect()
    torch.cuda.empty_cache()
    
def tokenized_response_to_text(response, prompt):
    """Turn generated token ids into newline-separated sentences.

    Only the first sequence of `response` is used. The decoded text is cut at
    `len(prompt)` characters to drop the echoed prompt, a space is inserted
    after every `.`, `!` or `?` that is not followed by a quote character, and
    the result is split into sentences joined by newlines.

    Args:
        response: Tensor of generated token ids (as returned by `model.generate`).
        prompt: Prompt string that was fed to the model.

    Returns:
        The generated text with one sentence per line.
    """
    token_ids = response.detach().cpu().numpy()
    first_sequence = tokenizer.batch_decode(token_ids, skip_special_tokens=True)[0]
    completion = first_sequence[len(prompt):]
    spaced = re.sub(r'(?<=[.!?])(?![\'"])', ' ', completion)
    sentences = sent_tokenize(spaced)
    return "\n".join(sentences)

### Define helper methods for visual comparison of the output

In [11]:
def highlight_changes(text1, text2):
    """Build a single ANSI-coloured string showing how `text2` differs from `text1`.

    Both texts are tokenized into words and individual punctuation marks and
    aligned with `difflib.SequenceMatcher`. Tokens are then emitted as follows:

    * unchanged tokens are copied as is;
    * deleted tokens are shown in bold red with a strike-through overlay;
    * inserted tokens are shown in bold green;
    * replaced spans show only the NEW tokens, in bold yellow (the tokens they
      replaced are not displayed).

    Args:
        text1: Original text.
        text2: Modified text.

    Returns:
        The tokens joined by single spaces, so the original spacing around
        punctuation is not preserved.
    """
    # Tokenize the texts into words and standalone punctuation marks
    words1 = re.findall(r'\w+|[^\w\s]', text1)
    words2 = re.findall(r'\w+|[^\w\s]', text2)

    matcher = SequenceMatcher(None, words1, words2)

    highlighted_text = []

    for op, start1, end1, start2, end2 in matcher.get_opcodes():
        if op == 'equal':
            # No change, just append the words as is
            highlighted_text.extend(words1[start1:end1])
        elif op == 'delete':
            # Word(s) removed: red, with U+0336 (combining long stroke overlay)
            # after every character to render a strike-through
            for word in words1[start1:end1]:
                word = '\u0336'.join(word) + '\u0336'
                highlighted_text.append('\033[91m\033[1m' + word + '\033[0m')
        elif op == 'insert':
            # Word(s) added, highlight with green
            for word in words2[start2:end2]:
                highlighted_text.append('\033[92m\033[1m' + word + '\033[0m')
        elif op == 'replace':
            # Word(s) replaced, highlight the replacement with yellow
            for word in words2[start2:end2]:
                highlighted_text.append('\033[93m\033[1m' + word + '\033[0m')

    return ' '.join(highlighted_text)

def generate_original_corrected_texts(original_text, corrected_text):
    """Colour the words that are unique to each of the two texts.

    Both texts are split on whitespace. A word of the original text that does
    not occur anywhere in the corrected text is wrapped in bold red; a word of
    the corrected text that does not occur anywhere in the original text is
    wrapped in bold green. The comparison is set-based, so word positions are
    ignored.

    Args:
        original_text: Text before correction.
        corrected_text: Text after correction.

    Returns:
        A `(marked_original, marked_corrected)` tuple of space-joined strings.
    """
    source_tokens = original_text.split()
    target_tokens = corrected_text.split()
    source_vocab = set(source_tokens)
    target_vocab = set(target_tokens)

    def mark(tokens, other_vocab, prefix):
        # Wrap every token missing from the other text in the given ANSI prefix.
        return ' '.join(
            token if token in other_vocab else prefix + token + '\033[0m'
            for token in tokens
        )

    marked_original = mark(source_tokens, target_vocab, '\033[91m\033[1m')
    marked_corrected = mark(target_tokens, source_vocab, '\033[92m\033[1m')
    return (marked_original, marked_corrected)


def print_text_comparison(original, generated):
    """Print three coloured views comparing the original and the generated text.

    Output order: the original text with words missing from the generated
    text marked, the generated text with new words marked, and the combined
    token-level diff produced by `highlight_changes`.

    Args:
        original: Source text.
        generated: Text produced by the model.
    """
    changes_view = highlight_changes(original, generated)
    marked_pair = generate_original_corrected_texts(
        original_text=original,
        corrected_text=generated,
    )
    original_view, corrected_view = marked_pair

    print(f"Original Text:\n{original_view}")
    print(f"\nCorrected Text:\n{corrected_view}")
    print(f"\nChanges comparison:\n{changes_view}")

### Load test set for UA-GEC and remove duplicates

In [12]:
# Materialize the "test" partition of the corpus using the "gec-only" annotation layer.
corpus_test_list = list(Corpus(partition="test", annotation_layer="gec-only"))

# remove duplicates
# Documents are de-duplicated by `doc_id`; only the first document seen for each id is kept.
corpus_test_list = list(toolz.unique(corpus_test_list, key=lambda x: x.doc_id))
print(f'Number of samples in test set: {len(corpus_test_list)}')

Number of samples in test set: 166


### Test model generation on one document

In [13]:
# Take the first test document as a smoke-test sample.
doc = corpus_test_list[0]
print(f'--- Document id: {doc.doc_id}')
# `source` is the document text that is sent to the model.
source = doc.source
prompt = generate_promt(source)
# NOTE(review): the label says "training", but this prompt is used for generation below.
print(f"\n--- Prompt for training:\n\n{prompt}")

--- Document id: 0002

--- Prompt for training:

[INST] Given a text ("ORIGINAL_TEXT") in Ukrainian with potential errors, correct them to fulfill the GEC (Grammar Error Correction) Task.
Consider the provided set of error types ("ERROR_TYPES"):
['Fluency', 'Grammar', 'Punctuation', 'Spelling']
When you identify an error ("ERROR") in the text, correct it according to the format:
("ERROR") => ("CORRECTION")
The correction should address the error without providing explicit reasoning for the change.
The resulting text ("FIXED_TEXT") should be error-free, maintaining the original information's semantics.
Focus solely on correcting Ukrainian language errors.
Ensure that the corrected text doesn't include original errors, additional text, comments, or parts of these instructions.

ORIGINAL_TEXT: Наступного ранку рівно о одинадцятій годині, коли я сидів сам, дядько Том шаштався в готелі і попросив у лікаря підійти і побачити Джанге Банк, хто, воно здавалось, це був майор і дуже хворий чолові

In [14]:
# Token budget for this document; also prints the prompt and source token counts.
max_length = get_max_tokens_length(prompt, source, print_info=True)


Prompt len: 538
Source len: 337
Max tokens: 875



In [15]:
# Tokenize the prompt into PyTorch tensors, padded (and, if longer, truncated)
# to exactly `max_length` tokens.
model_inputs = tokenizer(
    prompt, 
    max_length=max_length, 
    padding="max_length", 
    truncation=True, 
    return_tensors="pt"
)

In [16]:
# Put the model into evaluation mode before generating.
model = model.eval()

In [17]:
# Disable the flash / memory-efficient attention backends before generating.
torch_helper()

In [18]:
# Generate without tracking gradients; up to `max_length` new tokens may be produced
# on top of the (padded) prompt, and the EOS token id is used for padding.
with torch.no_grad():
    response = model.generate(
        input_ids=model_inputs["input_ids"].to(device),
        attention_mask=model_inputs["attention_mask"].to(device),
        max_new_tokens=max_length,
        pad_token_id=tokenizer.eos_token_id
    )

In [19]:
# Decode the generated ids, drop the echoed prompt and split into one sentence per line.
response_text = tokenized_response_to_text(response, prompt)
# Bare expression so the notebook displays the raw string.
response_text

'Наступного ранку рівно о одинадцятій годині, коли я сидів сам, дядько Том шаштався в готелі і попросив у лікаря підійти і побачити Джанге Банк, хто, воно здавалось, був майор і дуже хворий чоловік.\n—Я не є доктором, — сказав я.\n—Чому вам не піти до лікаря?\n— "Босс", — сказав він.\n—Доктор Хоскінс має проїхати 20 миль по країні, щоб побачити хворих персон.\nВін є єдиним лікарем у місті, і Масса банки сильно погано обурудували.\nВін відправив мені, щоб спитав, чи погоджується хоч прийти".\n—Як чоловік до чоловіка, — я сказав.'

In [20]:
# Show a coloured comparison of the source text and the model output.
print_text_comparison(source, response_text)

Original Text:
Наступного ранку рівно о одинадцятій годині, коли я сидів сам, дядько Том шаштався в готелі і попросив у лікаря підійти і побачити Джанге Банк, хто, воно здавалось, [91m[1mце[0m був майор і дуже хворий чоловік. [91m[1m"Я[0m не є [91m[1mдоктор"[0m [91m[1m-[0m сказав [91m[1mя:[0m [91m[1m"Чому[0m вам не піти до [91m[1mлікаря?".[0m [91m[1m"Босс"[0m [91m[1m-[0m сказав [91m[1mвін:[0m [91m[1m"Доктор[0m Хоскінс має проїхати 20 миль по країні, щоб побачити хворих персон. Він є [91m[1mєдиний[0m [91m[1mлікар[0m в [91m[1mмісті[0m і Масса банки сильно погано [91m[1mобурудувані.[0m Він відправив мені, щоб [91m[1mспитав[0m чи [91m[1mпогоджуйтеся[0m хоч прийти". [91m[1m"Як[0m чоловік до [91m[1mчоловіка"[0m [91m[1m-[0m я [91m[1mсказав:[0m [91m[1m"Я[0m [91m[1mбуду[0m [91m[1mйти[0m і [91m[1mпошукаю[0m [91m[1mйого[0m [91m[1mпізніше".[0m [91m[1mОтже,[0m [91m[1mЯ[0m [91m[1mпідняв[0m [91m[1mпляшку[0m 

## Create a text file with generated responses for the entire test set

In [21]:
def generate_response_for_test_set(
    model,
    tokenizer,
    output_file_path,
    test_data,
    ):
    """Generate a correction for every document and append it to a text file.

    For each document the prompt is rendered, tokenized to the per-document
    token budget, passed to `model.generate`, and the decoded result is
    appended to `output_file_path` as a `# <doc_id>` header line followed by
    the generated text.

    Args:
        model: Model exposing `generate`.
        tokenizer: Tokenizer used to encode the prompt; its `eos_token_id`
            is used as the padding id during generation.
        output_file_path: Path of the text file the results are appended to.
        test_data: Iterable of documents exposing `doc_id` and `source`.

    Returns:
        None. Results are written to `output_file_path`.
    """
    # The attention backend switches are global, so set them once up front
    # instead of once per document.
    torch_helper()

    for doc in tqdm(test_data):
        id_str = doc.doc_id
        source = doc.source

        # tokenize input
        prompt = generate_promt(source)
        max_token_length = get_max_tokens_length(prompt, source)
        tokenized_prompt = tokenizer(
            prompt,
            max_length=max_token_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Free memory left over from the previous document before generating.
        clear_gpu_memory()

        # generate correction
        with torch.no_grad():
            response = model.generate(
                input_ids=tokenized_prompt["input_ids"].to(device),
                attention_mask=tokenized_prompt["attention_mask"].to(device),
                max_new_tokens=max_token_length,
                pad_token_id=tokenizer.eos_token_id
            )

        # preprocess output
        response_text = tokenized_response_to_text(response, prompt)

        # Write to the path passed by the caller (not a notebook global).
        # Appending per document keeps partial results if the run is interrupted;
        # UTF-8 is set explicitly because the output is Ukrainian text and the
        # platform default encoding may not be able to represent it.
        with open(output_file_path, 'a+', encoding='utf-8') as file:
            file.write(f'# {id_str}\n')
            file.write(response_text)
            file.write('\n')

### Define output path

In [22]:
# Results go to <current working directory>/<generated_file_name>/<generated_file_name>.txt
output_folder = os.path.join(os.getcwd(), generated_file_name)
# Create the folder if needed; an already existing folder is not an error.
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder,f'{generated_file_name}.txt')

### Generate

In [23]:
# Run generation over the whole de-duplicated test set (the recorded run took about 1h47m).
generate_response_for_test_set(model, tokenizer, result_path, corpus_test_list)

100%|██████████| 166/166 [1:47:04<00:00, 38.70s/it]
