
 """
 ## Simplify Estonian Sentences with Saved Model

Author: Eduard Barbu

Contact: eduard.barbu@ut.ee

This script handles two scenarios for simplifying Estonian sentences:
1. Simplifying a tab-separated (TSV) dataset with columns ["source", "original", "simplified", "corrected"].

2. Simplifying sentences from a file where each line contains a number and a sentence to simplify, separated by a tab.

Ensure the appropriate input format for the chosen scenario and specify the file paths accordingly.

## Dependencies:
- `unsloth`
- `datasets`
- `pandas`
"""



In [1]:
# %%capture
# Install required packages in Google Colab
# torch/torchvision are pinned to their CUDA 11.8 builds, looked up via the PyTorch wheel index (-f).
# NOTE(review): the recorded output of the model-loading cell reports Torch 2.3.1+cu121 - confirm this pin is still wanted.
!pip install torch==2.0.0+cu118 torchvision==0.15.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html
# Unsloth is installed straight from its GitHub repository (no tag or commit is pinned).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install these packages without letting pip pull in their own dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes
# xformers is capped at 0.0.27; --no-cache-dir makes pip bypass its local cache.
!pip install "xformers<=0.0.27" --no-cache-dir
# numpy is pinned to 1.23.5.
!pip install numpy==1.23.5


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==2.0.0+cu118
  Using cached https://download.pytorch.org/whl/cu118/torch-2.0.0%2Bcu118-cp311-cp311-linux_x86_64.whl (2267.3 MB)
Collecting triton==2.0.0 (from torch==2.0.0+cu118)
  Using cached triton-2.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.0 kB)
Using cached triton-2.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
Installing collected packages: triton, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.3.1
    Uninstalling triton-2.3.1:
      Successfully uninstalled triton-2.3.1
  Attempting uninstall: torch
    Found existing installation: torch 2.3.1
    Uninstalling torch-2.3.1:
      Successfully uninstalled torch-2.3.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xforme

In [2]:
import xformers
# Print the xformers version that is importable in this runtime
# (the install cell caps it at 0.0.27).
print(xformers.__version__)

0.0.27


In [3]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive so the data, template and model paths below resolve.
drive.mount('/content/drive')

# Define the path to the Google Drive directory containing the data and model
directory_path = '/content/drive/MyDrive/DataToShare/Estonian-Text-Simplification/Data/Estonian-Training-And-Test-Sets'
# Directory of the fine-tuned model; the model-loading cell passes it to from_pretrained.
model_save_path = os.path.join(directory_path, "models", "fine_tuned_model")
# Prompt template file; its "{input_sentence}" placeholder is replaced with each sentence.
template_file_path = os.path.join(directory_path, 'template-simplification-llama.txt')

# Print the path where the model is stored
print(f"Model is stored at: {model_save_path}")

# Define the scenario: "dataset" or "numbered_sentences"
#   "dataset"            -> reads Test-Sets/Estonian-TestSet.tsv
#   "numbered_sentences" -> reads article-fictitious.txt
# Any other value makes the data-loading cell raise ValueError.
scenario = "dataset"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model is stored at: /content/drive/MyDrive/DataToShare/Estonian-Text-Simplification/Data/Estonian-Training-And-Test-Sets/models/fine_tuned_model


In [4]:
def read_tsv_file(file_path, columns):
    """
    Reads a tab-separated file into a DataFrame with the given column names.

    Each line is split on tabs. Lines whose number of fields differs from
    ``len(columns)`` are reported on stdout and skipped; blank lines are
    skipped silently.

    Args:
        file_path (str): Path to the UTF-8 encoded TSV file.
        columns (list[str]): Column names; also fixes the expected field count.

    Returns:
        pandas.DataFrame: One row per accepted line, all values as strings
        with surrounding whitespace removed from every field.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Remove only the line terminator. A plain strip() would also eat
            # leading/trailing tabs, so a row with an empty first or last
            # column would lose a field and be rejected.
            record = line.rstrip('\r\n')
            if not record.strip():
                continue  # blank (or whitespace-only) line: nothing to report
            # Strip per field, matching how the numbered-sentence reader
            # cleans its fields.
            parts = [field.strip() for field in record.split('\t')]
            if len(parts) == len(columns):
                data.append(parts)
            else:
                print(f"Skipping line: {line}")
    return pd.DataFrame(data, columns=columns)


def read_template(file_path):
    """
    Load a prompt template from disk.

    Args:
        file_path (str): Location of the UTF-8 encoded template file.

    Returns:
        str: The complete text of the file, exactly as read.
    """
    with open(file_path, mode='r', encoding='utf-8') as template_file:
        template_text = template_file.read()
    return template_text


In [5]:
import os
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
import time

# Read the input that matches the configured scenario.
# Unknown scenario names are rejected up front.
if scenario not in ("dataset", "numbered_sentences"):
    raise ValueError("Invalid scenario. Choose 'dataset' or 'numbered_sentences'.")

if scenario == "dataset":
    # Scenario 1: a TSV test set with the four columns listed below
    test_columns = ["source", "original", "simplified", "corrected"]
    test_file_path = os.path.join(directory_path, 'Test-Sets', 'Estonian-TestSet.tsv')
    test_df = read_tsv_file(test_file_path, test_columns).reset_index(drop=True)
else:
    # Scenario 2: a text file with "<number><TAB><sentence>" on every line
    input_file_path = os.path.join(directory_path, 'article-fictitious.txt')
    numbered_sentences = []
    with open(input_file_path, 'r', encoding='utf-8') as sentence_file:
        for raw_line in sentence_file:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                # Anything that is not exactly two fields is reported and ignored
                print(f"Skipping line: {raw_line}")
                continue
            number_text, sentence_text = fields
            numbered_sentences.append((number_text.strip(), sentence_text.strip()))


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [6]:
# Loading options for the fine-tuned model
max_seq_length = 2048  # Maximum sequence length handed to the loader
dtype = None  # None lets the loader auto-detect (Float16 for T4/V100, BFloat16 for Ampere)
load_in_4bit = True  # 4-bit quantization keeps memory usage down

# Step 1: load the model together with its tokenizer
print("Loading model and tokenizer...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_save_path,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    print("Model and tokenizer loaded successfully.")
except Exception as load_error:
    print(f"Failed to load model and tokenizer: {load_error}")
    # Clear both names so a half-initialised or stale model cannot be used later
    model, tokenizer = None, None
    raise load_error

# Step 2: switch the loaded model to Unsloth's fast inference mode
try:
    print("Enabling fast inference...")
    FastLanguageModel.for_inference(model)
    print("Fast inference enabled.")
except Exception as inference_error:
    print(f"Failed to enable fast inference: {inference_error}")
    raise inference_error


Loading model and tokenizer...
==((====))==  Unsloth 2025.1.7: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.3.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 2.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

unsloth/Meta-Llama-3.1-8B-bnb-4bit does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


Unsloth 2025.1.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Model and tokenizer loaded successfully.
Enabling fast inference...
Fast inference enabled.


In [7]:
def simplify_sentence(sentence, simplification_template, tokenizer, model, max_new_tokens=100):
    """
    Simplifies a given sentence using the trained model with fast inference enabled.
    The prompt for the model is constructed based on a provided template.

    Args:
        sentence (str): The sentence to simplify.
        simplification_template (str): The template used to construct the prompt;
            every "{input_sentence}" occurrence is replaced with ``sentence``.
        tokenizer: The tokenizer for the model.
        model: The trained model.
        max_new_tokens (int): Upper bound on the number of generated tokens.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        str: The simplified sentence. If the decoded output contains
        "### Simplified version:", only the text between the first and the
        second occurrence of that marker (or the end of the text) is returned;
        otherwise the whole decoded output is returned, stripped.
    """
    marker = "### Simplified version:"

    # Replace the placeholder in the template with the actual sentence
    prompt = simplification_template.replace("{input_sentence}", sentence)

    # Move the inputs to the device the model lives on instead of assuming
    # "cuda"; fall back to "cuda" for model objects without a `device` attribute.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the simplified sentence; EOS doubles as the padding token
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    simplified_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

    # Extract the part after the marker.
    # NOTE(review): index [1] selects the text after the FIRST marker; if the
    # template ever contains worked examples with this marker, this would pick
    # the first example rather than the model's answer - confirm against the template.
    if marker in simplified_sentence:
        simplified_sentence = simplified_sentence.split(marker)[1].strip()

    return simplified_sentence


In [8]:
# Load the prompt template once, before branching: both scenarios pass it to
# simplify_sentence. It used to be assigned only inside the "dataset" branch,
# which made the "numbered_sentences" branch fail with a NameError.
simplification_template = read_template(template_file_path)

if scenario == "dataset":
    # Simplify the dataset
    print("Simplifying dataset...")
    simplified_sentences = []
    for original_sentence in test_df['original']:
        start_time = time.time()  # Start timing
        simplified = simplify_sentence(original_sentence, simplification_template, tokenizer, model)
        end_time = time.time()  # End timing
        time_taken = end_time - start_time

        # Print original, simplified sentence, and time taken
        print(f"Original sentence:\n{original_sentence}")
        print(f"Simplified sentence:\n{simplified}")
        print(f"Time taken: {time_taken:.2f} seconds")
        print("--------------------------------------------------------")

        simplified_sentences.append(simplified)
    # Overwrite the 'simplified' column with the model's predictions
    test_df['simplified'] = simplified_sentences

    # Save results to a TSV file
    output_file_path = os.path.join(directory_path, 'Estonian-Predictions-Dataset.tsv')
    test_df.to_csv(output_file_path, sep='\t', index=False, encoding='utf-8')
    print(f"Simplified dataset saved to {output_file_path}")

elif scenario == "numbered_sentences":
    # Simplify sentences from the numbered file
    print("Simplifying numbered sentences...")
    simplified_sentences = []
    for number, original_sentence in numbered_sentences:
        start_time = time.time()  # Start timing
        simplified = simplify_sentence(original_sentence, simplification_template, tokenizer, model)
        end_time = time.time()  # End timing
        time_taken = end_time - start_time

        # Print number, original, simplified sentence, and time taken
        print(f"Number: {number}")
        print(f"Original sentence:\n{original_sentence}")
        print(f"Simplified sentence:\n{simplified}")
        print(f"Time taken: {time_taken:.2f} seconds")
        # Same separator as the dataset branch (a bare `print` without call
        # parentheses does nothing).
        print("--------------------------------------------------------")

        # Keep the results so they are available after the loop, in input order
        simplified_sentences.append(simplified)




Simplifying dataset...
Original sentence:
Esimest tüüpi Wolframi sündroom on põhjustatud mutatsioonidest WFS1 geenis, kuid teist tüüpi sündroom on põhjustatud mutatsioonidest CSID2 geenis.
Simplified sentence:
Esimest tüüpi Wolframi sündroom on põhjustatud WFS1 geeni mutatsioonidest. Teist tüüpi sündroom on põhjustatud CSID2 geeni mutatsioonidest.
Time taken: 6.77 seconds
--------------------------------------------------------
Original sentence:
Gormi poeg Harald Sinihammas rajas Jellingisse oma vanematele vägeva hauamonumendi, mis koosneb kahest hiiglaslikust kääpast, kahest ruunikivist, suurest kivilaevast ja kirikust.
Simplified sentence:
Gormi poeg Harald Sinihammas ehitas Jellingisse oma vanematele vägeva hauamonumendi. See monumendi koosneb kahest suurest kääpast, kahest ruunikivist, suurest kivilaevast ja kirikust.
Time taken: 4.31 seconds
--------------------------------------------------------
Original sentence:
Sellegipoolest oli segmentide kirjeldajate laadimine kulukas ope