In [2]:
!pip install lark

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try apt install
[31m   [0m python3-xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian-packaged Python package,
[31m   [0m create a virtual environment using python3 -m venv path/to/venv.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip. Make
[31m   [0m sure you have python3-full installed.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian packaged Python application,
[31m   [0m it may be easiest to use pipx install xyz, which will manage a
[31m   [0m virtual environment for you. Make sure you have pipx installed.
[31m   [0m 
[31m   [0m See /usr/share/doc/python3.12/README.venv for more information.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS dist

# Контекстно-свободная грамматика для toki pona

Описывать будем toki pona, в частности именные группы (с модификаторами) и простые предложения (с прямым объектом). Также я сделаю одно важное допущение: в toki pona всё достаточно сложно с частями речи, т.к. языковые единицы не могут изменяться, а одна и та же единица может принимать значения разных частей речи в зависимости от своей позиции. Например:

| word   | meaning                                  |
|--------|------------------------------------------|
| `telo` | вода, жидкость, мокрое, пить, мыть, течь |
| `moku` | еда, есть                                |
| `ona`  | местоимение третьего лица                |
| `e`    | показатель прямого объекта               |
| `li`   | показатель предиката (глагола)           |

ona li telo - они пьют

ona li moku - они едят

ona li moku e moku telo - они едят мокрую еду (суп, например)

ona li telo e telo - они пьют воду

Для простоты (хотя интересно об этом потом подумать) мы возьмем ограниченный корпус toki pona "со снятой омонимией", где будем использовать каждую единицу в качестве только одной части речи.

```bnf
start: sentence

sentence: context_phrase? subject_part predicate
context_phrase: np "la"
subject_part: subject "li"?
subject: np ("en" np)*
predicate: verb_phrase (object | prep_phrase)*
verb_phrase: PREVERB? VERB modifier_phrase?
object: "e" np
prep_phrase: PREPOSITION_MOD np

np: (NOUN | PRONOUN) modifier_phrase?
modifier_phrase: (modifier | pi_phrase)+
pi_phrase: "pi" np
modifier: NOUN

PREPOSITION_MOD: "kepeken" | "lon" | "tawa" | "tan" | "sama"
PREVERB: "awen" | "kama" | "ken" | "lukin" | "sona" | "wile" | "alasa"
PRONOUN: "mi" | "sina" | "ona"
NOUN: "telo" | "ilo" | "jan" | "kala" | "kasi" | "tomo" | "moku" | "soweli" | "esun" | "jo" | "kalama" | "toki" | "kili" | "suli" | "lili" | "pona"
VERB: "esun" | "jo" | "kalama" | "toki" | "moku"

%import common.WS
%ignore WS
```

Попробуем помучать LARK

In [3]:
from lark import Lark

In [4]:
# Lark grammar for a small toki pona fragment: an optional "la" context phrase,
# a subject (noun phrases joined by "en"), and a predicate made of a verb phrase
# followed by any number of "e" objects and prepositional phrases.
#
# NOTE(review): `subject_part` makes "li" optional after ANY subject, so inputs
# such as "mi li moku telo" are accepted (see the later cell that flags this).
# NOTE(review): "esun", "jo", "kalama", "toki" and "moku" appear in both NOUN and
# VERB, so these words are not limited to a single part of speech.
toki_pona_grammar = r"""
start: sentence

sentence: context_phrase? subject_part predicate
context_phrase: np "la"
subject_part: subject "li"?
subject: np ("en" np)*
predicate: verb_phrase (object | prep_phrase)*
verb_phrase: PREVERB? VERB modifier_phrase?
object: "e" np
prep_phrase: PREPOSITION_MOD np

np: (NOUN | PRONOUN) modifier_phrase?
modifier_phrase: (modifier | pi_phrase)+
pi_phrase: "pi" np
modifier: NOUN

PREPOSITION_MOD: "kepeken" | "lon" | "tawa" | "tan" | "sama"
PREVERB: "awen" | "kama" | "ken" | "lukin" | "sona" | "wile" | "alasa"
PRONOUN: "mi" | "sina" | "ona"
NOUN: "telo" | "ilo" | "jan" | "kala" | "kasi" | "tomo" | "moku" | "soweli" | "esun" | "jo" | "kalama" | "toki" | "kili" | "suli" | "lili" | "pona"
VERB: "esun" | "jo" | "kalama" | "toki" | "moku"

%import common.WS
%ignore WS

"""
# Build the parser with `start` as the entry rule and parse a first sentence
# that has a bare "mi" subject (no "li") and a direct object.
parser = Lark(toki_pona_grammar, start="start")
tree = parser.parse("mi toki e ona")  # I am talking to him/her/them
print(tree.pretty())

start
  sentence
    subject_part
      subject
        np	mi
    predicate
      verb_phrase	toki
      object
        np	ona



In [5]:
# Same sentence shape, this time with the explicit "li" marker after the subject.
tree = parser.parse("ona li moku e mi")  # it is eating me (so sad)
print(tree.pretty())

start
  sentence
    subject_part
      subject
        np	ona
    predicate
      verb_phrase	moku
      object
        np	mi



In [6]:
# The object is a noun phrase with a modifier: head noun "moku" + modifier "telo".
tree = parser.parse("ona li moku e moku telo")  # they are eating a soup
print(tree.pretty())

start
  sentence
    subject_part
      subject
        np	ona
    predicate
      verb_phrase	moku
      object
        np
          moku
          modifier_phrase
            modifier	telo



In [10]:
from lark import Visitor, Token


class Translator(Visitor):
    """Word-by-word glosser for sentences parsed with the toki pona grammar.

    The parse tree is walked top-down and every named token is looked up in
    ``toki_pona_dict`` under its terminal type (``NOUN``, ``VERB``, ...), so a
    word such as ``moku`` is glossed differently as a noun and as a verb.

    Attributes:
        leaves: ``(token_type, token_value)`` pairs collected by the most
            recent ``translate`` call.
        toki_pona_dict: Mapping ``terminal type -> {word -> English gloss}``.
    """

    def __init__(self):
        self.leaves = []
        self.toki_pona_dict = {
            "PREPOSITION_MOD": {
                "kepeken": "using, with",
                "lon": "in, at, on",
                "tawa": "to, for, toward",
                "tan": "from, because of",
                "sama": "like, same as",
            },
            "PREVERB": {
                "awen": "keep, stay",
                "kama": "become, come",
                "ken": "can, may",
                "lukin": "see, look (attempt)",
                "sona": "know, know how to",
                "wile": "want, need, must",
                "alasa": "hunt, seek",
            },
            "PRONOUN": {
                "mi": "I, me, we, us",
                "sina": "you",
                "ona": "he, she, it, they, them",
            },
            "NOUN": {
                "telo": "water, liquid",
                "ilo": "tool, machine",
                "jan": "person, people",
                "kala": "fish, sea creature",
                "kasi": "plant, herb",
                "tomo": "house, building, room",
                "moku": "food, meal",
                "soweli": "animal, land mammal",
                "esun": "market, shop, trade",
                "jo": "possession, having",
                "kalama": "sound, noise",
                "toki": "language, speech",
                "kili": "fruit, vegetable",
                "suli": "size, greatness",
                "lili": "smallness, fewness",
                "pona": "good, simplicity",
            },
            "VERB": {
                "esun": "trade, buy, sell",
                "jo": "have, possess",
                "kalama": "make noise, play an instrument",
                "toki": "speak, say, communicate",
                "moku": "eat, drink, consume",
            },
        }

    def __default__(self, tree):
        """Record the direct token children of ``tree`` as (type, value) pairs.

        No rule-specific visitor methods are defined on this class, so this
        fallback handles every subtree.
        """
        self.leaves.extend(
            (child.type, child.value)
            for child in tree.children
            if isinstance(child, Token)
        )

    def translate(self, sentence, parser, merge=True):
        """Parse ``sentence`` and gloss each collected token.

        Args:
            sentence: Text to parse.
            parser: Object whose ``parse(sentence)`` returns the tree to walk.
            merge: If true, return the glosses joined by single spaces;
                otherwise return them as a list.

        Returns:
            A string (``merge=True``) or a list of strings (``merge=False``).
            Words missing from the dictionary are returned unchanged.
        """
        # Start from a clean slate so that reusing one Translator for several
        # sentences does not prepend the tokens of earlier calls.
        self.leaves = []
        tree = parser.parse(sentence)
        # NOTE(review): tokens are gathered node by node during the top-down
        # walk; this yields sentence order as long as every rule lists its
        # terminals before its sub-rules, as the grammar in this notebook
        # appears to do - confirm before extending the grammar.
        self.visit_topdown(tree)
        glosses = [
            self.toki_pona_dict.get(pos, {}).get(word, word)
            for pos, word in self.leaves
        ]
        return " ".join(glosses) if merge else glosses


# Rebuild the parser from the same grammar, this time without passing `start`.
parser = Lark(toki_pona_grammar)

# Visit the tree and collect tokens in order
translator = Translator()

# merge=False returns one gloss per token instead of a single joined string.
translator.translate("mi toki e ona", parser, merge=False)

['I, me, we, us', 'speak, say, communicate', 'he, she, it, they, them']

In [13]:
translator = Translator()

# Error: "li" must not be placed after "mi" (the grammar nevertheless accepts this input)
translator.translate("mi li moku telo", parser, merge=False)

['I, me, we, us', 'eat, drink, consume', 'water, liquid']

Окей, он выдает нам какой-то набор слов, это уже неплохо. Интересно, сможем ли мы перевести эти "лемматизированные" предложения на нормальный английский язык? С учетом того, как мал корпус токи поны, это может быть одним из вариантов перевода.

In [14]:
from nltk.stem import WordNetLemmatizer
import nltk
from datasets import load_dataset, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [7]:
# Shared lemmatizer instance used by lemmatize_sentence below.
# NOTE(review): no nltk.download(...) call is visible in this notebook; presumably
# the required NLTK resources are already installed - confirm on a fresh machine.
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Return ``sentence`` with every token reduced to its lemma.

    The sentence is tokenized and POS-tagged with ``nltk``. A token whose tag
    starts with N, V, J or R is lemmatized as a noun, verb, adjective or
    adverb respectively; any other token is kept as is. The resulting tokens
    are joined with single spaces.
    """
    # First letter of the POS tag -> POS code handed to the lemmatizer.
    pos_by_tag_initial = {"N": "n", "V": "v", "J": "a", "R": "r"}

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return " ".join(
        lemmatizer.lemmatize(token, pos_by_tag_initial[tag[:1]])
        if tag[:1] in pos_by_tag_initial
        else token
        for token, tag in tagged_tokens
    )

# Load the "train" split of the WikiText-103 ("wikitext-103-v1") configuration;
# its "text" field is the source of English sentences for the training pairs.
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")

In [8]:
from tqdm import tqdm

# Work on a fixed-seed random sample of 100,000 rows to keep the run short.
dataset = dataset.shuffle(seed=42).select(range(100000))

# Build (lemmatized -> original) sentence pairs from the sampled rows.
# Pairs are kept even when lemmatization leaves the sentence unchanged.
training_pairs = []
for example in tqdm(dataset):
    text = example["text"]
    if not text.strip():
        continue  # skip empty lines
    for sentence in nltk.sent_tokenize(text):
        word_count = len(sentence.split())
        if word_count < 5 or word_count > 20:
            continue  # keep only sentences of 5 to 20 whitespace-separated words
        training_pairs.append(
            {"lemmatized": lemmatize_sentence(sentence), "original": sentence}
        )

100%|██████████| 100000/100000 [00:54<00:00, 1842.82it/s]


In [10]:
# Load the pretrained "t5-small" checkpoint and the tokenizer of the same name.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of (lemmatized, original) sentence pairs.

    Args:
        examples: Batch mapping with parallel lists under the keys
            "lemmatized" (model input) and "original" (target).
        max_length: Length to which every input and label sequence is
            truncated and padded.

    Returns:
        The tokenizer output for the "inflect: "-prefixed inputs, with an
        added "labels" entry holding the target token ids. Padding positions
        in the labels are set to -100 so that the loss skips them.
    """
    inputs = ["inflect: " + ex for ex in examples["lemmatized"]]
    targets = examples["original"]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    # Labels are padded to a fixed length; left as pad ids they would be scored
    # like real tokens. -100 is the label value the model's loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [15]:
# Positional split: the first 80,000 pairs are used for training and every
# remaining pair for validation.
train_dataset = Dataset.from_list(training_pairs[:80000])
val_dataset = Dataset.from_list(training_pairs[80000:])

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 80000/80000 [00:17<00:00, 4565.71 examples/s]
Map: 100%|██████████| 22089/22089 [00:04<00:00, 4643.71 examples/s]


In [16]:
# Fine-tuning configuration: 3 epochs, batch size 16 for training and
# evaluation, evaluation once per epoch, outputs written under ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

# NOTE(review): no data collator and no compute_metrics are passed here, so the
# per-epoch evaluation presumably reports only the validation loss - confirm.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [17]:
# Start fine-tuning. The recorded run below was stopped by a KeyboardInterrupt
# before any epoch row was logged.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 