In [1]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[0mInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
import pandas as pd

trans_df = pd.read_csv('/kaggle/input/kaz-rus-parallel-corpus/kazakh_russian_parallel_corpus.csv')

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
pd.options.display.max_colwidth = 100

In [5]:
# Calculate lengths of the text in each column
kazakh_lengths = trans_df['Kazakh'].str.len()
source_lengths = trans_df['Source'].str.len()

# Apply the filtering condition
trans_df = trans_df[(source_lengths > kazakh_lengths * 0.7) & (source_lengths < kazakh_lengths * 1.3)]

trans_df = trans_df.drop_duplicates()

In [6]:
print(trans_df.shape)
print(trans_df.columns)

(32691, 2)
Index(['Source', 'Kazakh'], dtype='object')


In [7]:
trans_df.sample(10)

Unnamed: 0,Source,Kazakh
28399,"Мы более в нем не нуждаемся, мама.","− Ол енді бізге керек емес, апа."
12921,Через несколько дней Хосе Аркадио Буэндия подыскал дом для семейства коррехидора.,"Бірнеше күн өткен соң, Хосе Аркадио Буэндиа коррехидордың отбасына үй тауып берді."
7802,Человек долго смотрел на Джоуда.,Oл адам Джoудқа ұзақ қарады.
4204,"Легко себе представить, что странные повадки Динго не раз служили темой бесед, которые вели на к...","Дингоның адам таңқаларлық қылығы капитан Гульдің, миссис Уэлдонның және Диктің кеме кормосындағы..."
33010,"— Все исправно, — сказал англичанин.","— Бәз бабында бәрі, — деді ағылшын."
21581,"Сколько мы плыли по реке, сколько делали для этих мошенников, и все зря!","Біз өзенмен қанша жол жүрдік, сол алаяқтарға қанша қызмет еттім, бәрі босқа!"
15683,"Я от него ничего не видела, кроме добра...",Мен одан жақсылықтан өзге ештеңе көрген емеспін.
5911,"— Друзья мои, — сказал он вполголоса, как будто разговаривая сам с собой, — слушайте меня.","— Достарым,— деді ол дауысын бәсең шығарып, өзімен өзі сөйлесіп тұрған кісі сияқтанып,— менің сө..."
17979,"Спектакль был не такой дрянной, как те, что я раньше видел.",Спектакль бұрын көргендеріме қарағанда жаман емес сияқты еді.
17522,"Две вещи здесь как-то странно останавливали внимание: небольшой погнутый самовар, стоявший на по...","Өте-мөте екі нәрсе айырықша назар аударады: біреуісі - сөре үстінде тұрған, қабысқан кішкене сам..."


In [8]:
trans_df.isnull().sum()

Source    0
Kazakh    0
dtype: int64

In [9]:
# this code is adapted from  the Stopes repo of the NLLB team
# https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214

import re
import sys
import typing as tp
import unicodedata
from sacremoses import MosesPunctNormalizer


mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]


def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        # same as \p{C} in perl
        # see https://www.unicode.org/reports/tr44/#General_Category_Values
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    clean = mpn.normalize(text)
    clean = replace_nonprint(clean)
    # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
    clean = unicodedata.normalize("NFKC", clean)
    return clean

In [10]:
# Apply the preprocessing function to each column
trans_df['Kazakh'] = trans_df['Kazakh'].apply(preproc)
trans_df['Source'] = trans_df['Source'].apply(preproc)

In [11]:
from sklearn.model_selection import train_test_split

# dataset splitting into test, train, dev sets
test_size = 300
dev_size = 300

# First, create the training set and a remaining set (test + dev)
train_df, remaining_df = train_test_split(trans_df, test_size=test_size + dev_size, random_state=42)

# Split the remaining set into test and dev
test_df, dev_df = train_test_split(remaining_df, test_size=test_size, random_state=42)


In [12]:
print(train_df.shape)
print(test_df.shape)
print(dev_df.shape)

(32091, 2)
(300, 2)
(300, 2)


# Preparing the training loop

In [13]:
import gc
import random
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TranslationPipeline


def cleanup():
    """Try to free GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

2024-06-01 20:09:35.899846: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-01 20:09:35.899954: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-01 20:09:36.038901: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
# Load the tokenizer and model
model_name = 'issai/tilmash'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.49G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [15]:
model.cuda()

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(256206, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-23): 24 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=1024, bias=True)
       

In [16]:
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)

In [17]:
batch_size = 8  # 32 already doesn't fit well to 15GB of GPU memory
max_length = 128
warmup_steps = 1_000
training_steps = 57000

In [18]:
losses = []
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

In [19]:
LANGS = [('Source', 'rus_Cyrl'), ('Kazakh', 'kaz_Cyrl')]

def get_batch_pairs(batch_size, data=train_df):
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)
    xx, yy = [], []
    for _ in range(batch_size):
        item = data.iloc[random.randint(0, len(data)-1)]
        xx.append(preproc(item[l1]))
        yy.append(preproc(item[l2]))
    return xx, yy, long1, long2

print(get_batch_pairs(1))


(['Күн асқан сайын жасыл желегі қоюланып келе жатқан жөке ағашы мен сары тал терезенің сыртынан көктем иісін себездеп, майда самал оны подвалға кіргізді.'], ['С каждым днем все сильнее зеленеющие липы и ветла за окном источали весенний запах, и начинающийся ветерок заносил его в подвал.'], 'kaz_Cyrl', 'rus_Cyrl')


In [20]:
model.train()
x, y, loss = None, None, None
cleanup()

tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 1000 == 0:
        print(i, np.mean(losses[-1000:]))


  0%|          | 0/57000 [00:00<?, ?it/s]

0 1.6621999740600586


KeyboardInterrupt: 

In [22]:
savedModelPath = 'tilmash_fine_tuned'
model.save_pretrained(savedModelPath)

Non-default generation parameters: {'max_length': 200}
