# Datasets and Model training

### Libraly

In [None]:
!pip install -q transformers sentencepiece

In [None]:
!pip install -q datasets
!pip install -q wandb

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Datasets

In [None]:
# append all datasets to a list
data = []
data.append(pd.read_csv('/content/drive/MyDrive/Klong/Klong_haripunchai.csv')) 
data.append(pd.read_csv('/content/drive/MyDrive/Klong/Klong_lokanit.csv'))
data.append(pd.read_csv('/content/drive/MyDrive/Klong/Klong_supan.csv'))
data.append(pd.read_csv('/content/drive/MyDrive/Klong/Klong_wadruak.csv'))
data.append(pd.read_csv('/content/drive/MyDrive/Klong/klong_jaofahapai.csv'))
klong_dataset_list = []
for klong in range(5):
  for rowidx in range(data[klong].shape[0]):
    klong_dataset_list.append('\n'.join(data[klong].iloc[rowidx])) # join with \n

In [None]:
# split train valid function 
def split_data(data:list, test_split_ratio:float) -> tuple:
  split_calculation = 100 - (round(len(data) * test_split_ratio)) # need to be int
  valid = data[:split_calculation]
  train = data[split_calculation:]
  return train, valid

In [None]:
# split train valid 0.2
klong_train, klong_valid = split_data(klong_dataset_list, 0.2)
# to datasets
klong_train = Dataset.from_dict({"content": klong_train})
klong_valid = Dataset.from_dict({"content": klong_valid})

In [None]:
# shuffle data
raw_datasets = DatasetDict(
    {
        "train": klong_train.shuffle(),  # .shuffle().select(range(50000)),
        "valid": klong_valid.shuffle(),  # .shuffle().select(range(500))
    }
)

#### Tokenizing

In [None]:
# tokenize data
def tokenize(element, context_length=128): # context_length will cut of
  outputs = tokenizer(
      element["content"],
      truncation=True,
      max_length=context_length,
      return_overflowing_tokens=True,
      return_length=True,
  )
  print(outputs.keys())
  input_batch = []
  for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
      input_batch.append(input_ids) # if enabled it will ignore token lenght shorter than context_length
  return {"input_ids": input_batch}

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])


Map:   0%|          | 0/1161 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])
dict_keys(['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'])


### Modeling

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
try:
  out = data_collator([tokenized_datasets["train"][i] for i in range(len(tokenized_datasets["train"]))])
except:
  out = data_collator(tokenized_datasets["train"])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a XGLMTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([165, 66])
attention_mask shape: torch.Size([165, 66])
labels shape: torch.Size([165, 66])


In [None]:
model = AutoModelForCausalLM.from_pretrained("facebook/xglm-564M")

### Training


In [None]:
# log in hugging face
from huggingface_hub import notebook_login
notebook_login() 

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -q git+https://github.com/huggingface/accelerate

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./KarveeSaimai",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=0,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1,
    fp16=True,
    push_to_hub=True,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

Cloning https://huggingface.co/Thanravee/KarveeSaimai into local empty directory.


In [None]:
# train model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


TrainOutput(global_step=20, training_loss=0.14196333885192872, metrics={'train_runtime': 1026.2366, 'train_samples_per_second': 0.161, 'train_steps_per_second': 0.019, 'total_flos': 15211101609984.0, 'train_loss': 0.14196333885192872, 'epoch': 0.97})

# Check and generate


### Khavee and check eak tou 

### Libraly

In [1]:
!pip install -q git+https://github.com/PyThaiNLP/pythainlp

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pythainlp (setup.py) ... [?25l[?25hdone


In [2]:
# install pythainlp and ssg(subword tokenizer)
!pip install -q ssg

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.8/473.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for fire (setup.py) ... [?25l[?25hdone


In [3]:
from typing import List, Union
from pythainlp.tokenize import subword_tokenize,word_tokenize
from pythainlp.util import sound_syllable
from pythainlp.util import remove_tonemark
from pythainlp.khavee import KhaveeVerifier

In [4]:
import pythainlp as pythai
from pythainlp.tokenize import word_tokenize
from pythainlp.tokenize import subword_tokenize
from pythainlp.util import sound_syllable
from pythainlp.util import isthai
from pythainlp.transliterate import pronunciate
from pythainlp.spell import correct
from tqdm import tqdm
import numpy as np
import pandas as pd
from google.colab import drive
kv = KhaveeVerifier()
# drive.mount('/content/drive')

In [5]:
# Transformers
!pip install transformers sentencepiece
from transformers import AutoTokenizer, AutoModelForCausalLM

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [6]:
tokenizer = AutoTokenizer.from_pretrained("Thanravee/KarveeSaimai", local_files_only=False)
model = AutoModelForCausalLM.from_pretrained("Thanravee/KarveeSaimai", local_files_only=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

### Word and Subword Tokenizing

In [7]:
# split text from \n to list and drop soi word ->  splitted wak list (no soi)
def split_klong(klong_text):
  splitted_klong = []
  klong_list = klong_text.split('\n')
  klong_list = [klong for klong in klong_list if klong.strip()]
  for i in range(len(klong_list)):
    if i == 1 or i == 3 or i == 5: 
      klong = klong_list[i]
      if klong[0] == ' ': 
        klong = klong[1:]
      klong = klong.split(' ')
      splitted_klong.append(klong[0])
    else:
      splitted_klong.append(klong_list[i].replace(' ', ''))
  return splitted_klong

In [8]:
# subword tokenize wak with ssg and dict
def subword_token(wak, engine='ssg'):
  subword_tokenized = subword_tokenize(wak, engine='ssg')
  if len(subword_tokenized) != 5 and len(subword_tokenized) != 2:
      subword_tokenized = subword_tokenize(wak, engine='dict')
  return subword_tokenized

### Check Functions

#### Number of syllables check

In [9]:

# check number of syllables -> [True, True, True, True, True, True, True, True] (len=8)
def subword_num(splitted_klong):
  checked = []
  two = [1,3,5]
  five = [0,2,4,6]
  for num in range(len(splitted_klong)):
    if num in two:
      checked.append(len(subword_token(splitted_klong[num])) == 2)
    elif num in five: 
      checked.append(len(subword_token(splitted_klong[num])) == 5)
    elif num == 7:
      checked.append(len(subword_token(splitted_klong[num])) == 4)
  return checked

#### eak tou check


In [10]:
# check what word tone is 
def find_tone(word):
  char_list = [*word]
  if "่" in char_list or sound_syllable(word) == 'dead':
    return "eak or dead"
  elif "้" in char_list:
    return "tou"
  else:
    return False

In [11]:
# check eaktou -> list[True, True, True, True, True, True, True, True] (len=8)
def check_eaktou(splitted_klong):
  checked = []
  for num in range(len(splitted_klong)):
    tokenzied_wak = subword_token(splitted_klong[num])
    if num == 0:
      checked.append(find_tone(tokenzied_wak[3]) == "eak or dead" and find_tone(tokenzied_wak[4]) == 'tou')
    elif num == 1:
      checked.append(True)
    elif num == 2:
      checked.append(find_tone(tokenzied_wak[1]) == "eak or dead")
    elif num == 3:
      checked.append(find_tone(tokenzied_wak[0]) == 'eak or dead' and find_tone(tokenzied_wak[1]) == 'tou')
    elif num == 4:
      checked.append(find_tone(tokenzied_wak[2]) == 'eak or dead')
    elif num == 5:
      checked.append(find_tone(tokenzied_wak[1]) == 'eak or dead')
    elif num == 6:
      checked.append(find_tone(tokenzied_wak[1]) == "eak or dead" and find_tone(tokenzied_wak[4]) == 'tou')
    elif num == 7:
      checked.append(find_tone(tokenzied_wak[0]) == "eak or dead" and find_tone(tokenzied_wak[1]) == 'tou')
  return checked

#### sampas check

In [12]:
# last sound of wak from pronunciate tokenized last word of each wak
# ex [เสียงลือเสียงเล่าอ้าง] -> [อ้าง]
def sound_words(splitted_klong):
  sound_list = []
  for wak in splitted_klong:
    list_char = [*wak]
    if " " in list_char:
      wak = wak.split(" ")
      wak = wak[0]
    wak = word_tokenize(wak, engine="newmm")
    pronounce_word = pronunciate(wak[-1], engine="w2p")
    sound_list.append(pronounce_word.replace('ฺ', '').split('-')[-1])
  return sound_list

In [13]:
# check sampas -> [True, True, True] 
# [0] = sampas wak 2-3, [1] = sampas wak 2-4, [2] sampas wak 4-7
def check_sampas(sound_list):
  checked = []
  if len(sound_list) > 2:
    checked.append(kv.check_sumpus(sound_list[1],sound_list[2]))
    if len(sound_list) > 4:
      checked.append(kv.check_sumpus(sound_list[1],sound_list[4]))
      if len(sound_list) > 6:
        checked.append(kv.check_sumpus(sound_list[3],sound_list[6]))
  else:
    checked.append(True)
  return checked

#### Main Check

In [15]:
def main_check(klong_text):
  splitted_klong = split_klong(klong_text)
  checked_subword_num = subword_num(splitted_klong)
  if False in checked_subword_num:
    false_index = checked_subword_num.index(False)
    return 'syllable format error', false_index+1
  else:
    checked_eaktou = check_eaktou(splitted_klong)
    if False in checked_eaktou:
      false_index = checked_eaktou.index(False)
      return 'eaktou format error', false_index+1
    else:
      sound_list = sound_words(splitted_klong)
      checked_sampas = check_sampas(sound_list)
      if False in checked_sampas:
        wak_sampas = ['2 and 3', '2 and 5', '4 and 7']
        return 'sampas format error', wak_sampas[checked_sampas.index(False)]
      else:
        return True

### Generate Klong

In [16]:
input_text = 'เสียงลือเสี่ยงเล่าอ้าง\nอันใด พี่เอย\nเสี่ยงย่อมยอยศใคร\nทั่วหล้า\nสองเขือพี่หลับใหล\nลืมตื่น ฤาพี่\nสองพี่คิดเองอ้า\nอย่าได้ถามเผือ'
def gen_prob_next_token(text:str, model, tokenizer):
  input_ids = tokenizer(input_text, return_tensors="pt")
  #look at tensor shape
  input_ids,input_ids['input_ids'].shape 

  #get logit of the next token
  outputs = model(input_ids['input_ids'])
  logits = outputs.logits
  logits.shape #the size is equal to input token because it's predicting the next one

  #convert logit to prob; use the logits of the last input token
  import torch.nn.functional as F
  probs = F.softmax(logits[:, -1, :], dim=-1).squeeze() 
  probs, probs.argmax()

  #match prob with vocab
  import pandas as pd
  df = pd.DataFrame(tokenizer.vocab.items(), columns=['token', 'token_id']).sort_values('token_id').reset_index(drop=True)
  
  df['prob'] = probs.detach().numpy()

  possible_token = df.sort_values('prob',ascending=False).reset_index()
  thai_only = [x if isthai(x) else None for x in possible_token['token']] # thai only
  possible_token['token'] = thai_only
  possible_token = possible_token.dropna()
  return possible_token
  
prob = gen_prob_next_token(input_text, model, tokenizer) # prob คือคำที่เป็นไปได้ทั้งหมด

In [58]:
# filter broken word and get passed only 100 words
def gen_rules(probs, fast_gen=True):
  passed = []
  limiter = 100 if fast_gen else 100000000
  for prob in probs:
    if len(check_word(prob)) > 1 and len(subword_token(prob)) == 1 and '-' not in pronunciate(prob) and len(passed) <= limiter:
        passed.append(correct(prob))
  return passed

In [41]:
def check_word(word):
  alphabets = [alp for alp in [*word] if alp not in ['่','้','๊','๋','์']]
  if '์' in [*word]:
    alphabets = [*word][:-2]
  return alphabets

In [43]:
def generator(klong):
  prob = gen_prob_next_token(klong, model, tokenizer)
  new_prob = gen_rules(prob['token'].tolist())
  return new_prob

In [62]:
# get word with sampas
def get_sampassed(data:list, sampaswith):
  passed = []
  counter_exception = 0
  for possible_word in tqdm(data):
    possible_sampas = pronunciate(possible_word).split('-')[-1] # reduce word dimension
    sampaswith = pronunciate(sampaswith).split('-')[-1] # reduce word dimension
    try:
      if kv.is_sumpus(possible_sampas, sampaswith):
        passed.append(possible_word)
    except IndexError:
      counter_exception += 1
      continue
  assert len(passed) != counter_exception # if this failed mena that this function skipped all sampass which shouldn't be the case
  return passed

In [61]:
# get word with aek or too
def get_aek_too(data:list, ktype='aek'):
  passed = []
  for possible_word in tqdm(data):
      if kv.check_aek_too(possible_word) == ktype:
        passed.append(possible_word)
  return passed

In [63]:
def tone_gen(klong_text, gened_word, word_mark='no', sampas=False):
  splitted_klong = split_klong(klong_text)
  if word_mark == 'no' and sampas == False:
     probs = generator(klong_text)
     for prob in probs:
       if prob not in gened_word:
         gened_word.append(prob)
         return prob, gened_word
  elif word_mark == 'aek' and sampas == False:
    probs = generator(klong_text)
    aek = get_aek_too(probs)
    for prob in aek:
      if prob not in gened_word:
        gened_word.append(prob)
        return prob, gened_word
  elif word_mark == 'too' and sampas == False:
    probs = generator(klong_text)
    too = get_aek_too(probs, 'too')
    for prob in too:
      if prob not in gened_word:
        gened_word.append(prob)
        return prob, gened_word
  elif sampas == True and word_mark == 'no':
    probs = gen_prob_next_token(klong_text, model, tokenizer)
    probs = probs['token']
    passed = get_sampassed(probs, sound_words(splitted_klong)[1])
    for prob in passed:
       if prob not in gened_word:
         gened_word.append(prob)
         return prob, gened_word
  elif sampas == True and word_mark == 'too':
    probs = gen_prob_next_token(klong_text, model, tokenizer)
    probs = probs['token']
    passed = get_sampassed(probs, sound_words(splitted_klong)[3])
    for prob in passed:
       if prob not in gened_word and kv.check_aek_too(prob) == 'too':
         gened_word.append(prob)
         return prob, gened_word

In [60]:
def gen_klong(klong_text_input, gened_word):
  splitted_klong = split_klong(klong_text_input)
  klong_text = klong_text_input
  # วรรค 2, 4, 6
  if len(splitted_klong) in [1, 3, 5]:
    word_gen = 2
    if len(splitted_klong) == 1:
      # ฉันทลักษณ์ (none, none(sampas))
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'
    elif len(splitted_klong) == 3:
      # ฉันทลักษณ์ (aek, too(sampas))
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word, 'too')
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'
    elif len(splitted_klong) == 5:
      # ฉันทลักษณ์ (none, aek)
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'

  # วรรค 3, 5, 7
  elif len(splitted_klong) in [2, 4, 6]:
    word_gen = 5
    if len(splitted_klong) == 2:
      # ฉันทลักษณ์ (none, aek, none, none, none(sampas))
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      sampas_word = sound_words(splitted_klong)[1]
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='no', sampas=True)
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'
    elif len(splitted_klong) == 4:
      # ฉันทลักษณ์ (none, none, aek, none, none(sampas))
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      sampas_word = sound_words(splitted_klong)[1]
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='no', sampas=True)
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'
    elif len(splitted_klong) == 6:
      # ฉันทลักษณ์ (none, aek, none, none, too(sampas))
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      prob, gened_word  = tone_gen(klong_text, gened_word)
      klong_text = klong_text + prob
      sampas_word = sound_words(splitted_klong)[1]
      prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='too', sampas=True)
      klong_text = klong_text + prob
      klong_text = klong_text + '\n'
  # วรรค 8
  elif len(splitted_klong) == 7:
    # ฉันทลักษณ์ (eak, too, none, none)
    word_gen = 4
    prob, gened_word  = tone_gen(klong_text, gened_word, word_mark='aek')
    klong_text = klong_text + prob
    prob, gened_word  = tone_gen(klong_text, gened_word, 'too')
    klong_text = klong_text + prob
    prob, gened_word  = tone_gen(klong_text, gened_word)
    klong_text = klong_text + prob
    prob, gened_word  = tone_gen(klong_text, gened_word)
    klong_text = klong_text + prob
    klong_text = klong_text + '\n'
  return klong_text, gened_word

### Main Function


In [48]:
# main 
def main(klong_text):
  gened_klong = []
  splitted = split_klong(klong_text)
  if main_check(klong_text) == True:
    wak_num = len(splitted)
    for i in range(8-wak_num):
      klong_text, gened_klong = gen_klong(klong_text, gened_klong)
    return klong_text
  else:
    return main_check(klong_text)

# MAIN

In [64]:
base_text = 'ส่องคันฉ่องสะท้อน\n'
main(base_text)

100%|██████████| 101/101 [00:00<00:00, 286232.91it/s]
100%|██████████| 3699/3699 [00:36<00:00, 102.34it/s]
100%|██████████| 101/101 [00:00<00:00, 191166.38it/s]
100%|██████████| 101/101 [00:00<00:00, 128957.29it/s]
100%|██████████| 101/101 [00:00<00:00, 107491.68it/s]
100%|██████████| 3699/3699 [00:37<00:00, 97.50it/s]
100%|██████████| 101/101 [00:00<00:00, 293979.67it/s]
100%|██████████| 101/101 [00:00<00:00, 304985.39it/s]
100%|██████████| 3699/3699 [00:33<00:00, 110.48it/s]
100%|██████████| 101/101 [00:00<00:00, 123649.94it/s]
100%|██████████| 101/101 [00:00<00:00, 130829.12it/s]


'ส่องคันฉ่องสะท้อน\nข้อน้อง\nตนพี่ถ้าขวัญ่ง\nว่าต้อง\nอกดดพ่อฮาร้อง\nกลกล่าว\nกระกล่องดรติห้อง\nเรื่องค้นเสียงดก\n'