# Load datasets

In [None]:
from datasets import load_dataset
import re

# Dataset identifier handed to datasets.load_dataset below.
dataset_repo = "NorHsangPha/shan-news-shannews_org"

# Load only the "train" split of the dataset.
dataset = load_dataset(dataset_repo, split="train")
# Keep just the "content" column; the corpus-writing cell iterates over it.
contents = dataset["content"]

# Quick visual check of what was loaded.
print(contents)

In [None]:
def split_text(text):
    """Split *text* into sentences, cutting right after every ``။``.

    The delimiter stays attached to the sentence it ends. Each piece is
    trimmed of surrounding whitespace and pieces that are empty after
    trimming are dropped.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty, whitespace-trimmed sentence strings.
    """
    # Zero-width look-behind: split *after* the delimiter without consuming it.
    trimmed = (piece.strip() for piece in re.split('(?<=\\။)', text))
    return [piece for piece in trimmed if piece]

# Dump the corpus to a plain-text file: one sentence per line, with an empty
# line after each article so article boundaries remain visible.
with open("shannews.txt", 'w', encoding='utf-8') as corpus_file:
    for article in contents:
        corpus_file.writelines(f"{sentence}\n" for sentence in split_text(article))
        corpus_file.write('\n')

print("Saved to shannews.txt")

In [1]:
import re
from shannlp import word_tokenize, shan_characters

def remove_emojis(data):
    """Return *data* with every character in the emoji/symbol class removed.

    The class is far broader than emoji. The ranges U+24C2-U+1F251 and
    U+10000-U+10FFFF together cover every code point from U+24C2 upward, so
    CJK, Hangul and all other scripts encoded above that point are removed
    as well. Characters below U+200D (ASCII, the main Myanmar block, ...)
    are never touched.

    Args:
        data: The string to clean.

    Returns:
        The string with all matching characters deleted (nothing is
        inserted in their place).
    """
    char_class = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # very wide span, see docstring
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"  # every supplementary-plane code point
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
    )
    matcher = re.compile("[" + char_class + "]+", re.UNICODE)
    return matcher.sub('', data)

def remove_latin_text(text, keep_numbers=False):
    """Drop everything outside the main Myanmar block and squeeze whitespace.

    Despite the name, this removes *every* character that is neither in
    U+1000-U+109F nor whitespace, not just Latin letters.

    NOTE(review): the Myanmar Extended-A/B blocks (U+AA60-U+AA7F,
    U+A9E0-U+A9FF) lie outside the kept range and are removed too. Confirm
    that this is intended.

    Args:
        text: The string to clean.
        keep_numbers: When true, ASCII digits ``0-9`` are kept as well.

    Returns:
        The filtered string with each whitespace run collapsed to a single
        space. Leading/trailing whitespace is not stripped.
    """
    digits = "0-9" if keep_numbers else ""
    text = re.sub(rf"[^\u1000-\u109f{digits}\s]", '', text)
    text = re.sub(r"\s+", " ", text)
    return text


def remove_myanmar_text(text, keep_numbers=False):
    """Keep only the tokens that consist entirely of Shan characters.

    The text is tokenized with ``word_tokenize(..., engine="newmm")``. A
    token is kept only when each of its characters is in ``shan_characters``
    or is whitespace. The check is per character, so a token built solely
    from characters present in ``shan_characters`` is kept regardless of the
    language it was written in.

    Args:
        text: The string to clean.
        keep_numbers: When true, ASCII digits ``0-9`` are also accepted
            inside a token.

    Returns:
        The kept tokens joined back together, with whitespace runs collapsed
        to a single space and the ends stripped.
    """
    # Build the lookup once instead of scanning shan_characters per character.
    allowed = set(shan_characters)
    if keep_numbers:
        allowed.update("0123456789")

    kept_tokens = [
        token
        for token in word_tokenize(text, engine="newmm")
        if all(char in allowed or char.isspace() for char in token)
    ]

    cleaned_text = "".join(kept_tokens)
    return re.sub(r'\s+', ' ', cleaned_text).strip()


def clean_shan_text(text, keep_numbers=False):
    """Normalise a line of text and reduce it to Shan characters.

    Steps, in order: strip emoji/symbols, normalise spacing around ``၊`` and
    ``။``, apply a few character-level fixes, drop non-Myanmar-block
    characters, drop tokens containing non-Shan characters, and finally
    remove any character left over that is not in ``shan_characters``.

    Args:
        text: The string to clean.
        keep_numbers: When true, ASCII digits ``0-9`` survive the cleaning.

    Returns:
        The cleaned string.
    """
    text = remove_emojis(text)

    # Exactly one space after each punctuation mark, none before it.
    text = text.replace("၊", "၊ ").replace("။", "။ ").replace(" ၊", "၊ ").replace(" ။", "။ ").strip()
    # Collapse a repeated ႉ into a single one.
    text = re.sub(r"ႉ{2,}", "ႉ", text)
    # Swap the order of these two-character sequences (presumably fixing a
    # common typing order; confirm with a native reader).
    text = text.replace("ႆၢ", "ၢႆ")
    text = text.replace("ေတ", "တေ")

    # Forward the flag: both helpers would otherwise discard the digits
    # before the final filter below gets a chance to keep them.
    text = remove_latin_text(text, keep_numbers=keep_numbers)
    text = remove_myanmar_text(text, keep_numbers=keep_numbers)

    # Latin Numbers
    numbers = r"0-9" if keep_numbers else ""
    text = re.sub(rf'[^{numbers}{shan_characters}\s]', '', text)

    return text

In [6]:
from shannlp import word_tokenize

# Sample input: two space-separated words run through the tokenizer and the filter.
text = "ဝၢၼ်ႈယေႇပူႇၵေႃႉၸႅပ်ႉ ရေပူကကော့စပ်ရွာ"

# Raw tokenization, for comparison with the filtered result below.
print(word_tokenize(text, engine="newmm"))
# The filter is per character: in the recorded output the first two tokens
# after the space survive, while the last token is dropped.
print(remove_myanmar_text(text))

['ဝၢၼ်ႈ', 'ယေႇ', 'ပူႇ', 'ၵေႃႉ', 'ၸႅပ်ႉ', ' ', 'ရေ', 'ပူ', 'ကကော့စပ်ရွာ']
ဝၢၼ်ႈယေႇပူႇၵေႃႉၸႅပ်ႉ ရေပူ


In [None]:
import os
import random
import pathlib
import subprocess

# Build single-line ground-truth samples: pick random corpus lines, clean
# them, write each one to <stem>_<n>.gt.txt and render it with text2image.
training_text_file = './shannews.txt'

# The corpus was written as UTF-8; read it back with the same encoding
# instead of relying on the platform default.
with open(training_text_file, 'r', encoding='utf-8') as input_file:
    lines = [line.strip() for line in input_file]

output_directory = "../data/shn-ground-truth"
# NOTE(review): fonts_dir is defined but never passed to text2image, so the
# "Shan" font is resolved from whatever fonts the tool finds by default.
# Confirm whether a --fonts_dir argument was intended.
fonts_dir = '/home/noernova/Labs/tesstrain/shan-datasets/fonts'

# Also creates missing parent directories and tolerates reruns.
os.makedirs(output_directory, exist_ok=True)

random.shuffle(lines)

# Number of shuffled corpus lines to consider. Lines that end up shorter
# than 20 characters after cleaning are skipped, so fewer samples may result.
count = 10000

lines = lines[:count]

# Loop-invariant: every sample shares the corpus file's stem as its prefix.
training_text_file_name = pathlib.Path(training_text_file).stem

line_count = 0
for line in lines:
    line = clean_shan_text(line)

    # remove short sentences
    if len(line) < 20:
        continue

    file_base_name = f'{training_text_file_name}_{line_count}'
    line_training_text = os.path.join(output_directory, f'{file_base_name}.gt.txt')
    with open(line_training_text, 'w', encoding='utf-8') as output_file:
        output_file.write(line)

    result = subprocess.run([
        'text2image',
        '--font=Shan',
        f'--text={line_training_text}',
        f'--outputbase={output_directory}/{file_base_name}',
        '--max_pages=1',
        '--strip_unrenderable_words',
        '--leading=32',
        '--xsize=3600',
        '--ysize=480',
        '--char_spacing=1.0',
        '--exposure=0',
        '--unicharset_file=data/shn/unicharset'
    ])

    if result.returncode != 0:
        # Do not leave a ground-truth text behind for a failed render; the
        # index is reused by the next sample.
        print(f"text2image failed (exit code {result.returncode}) for {line_training_text}; skipping")
        os.remove(line_training_text)
        continue

    line_count += 1