Recommended instance spec for data cleaning:
- ml.m7i.8xlarge

In [None]:
%%capture
! pip install tokenizers transformers ipywidgets pandas datasets wandb huggingface_hub tqdm

In [None]:
! pip install accelerate -U
# ! pip install transformers[torch]

In [None]:
# aws s3 sync s3://monolingual.data/A/ /home/ec2-user/SageMaker/monolingual/A/ --no-sign-request

# aws s3 sync s3://openpecha.cleaned/tokenized_raw_text/ /home/ec2-user/SageMaker/monolingual/gold/ --no-sign-request

In [None]:
import os
# Point the Hugging Face cache environment variables at the SageMaker directory.
# Set here, before the cells that import `datasets` / `huggingface_hub`.
os.environ.update({
    'HF_HOME': '/home/ec2-user/SageMaker/cache',
    'HF_DATASETS_CACHE': '/home/ec2-user/SageMaker/cache/datasets',
})

In [None]:
!echo $HF_HOME
!echo $HF_DATASETS_CACHE

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
import pandas as pd
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import torch
from torch.utils.data.dataset import Dataset

import os
import math

from huggingface_hub import HfFolder, notebook_login

In [None]:
from datasets import load_dataset
dataset = load_dataset('spsither/tibetan_monolingual_A', cache_dir="/home/ec2-user/SageMaker/cache/datasets", num_proc=8)

In [None]:
import re
# Characters deleted from every transcription: ASCII / typographic punctuation,
# Tibetan ornamental marks and stray symbols. Written as a raw string so the
# backslashes reach the regex engine instead of being parsed as (invalid)
# string escape sequences.
_CHARS_TO_IGNORE_RE = re.compile(r"[\,\?\.\!\-\;\:\"\“\%\‘\”\�\/\{\}\(\)༽》༼《༅༄༈༑༠'|·×༆༔༷༸༾ཿ྄྅྆྇ྋ࿒ᨵ​’„╗᩺╚༿᫥ྂ༊ྈ࿄࿉࿐྾༜]")


def clean_transcription(text):
    """Normalize one line of Tibetan text and strip unwanted characters.

    Steps, in order:
      1. newlines and tabs become spaces;
      2. variant characters are mapped to a single canonical character;
      3. every character in the ignore set is deleted;
      4. surrounding whitespace is stripped;
      5. whitespace before a shad/tsheg is dropped, runs of shad / tsheg /
         whitespace are collapsed, and each shad is followed by one space;
      6. runs of laughter / filler syllables are capped at three repetitions.

    The ignore set is applied *before* the collapsing steps, so deleting a
    character cannot leave behind a doubled space, a doubled tsheg, leading
    whitespace or a laughter run longer than three.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string with one extra space appended at the end.
    """
    text = text.replace('\n', ' ').replace('\t', ' ')

    # There are two kinds of tsheg: fold the non-breaking one (U+0F0C) into the
    # regular one (U+0F0B).
    text = text.replace("༌", "་")
    # Fold the double shad (U+0F0E) into the single shad (U+0F0D).
    text = text.replace("༎", "།")
    # Fold the doubled vowel signs into their single forms.
    text = text.replace("ཽ", "ོ")
    text = text.replace("ཻ", "ེ")

    # Delete junk first, then strip, so the normalisation below sees the final text.
    text = _CHARS_TO_IGNORE_RE.sub('', text)
    text = text.strip()

    text = re.sub(r"\s+།", "།", text)   # no whitespace before a shad
    text = re.sub(r"།+", "།", text)     # collapse repeated shad
    text = re.sub(r"།", "། ", text)     # one space after every shad
    text = re.sub(r"\s+་", "་", text)   # no whitespace before a tsheg
    text = re.sub(r"་+", "་", text)     # collapse repeated tsheg
    text = re.sub(r"\s+", " ", text)    # collapse whitespace runs

    # Cap repeated laughter / filler syllables at three occurrences.
    text = re.sub(r"ཧཧཧ+", "ཧཧཧ", text)
    text = re.sub(r'ཧི་ཧི་(ཧི་)+', r'ཧི་ཧི་ཧི་', text)
    text = re.sub(r'ཧེ་ཧེ་(ཧེ་)+', r'ཧེ་ཧེ་ཧེ་', text)
    text = re.sub(r'ཧ་ཧ་(ཧ་)+', r'ཧ་ཧ་ཧ་', text)
    text = re.sub(r'ཧོ་ཧོ་(ཧོ་)+', r'ཧོ་ཧོ་ཧོ་', text)
    text = re.sub(r'ཨོ་ཨོ་(ཨོ་)+', r'ཨོ་ཨོ་ཨོ་', text)

    return text + " "
print(clean_transcription('ཧིཧོ་ཧོ་ཧོ་ཧོ་ཧོ་ཧོ་  ཧ་ ཧ་ཧ་། །  འ་༽འ་××འ༌༌༌༌༌༌༌གྲོ།ཚ ར་སོང�་ངེ་། '))

print(clean_transcription('༼ཕ་༽། སེང་གེ་སྒྲའི་སྒྲུབ་ཐབས་བཞུགས། ། བླ་མ་དམ་པའི་ཞབས་ལ་ཕྱག་འཚལ་ལོ།'))

In [None]:
import re

# Characters treated as separators: tsheg, shad and the plain space.
delimiters = "་། "
# A run of one or more separators counts as a single split point.
pattern = "[" + re.escape(delimiters) + "]+"


def max_char_btw_tsak(example):
    """Return the length of the longest stretch of text between separators.

    The string is split on runs of the characters in ``delimiters``; the result
    is 0 when ``example`` is empty or consists only of separators.
    """
    longest = 0
    for piece in re.split(pattern, example):
        if len(piece) > longest:
            longest = len(piece)
    return longest

def merge_text_lines(examples):
    """Clean a batch of lines and attach per-line length statistics.

    Despite its name this function does not merge anything: each entry of
    ``examples['text']`` is passed through ``clean_transcription`` and returned
    together with two derived columns.

    Returns:
        dict with
          'text'              - the cleaned lines,
          'char_len'          - ``len()`` of each cleaned line,
          'max_char_btw_tsak' - ``max_char_btw_tsak()`` of each cleaned line.
    """
    cleaned = [clean_transcription(raw) for raw in examples['text']]
    return {
        'text': cleaned,
        'char_len': list(map(len, cleaned)),
        'max_char_btw_tsak': list(map(max_char_btw_tsak, cleaned)),
    }

In [None]:
%%time
dataset_meta = dataset.map(merge_text_lines, batched=True)

In [None]:
dataset_meta.push_to_hub('tibetan_monolingual_A_meta')

In [None]:
from datasets import load_dataset
dataset = load_dataset('spsither/tibetan_monolingual_A_meta', cache_dir="/home/ec2-user/SageMaker/cache/datasets", num_proc=16)

In [None]:
# Batched predicate: returns one boolean per row of the batch.
def filter_condition(examples):
    """Decide, row by row, which examples of a batch to keep.

    A row is kept when its ``max_char_btw_tsak`` is strictly between 1 and 9
    and its ``char_len`` is strictly between 15 and 1000.
    """
    keep = []
    for longest_segment, length in zip(examples['max_char_btw_tsak'], examples['char_len']):
        keep.append(1 < longest_segment < 9 and 15 < length < 1000)
    return keep

In [None]:
# Apply the filter
filtered_dataset = dataset.filter(filter_condition, batched = True, num_proc=32)

In [None]:
filtered_dataset.push_to_hub('tibetan_monolingual_A_filtered')

In [None]:
from datasets import load_dataset
dataset = load_dataset('spsither/tibetan_monolingual_A_filtered', cache_dir="/home/ec2-user/SageMaker/cache/datasets", num_proc=32)

In [None]:
dataset = dataset.remove_columns(["char_len", 'max_char_btw_tsak'])

In [None]:
import hashlib

In [None]:
def get_hash(example):
    """Return ``{"hash": ...}`` holding the MD5 hex digest of the stripped ``text`` field."""
    normalized = example["text"].strip()
    digest = hashlib.md5(normalized.encode("utf-8"))
    return {"hash": digest.hexdigest()}

def check_uniques(example, uniques):
    """Report whether this example's hash is still unclaimed, claiming it if so.

    Returns True and removes the hash from ``uniques`` the first time a hash is
    seen; returns False (leaving ``uniques`` untouched) for any later example
    carrying the same hash. ``uniques`` is mutated in place.
    """
    digest = example["hash"]
    if digest not in uniques:
        return False
    uniques.remove(digest)
    return True

def preprocess(example):
    """Compute the extra columns for one example; currently only its ``hash``.

    Returns a fresh dict built from the output of ``get_hash``.
    """
    return dict(get_hash(example))

def filter(example, uniques):
    """Keep an example only if its hash has not been seen before.

    Delegates to ``check_uniques``, which removes the hash from ``uniques`` on
    first sight, so every later example with the same hash is rejected.
    Note: this name shadows Python's built-in ``filter``.
    """
    return bool(check_uniques(example, uniques))

In [None]:
# Run preprocessing
dataset = dataset.map(preprocess, num_proc=32)

In [None]:
# Deduplicate hashes
uniques_train = set(dataset['train']["hash"])

In [None]:
# Keep only the first row seen for each hash in the train split. `filter` returns
# the result of check_uniques, which removes each hash from `uniques_train` the
# first time it is met, so later rows with the same hash are dropped and the set
# is consumed by this call (rebuild it before re-running this cell).
# NOTE(review): presumably this must stay single-process (no num_proc), since the
# rows need to share one `uniques_train` set — confirm before parallelising.
ds_filter_train = dataset['train'].filter(filter, fn_kwargs={"uniques": uniques_train})

In [None]:
uniques_test = set(dataset['test']["hash"])

In [None]:
# Keep only the first row seen for each hash in the test split. `filter` returns
# the result of check_uniques, which removes each hash from `uniques_test` the
# first time it is met, so later rows with the same hash are dropped and the set
# is consumed by this call (rebuild it before re-running this cell).
# NOTE(review): presumably this must stay single-process (no num_proc), since the
# rows need to share one `uniques_test` set — confirm before parallelising.
ds_filter_test = dataset['test'].filter(filter, fn_kwargs={"uniques": uniques_test})

In [None]:
from datasets import Dataset, DatasetDict
deduped_filtered_dataset = DatasetDict()

deduped_filtered_dataset['train'] = ds_filter_train
deduped_filtered_dataset['test'] = ds_filter_test

In [None]:
deduped_filtered_dataset = deduped_filtered_dataset.remove_columns(["hash"])

In [None]:
deduped_filtered_dataset.push_to_hub('tibetan_monolingual_A_filtered_deduped')

In [None]:
# save the filtered_dataset as files on disk and train BPE tokenizer
deduped_filtered_dataset['test'].to_csv('deduped_filtered_dataset_test')
deduped_filtered_dataset['train'].to_csv('deduped_filtered_dataset_train')

In [None]:
! tail -n +2 deduped_filtered_dataset_test > tmp.csv && mv tmp.csv  /home/ec2-user/SageMaker/monolingual/A_filtered_deduped/deduped_filtered_dataset_test.csv
! tail -n +2 deduped_filtered_dataset_train > tmp.csv && mv tmp.csv /home/ec2-user/SageMaker/monolingual/A_filtered_deduped/deduped_filtered_dataset_train.csv

### `tibetan_monolingual_A_filtered_deduped` doesn't need to be merged to form paragraphs: some sentences are already up to 1000 characters long.

In [None]:
def merge_text_lines(examples):
    """Merge neighbouring lines of a batch into groups of 1, 2 and 4 lines.

    The batch is split into thirds by position. Lines in the first third are
    kept as they are, lines in the second third are joined in pairs, and lines
    in the last third are joined in groups of four (joined with a space).

    Groups are counted from the start of their own third and never cross into
    the next one, so every input line appears exactly once in the output; the
    last group of a third may be shorter than the group size. (Choosing group
    starts with ``i % step == 0`` on the absolute index would skip lines
    whenever a third does not begin on a multiple of the step.)

    Note: this reuses the name of the cleaning helper defined in an earlier
    cell and replaces it from this cell onwards.

    Args:
        examples: batch mapping with a 'text' list of strings.

    Returns:
        dict with a 'text' list holding the merged lines; it is generally
        shorter than the input list.
    """
    lines = [line.strip() for line in examples['text']]
    total_examples = len(lines)
    first_third_point = total_examples // 3        # end of the first third
    second_third_point = 2 * total_examples // 3   # end of the second third

    # (start, end, lines per group) for each third of the batch.
    sections = (
        (0, first_third_point, 1),                     # don't merge
        (first_third_point, second_third_point, 2),    # merge every 2 lines
        (second_third_point, total_examples, 4),       # merge every 4 lines
    )

    merged_examples = []
    for start, end, step in sections:
        for i in range(start, end, step):
            # Clip at the section end so a group never spills into the next third.
            merged_examples.append(' '.join(lines[i:min(i + step, end)]))

    return {'text': merged_examples}

In [None]:
%%time
filtered_merged_dataset = deduped_filtered_dataset.map(merge_text_lines, batched=True)

In [None]:
filtered_merged_dataset.push_to_hub('tibetan_monolingual_A_filtered_deduped_merged_124_lines')