# Combine all the data into one dataset

Once you've run the following notebooks,
- 01_stt_pecha_tools.ipynb
- 02_prodigy.ipynb
- 03_mv_saymore.ipynb

You get the following tsv files
- 01_stt_pecha_tools.tsv
- 02_prodigy.tsv
- 03_mv_saymore.tsv

Concatenate the tsv files and clean the data

In [None]:
# Paths of the per-source TSV exports written by notebooks 01-03.
stt_pecha_tools = "01_stt_pecha_tools.tsv"
prodigy = "02_prodigy.tsv"
mv = "03_mv_saymore.tsv"

In [None]:
import pandas as pd
# Load each per-source export (tab-separated) into its own DataFrame,
# in the same order as the path variables are listed.
pecha_tools_df, prodigy_df, mv_df = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (stt_pecha_tools, prodigy, mv)
]

In [None]:
df = pd.concat([pecha_tools_df, prodigy_df, mv_df], ignore_index=True)

In [None]:
bad_apples = pd.read_csv('04_bad_apples.csv')

In [None]:
df = df[~df['file_name'].isin(bad_apples['file_name'])]

Drop duplicates. Duplicates were introduced by the Prodigy annotation tool, which is one of the reasons we had to move away from it.

In [None]:
# Keep the first row for every file_name. The result is rebound instead of
# using inplace=True: `df` is itself the result of a boolean filter, and
# mutating such a frame in place can raise SettingWithCopyWarning.
df = df.drop_duplicates(subset='file_name', keep="first")

### clean the combined tsv

Remove unwanted characters and remove transcriptions that contain English characters

In [None]:
import re
# Punctuation, brackets and ornamental marks that are stripped from transcripts.
# Written as a raw string so the backslashes are regex escapes rather than
# (invalid) Python string escapes, and compiled once instead of on every call.
_CHARS_TO_IGNORE = re.compile(
    r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\/\{\}\(\)\༽\》\༼\《\༅\༄\༈\༑\༠]'
)


def clean_transcription(text):
    """Normalise one transcription string.

    Steps, in order:

    1. Remove every character matched by ``_CHARS_TO_IGNORE``.
    2. Collapse all whitespace runs (including newlines and tabs) to a single
       space and strip the ends.
    3. Collapse repeated tsheg (``་``) and repeated shad (``།``) to one.
    4. Remove whitespace that directly precedes a shad.
    5. Cap the repeated-syllable runs ``ཧཧཧ…``, ``འེ་འེ་འེ་…`` and ``ཧ་ཧ་ཧ་…`` at
       three repetitions.

    Unwanted characters are removed *first* so that the gaps they leave behind
    (double spaces, a space before a shad, adjacent tshegs) are cleaned up by
    the later steps instead of surviving into the output.

    Args:
        text: Transcription string.

    Returns:
        The cleaned string followed by a single trailing space (the trailing
        space is part of the established output format).
    """
    text = _CHARS_TO_IGNORE.sub('', text)

    # \s covers '\n' and '\t', so no separate replace calls are needed.
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    text = re.sub(r"\s+།", "།", text)

    text = re.sub(r"ཧཧཧ+", "ཧཧཧ", text)
    text = re.sub(r'འེ་འེ་(འེ་)+', r'འེ་འེ་འེ་', text)
    text = re.sub(r'ཧ་ཧ་(ཧ་)+', r'ཧ་ཧ་ཧ་', text)

    return text + " "

def check_if_regex(text):
    """Return True if the string form of *text* contains an ASCII Latin letter.

    The argument is passed through ``str()`` first, so non-string values are
    tested on their textual representation.
    """
    return re.search(r'[a-zA-Z]+', str(text)) is not None
    
print(check_if_regex('ཧཧ'))
print(check_if_regex('some text'))

In [None]:
df = df[df['uni'].str.len() > 0]

In [None]:
df = df[~df['uni'].apply(check_if_regex)]

In [None]:
df['uni'] = df['uni'].map(clean_transcription)

In [None]:
! pip install pyewts

### Convert Unicode Tibetan characters into Wylie format

In [None]:
import pyewts

converter = pyewts.pyewts()

df['wylie'] = df['uni'].apply(converter.toWylie)

In [None]:
df['char_len'] = df['uni'].str.len()

In [None]:
! pip install matplotlib

In [None]:
df['char_len'].hist(bins=100, range=(0, 300))

In [None]:
df['char_len'].describe()

In [None]:
df.shape[0]

In [None]:
upper_cutoff = 400
lower_cutoff = 5

In [None]:
df[df['char_len'] > upper_cutoff].shape[0], df[df['char_len'] < lower_cutoff].shape[0]

In [None]:
df[df['char_len'] > upper_cutoff][['uni', 'url', 'char_len']].sort_values(by='char_len').to_csv('04_longer_than_upper_cutoff.tsv', sep='\t', index=False)

In [None]:
df[df['char_len'] < lower_cutoff][['uni', 'url', 'char_len']].sort_values(by='char_len').to_csv('04_shorter_than_lower_cutoff.tsv', sep='\t', index=False)

In [None]:
# Keep rows whose length lies within [lower_cutoff, upper_cutoff], bounds
# included. The out-of-range exports above use strict `>` / `<`, so an
# inclusive filter here guarantees every row is either kept or exported;
# strict comparisons silently dropped rows sitting exactly on a cutoff.
df = df[df['char_len'].between(lower_cutoff, upper_cutoff)]

In [None]:
df.shape[0]

### Get the audio time duration from the file name. 

There are two formats for encoding the time span. We now use the one with \_to\_. The difference came from using a different library to do the Voice Activity Detection and to split the audio for Tibetan Teachings. We have since started using pyannote-audio for all departments.

In [None]:
def getTimeSpan(filename):
    """Return the duration encoded in a segment file name, or 0 if unparsable.

    Two naming schemes are handled:

    * ``<prefix>_<start>_to_<end>[_<suffix>]``: the absolute difference is
      divided by 1000 (presumably start/end are milliseconds; TODO confirm).
    * ``<prefix>_<start>-<end>[_<suffix>]``: the absolute difference is
      returned unchanged (presumably already seconds; TODO confirm).

    Any ``.wav``/``.WAV``/``.mp3``/``.MP3`` occurrence is removed before parsing.

    Args:
        filename: Segment file name. A non-string value (e.g. NaN from a
            missing cell) is reported and yields 0 instead of raising.

    Returns:
        The duration as a float, or 0 when the name cannot be parsed (a
        message is printed in that case).
    """
    try:
        # Inside the try so that non-string input is reported, not raised.
        for extension in (".wav", ".WAV", ".mp3", ".MP3"):
            filename = filename.replace(extension, "")

        if "_to_" in filename:
            start, end = filename.split("_to_")
            divisor = 1000
        else:
            start, end = filename.split("-")
            divisor = 1

        # The start value is the last "_" field before the separator and the
        # end value is the first "_" field after it.
        start = float(start.split("_")[-1])
        end = float(end.split("_")[0])
    except (AttributeError, TypeError, ValueError):
        print(f"filename is:'{filename}'. Could not parse to get time span.")
        return 0

    return abs(end - start) / divisor
    

getTimeSpan("STT_TT00031_03471.850-03477.44")

In [None]:
df['audio_len'] = df['file_name'].apply(getTimeSpan)

In [None]:
df['audio_len'].describe()

In [None]:
df['audio_len'].hist(bins=100, range=(0, 15))

In [None]:
# Percent-encode '#' so it is not treated as a URL fragment separator.
# The vectorised .str accessor leaves missing URLs as NaN, whereas calling
# x.replace on each element raised AttributeError for a NaN cell.
# regex=False keeps '#' a literal match.
df['url'] = df['url'].str.replace('#', '%23', regex=False)

In [None]:
! pip install botok

Use [botok](https://github.com/OpenPecha/Botok) to get the count of non-Tibetan syllables and illegal Tibetan syllables.

In [None]:
from botok import WordTokenizer

In [None]:
def process_transcript(text, tokenizer):
    """Tokenize *text* and count its suspicious tokens.

    Args:
        text: Transcription string to analyse.
        tokenizer: Object whose ``tokenize(text)`` returns a sized sequence of
            tokens exposing ``pos``, ``skrt`` and ``chunk_type`` attributes.

    Returns:
        A ``(non_word_count, non_bo_word_count, total_tokens)`` tuple where

        * ``non_word_count`` counts tokens with ``pos == 'NON_WORD'`` whose
          ``skrt`` flag is falsy,
        * ``non_bo_word_count`` counts tokens whose ``chunk_type`` is
          ``"LATIN"`` or ``"CJK"``, plus ``"OTHER"`` tokens whose ``skrt`` flag
          is falsy,
        * ``total_tokens`` is the number of tokens produced.
    """
    tokens = tokenizer.tokenize(text)
    foreign_chunk_types = ("LATIN", "CJK", "OTHER")

    non_words = 0
    non_bo_words = 0
    for tok in tokens:
        if tok.pos == 'NON_WORD' and not tok.skrt:
            non_words += 1

        kind = tok.chunk_type
        # An "OTHER" chunk flagged as skrt is not counted as foreign.
        if kind in foreign_chunk_types and not (kind == "OTHER" and tok.skrt):
            non_bo_words += 1

    return non_words, non_bo_words, len(tokens)

In [None]:
# One tokenizer instance is created here and reused for every row.
tokenizer = WordTokenizer()

# process_transcript returns a (non_word_count, non_bo_word_count, total_tokens)
# tuple per transcript; zip(*...) transposes those per-row tuples into three
# column-wise sequences, which are assigned to the three new columns.
df['non_word_count'], df['non_bo_word_count'], df['total_tokens'] = zip(*df['uni'].apply(lambda text: process_transcript(text, tokenizer)))

In [None]:
df.to_csv('04_combine_all.tsv', sep='\t', index=False)

In [None]:
df[ df['non_word_count'] > 1].shape[0], df[ df['non_bo_word_count'] > 1].shape[0]

In [None]:
df.shape

In [None]:
df['non_word_count'].describe()

In [None]:
df[ df['non_bo_word_count'] > 1 ].loc[:, ('file_name', 'uni')]

In [None]:
df['non_word_percentage'] = (df['non_word_count'] / df['total_tokens']) * 100
# df.fillna(0, inplace=True)  # Replace NaN values with 0 in case of division by zero

In [None]:
df.to_csv('04_combine_all.tsv', sep='\t', index=False)

In [None]:
df.head()

In [None]:
df.shape[0]

In [None]:
df[df['non_word_count'] > 1].shape[0] / df.shape[0] * 100

In [None]:
df_non_word = df[df['non_word_count'] > 1]

In [None]:
df_non_word.groupby('grade').size()

In [None]:
df_non_word.groupby('dept').size()

In [None]:
# Total audio_len per department for the rows with more than one non-word.
# Select the column before summing: `.sum('audio_len')` passed the column
# name positionally as `numeric_only`, which only worked because a non-empty
# string is truthy, and it summed every numeric column before discarding all
# but one.
df_non_word.groupby('dept')['audio_len'].sum()

In [None]:
df_non_word.groupby('dept').size() / df.groupby('dept').size() * 100

In [None]:
df.groupby('dept').size() / df.shape[0] * 100

In [None]:
df_non_word = df_non_word.sort_values(by='non_word_count', ascending=False)

In [None]:
df_non_word.to_csv('04_non_word_count.tsv', sep='\t', index=False)

In [None]:
# len(train), len(val), len(test), len(train)+len(val)+len(test), len(df)

In [None]:
# train.to_csv('train.tsv', sep='\t', index=False)
# val.to_csv(  'val.tsv', sep='\t', index=False)
# test.to_csv( 'test.tsv', sep='\t', index=False)

In [None]:
df['uni'].str.contains('ཧཧཧ').value_counts()

In [None]:
import pandas as pd

df = pd.read_csv('04_combine_all.tsv', sep='\t')

In [None]:
df.head()

In [None]:
df.groupby('dept')['audio_len'].sum()/60/60

In [None]:
df['audio_len'].sum()/60/60

In [None]:
# df.sort_values(by='audio_len', ascending=False, inplace=True)
df = df.sample(frac = 1)

In [None]:
# First 100 rows of the shuffled frame, restricted to the columns at
# positions 0, 1, 3 and 5; written to disk and previewed.
random_100 = df.iloc[0:100, [0, 1, 3, 5]]
random_100.to_csv("04_random_100.tsv", index=False, sep='\t')
random_100.head()

In [None]:
df[df['grade'] == 3].groupby('dept').size()

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('04_combine_all.tsv', sep='\t')

In [None]:
# Hours of audio per department (audio_len summed, then divided by 60*60).
# Select the column before summing: `.sum('audio_len')` passed the column
# name positionally as `numeric_only` and only worked by truthiness.
df.groupby('dept')['audio_len'].sum()/60/60

In [None]:
df['audio_len'].sum()/60/60