In [77]:
import pandas as pd
from pathlib import Path

In [78]:
csv_path = Path('../data/csv')
have_csv = set([e.stem for e in csv_path.iterdir() if e.is_file()])

In [79]:
from datetime import datetime
def sec_to_millis(sec):
    return sec * 1000

def time_to_milli_seconds(time_str):
    try:
        time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
        total_seconds = (time_obj.hour * 3600) + (time_obj.minute * 60) + time_obj.second + (time_obj.microsecond / 1e6)
        return str(round(sec_to_millis(total_seconds)))
    except ValueError:
        raise ValueError("Invalid time format. Use '00:00:44.368' format.")

# Example usage:
time_str = "00:00:44.368"
mills = time_to_milli_seconds(time_str)
print(mills)

44368


In [80]:
import pandas as pd
from pathlib import Path
dfs = []

def get_path(row, index_value, file):
    return f"{file}_{index_value:04}_{time_to_milli_seconds(row['from'])}_to_{time_to_milli_seconds(row['to'])}.wav"

for file in have_csv:
    df = pd.read_csv(f"../data/csv/{file}.csv")
    s = df.apply(lambda row: get_path(row, row.name, file), axis=1)
    df = pd.concat([df, s],ignore_index=True, axis=1)
    dfs.append(df)

df = pd.concat(dfs)

In [81]:
df = df[[3,4]].rename(columns={3: "uni", 4: "file_name"})

In [82]:
df.head()

Unnamed: 0,uni,file_name
0,མཱ་ལིས་ཡ། ཧྲའོ་ཤའོ་ཕེབས་སོང་།,STT_MV0176_0000_151117_to_154945.wav
1,མཱ་ལིས་ཡ་རང་ལོག་ཞིག,STT_MV0176_0001_154945_to_163864.wav
2,དྲ་བ་བཀྲམས་བྱས་ཤིན་ནུའུ་ཤིན་ལིང་ངོས་ལ་མདུན་སྐྱ...,STT_MV0176_0002_163864_to_173863.wav
3,རྒྱུག་ཤར་གློད།,STT_MV0176_0003_173863_to_182596.wav
4,ད་དུང་ཆུ་ཚོད་གཅིག་དང་ཕྱེ་ཀ་ནས་ང་ཚོའི་རོགས་སྐྱོ...,STT_MV0176_0004_193769_to_199101.wav


In [83]:
import re
def clean_transcription(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.strip()

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+།", "།", text)
    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\/\{\}\(\)\༽\》\༼\《\༅\༄\༈\༑\༠]'
    text = re.sub(chars_to_ignore_regex, '', text)+" "

    return text

def check_clean(text):
    regex = re.compile(r'[a-zA-Z0-9]+')
    match = re.search(regex, text)
    return not bool(match)

In [84]:
df['uni'] = df['uni'].map(clean_transcription)

In [85]:
len(df)

98115

In [86]:
df = df[df['uni'].apply(check_clean)]

In [87]:
len(df)

98109

In [88]:
df['file_name'].apply(lambda x: Path("/home/monlamai/Documents/GitHub/saymore-report-generator/data/segments/"+x).exists()).value_counts()

file_name
True    98109
Name: count, dtype: int64

In [89]:
# !pip install pyewts

In [90]:
import pyewts
converter = pyewts.pyewts()

In [91]:
df['wylie'] = df['uni'].apply(lambda x: converter.toWylie(x))

In [92]:
df['url'] = df['file_name'].apply(lambda x: f"https://d38pmlk0v88drf.cloudfront.net/{x}")
df['dept'] = df.file_name.apply(lambda x: x[:6])

In [93]:
df = df.sample(frac=1)

In [94]:
df = df[['file_name', 'uni', 'wylie', 'url', 'dept']]

In [95]:
df.head()

Unnamed: 0,file_name,uni,wylie,url,dept
328,STT_MV0209_0328_2248346_to_2257270.wav,འདི་ནས་ཚུར་ཕེབས་ད། ག་ལས་ཡིན་ད,'di nas tshur phebs da/_ga las yin da_,https://d38pmlk0v88drf.cloudfront.net/STT_MV02...,STT_MV
259,STT_MV0464_0259_2560391_to_2567061.wav,ཁྱེད་རང་གསན་དང་། བྱིའུ་གི་སྐད་སྒྲ་སྙན་པོ་འདི།,khyed rang gsan dang /_byi'u gi skad sgra snya...,https://d38pmlk0v88drf.cloudfront.net/STT_MV04...,STT_MV
65,STT_MV0040_0065_606318_to_613188.wav,ཨ་ལས་ཁྱེད་རང་སངས་རྒྱས་ཀྱི་སྤྲུལ་པ་ཡིན་པ་རེད། ཡ...,a las khyed rang sangs rgyas kyi sprul pa yin ...,https://d38pmlk0v88drf.cloudfront.net/STT_MV00...,STT_MV
151,STT_MV0139_0151_1195879_to_1205596.wav,བུས་ཧན་ཟན་ཞོ་ཞོ་ཞིག་བསྐུར་ཤག ངས་ནི་བཟོ་ཤེས་ཀི་...,bus han zan zho zho zhig bskur shag_ngas ni bz...,https://d38pmlk0v88drf.cloudfront.net/STT_MV01...,STT_MV
39,STT_MV0154_0039_423657_to_431294.wav,སྨན་འགྲུབ་གྲུབ་ཡོད། ཨམ་རྗེ་ལགས་ཀིས་གསུང་ཡ་ལ། ས...,sman 'grub grub yod/_am rje lags kis gsung ya ...,https://d38pmlk0v88drf.cloudfront.net/STT_MV01...,STT_MV


In [96]:
df.to_csv("stt_mv.tsv", sep='\t', index=False)

In [97]:
len(df)

98109

In [98]:
total = len(df)

train_len = int(total * 0.9)
val_len = int(total * 0.05)
test_len = total - train_len - val_len
print(train_len, val_len, test_len, total == train_len + val_len + test_len)

train_df = df[:train_len]
val_df = df[train_len:train_len + val_len]
test_df = df[train_len + val_len:]

print(len(train_df), len(val_df), len(test_df), len(train_df) + len(val_df) + len(test_df) == total)

88298 4905 4906 True
88298 4905 4906 True


In [99]:
train_df.to_csv('train.tsv', index=False, sep='\t')
test_df.to_csv('test.tsv', index=False, sep='\t')
val_df.to_csv('validation.tsv', index=False, sep='\t')