In [100]:
import pandas as pd
from pathlib import Path

In [101]:
csv_path = Path('../data/csv')
have_csv = set([e.stem for e in csv_path.iterdir() if e.is_file()])

In [102]:
from datetime import datetime
def sec_to_millis(sec):
    return sec * 1000

def time_to_milli_seconds(time_str):
    try:
        time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
        total_seconds = (time_obj.hour * 3600) + (time_obj.minute * 60) + time_obj.second + (time_obj.microsecond / 1e6)
        return str(round(sec_to_millis(total_seconds)))
    except ValueError:
        raise ValueError("Invalid time format. Use '00:00:44.368' format.")

# Example usage:
time_str = "00:00:44.368"
mills = time_to_milli_seconds(time_str)
print(mills)

44368


In [103]:
import pandas as pd
from pathlib import Path
dfs = []

def get_path(row, index_value, file):
    return f"{file}_{index_value:04}_{time_to_milli_seconds(row['from'])}_to_{time_to_milli_seconds(row['to'])}.wav"

for file in have_csv:
    df = pd.read_csv(f"../data/csv/{file}.csv")
    s = df.apply(lambda row: get_path(row, row.name, file), axis=1)
    df = pd.concat([df, s],ignore_index=True, axis=1)
    dfs.append(df)

df = pd.concat(dfs)

In [104]:
df = df[[3,4]].rename(columns={3: "uni", 4: "file_name"})

In [105]:
df.head()

Unnamed: 0,uni,file_name
0,མཱ་ལིས་ཡ། ཧྲའོ་ཤའོ་ཕེབས་སོང་།,STT_MV0176_0000_151117_to_154945.wav
1,མཱ་ལིས་ཡ་རང་ལོག་ཞིག,STT_MV0176_0001_154945_to_163864.wav
2,དྲ་བ་བཀྲམས་བྱས་ཤིན་ནུའུ་ཤིན་ལིང་ངོས་ལ་མདུན་སྐྱ...,STT_MV0176_0002_163864_to_173863.wav
3,རྒྱུག་ཤར་གློད།,STT_MV0176_0003_173863_to_182596.wav
4,ད་དུང་ཆུ་ཚོད་གཅིག་དང་ཕྱེ་ཀ་ནས་ང་ཚོའི་རོགས་སྐྱོ...,STT_MV0176_0004_193769_to_199101.wav


In [106]:
import re
def clean_transcription(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.strip()

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+།", "།", text)
    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\/\{\}\(\)\༽\》\༼\《\༅\༄\༈\༑\༠]'
    text = re.sub(chars_to_ignore_regex, '', text)+" "

    return text

def check_clean(text):
    regex = re.compile(r'[a-zA-Z0-9]+')
    match = re.search(regex, text)
    return not bool(match)

In [107]:
df['uni'] = df['uni'].map(clean_transcription)

In [108]:
len(df)

98115

In [109]:
df = df[df['uni'].apply(check_clean)]

In [110]:
len(df)

98109

In [111]:
df['file_name'].apply(lambda x: Path("/home/monlamai/Documents/GitHub/saymore-report-generator/data/segments/"+x).exists()).value_counts()

file_name
True    98109
Name: count, dtype: int64

In [112]:
# !pip install pyewts

In [113]:
import pyewts
converter = pyewts.pyewts()

In [114]:
df['wylie'] = df['uni'].apply(lambda x: converter.toWylie(x))

In [115]:
df['url'] = df['file_name'].apply(lambda x: f"https://d38pmlk0v88drf.cloudfront.net/mv_wav/{x}")
df['dept'] = df.file_name.apply(lambda x: x[:6])

In [116]:
df = df.sample(frac=1)

In [117]:
df = df[['file_name', 'uni', 'wylie', 'url', 'dept']]

In [118]:
df.head()

Unnamed: 0,file_name,uni,wylie,url,dept
257,STT_MV0044_0257_1663248_to_1673096.wav,བླ་སྨན་ཝང་ལགས། ཁྱེད་རང་གིས་ཞིས་ལའེ་ལི་མ་མཐོང་ག...,bla sman wang lags/_khyed rang gis zhis la'e l...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,STT_MV
388,STT_MV0178_0388_4086331_to_4087613.wav,རྫས་མདེལ་ཚང་མ་རྦད་དེ་རྫོགས་སོང་།,rdzas mdel tshang ma rbad de rdzogs song /_,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,STT_MV
321,STT_MV0200_0321_2365061_to_2374202.wav,ཨ་ཅག་ལ་སྤུན་དོག་གཟིགས་ཀེ། ཉོ་ཡི་མིན།་སྤུན་དོག་...,a cag la spun dog gzigs ke/_nyo yi min/ spun d...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,STT_MV
95,STT_MV0103_0095_653959_to_659897.wav,ལྟ་ཞིབ་པའི་མོས་མཐུན་མ་བྱས་ན་སུས་རང་གར་མཐོ་སྤར་...,lta zhib pa'i mos mthun ma byas na sus rang ga...,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,STT_MV
101,STT_MV0087_0101_979476_to_980796.wav,ངས་སྡེར་ས་རང་མིན།,ngas sder sa rang min/_,https://d38pmlk0v88drf.cloudfront.net/mv_wav/S...,STT_MV


In [119]:
df.to_csv("stt_mv.tsv", sep='\t', index=False)

In [120]:
len(df)

98109

In [121]:
total = len(df)

train_len = int(total * 0.9)
val_len = int(total * 0.05)
test_len = total - train_len - val_len
print(train_len, val_len, test_len, total == train_len + val_len + test_len)

train_df = df[:train_len]
val_df = df[train_len:train_len + val_len]
test_df = df[train_len + val_len:]

print(len(train_df), len(val_df), len(test_df), len(train_df) + len(val_df) + len(test_df) == total)

88298 4905 4906 True
88298 4905 4906 True


In [122]:
train_df.to_csv('train.tsv', index=False, sep='\t')
test_df.to_csv('test.tsv', index=False, sep='\t')
val_df.to_csv('validation.tsv', index=False, sep='\t')