In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory that holds the input CSV files.
csv_path = Path('../data/csv')
# Stems of the CSV files only: a later cell reopens every entry as "<stem>.csv",
# so a stray non-CSV file in this directory must not end up in the set.
have_csv = {e.stem for e in csv_path.iterdir() if e.is_file() and e.suffix == '.csv'}

In [3]:
from datetime import datetime
def sec_to_millis(sec):
    """Convert a duration expressed in seconds into milliseconds."""
    return 1000 * sec

def time_to_milli_seconds(time_str):
    """Convert an 'HH:MM:SS.fff' timestamp into whole milliseconds.

    Args:
        time_str: Timestamp text matching the format '%H:%M:%S.%f',
            e.g. '00:00:44.368'.

    Returns:
        The number of milliseconds, rounded with round(), as a string.

    Raises:
        ValueError: If time_str does not match the expected format.
    """
    try:
        parsed = datetime.strptime(time_str, '%H:%M:%S.%f')
    except ValueError:
        raise ValueError("Invalid time format. Use '00:00:44.368' format.")

    # Sum the integer parts first, then add the fractional seconds.
    whole_seconds = (parsed.hour * 3600) + (parsed.minute * 60) + parsed.second
    total_seconds = whole_seconds + (parsed.microsecond / 1e6)
    return str(round(sec_to_millis(total_seconds)))

# Example usage. The assertion makes this a regression check as well: the
# segment file names built further down embed this millisecond value.
time_str = "00:00:44.368"
mills = time_to_milli_seconds(time_str)
assert mills == "44368", mills
print(mills)

44368


In [4]:
import pandas as pd
from pathlib import Path
# One DataFrame per CSV file is appended here by the loop below, then all are concatenated.
dfs = []

def get_path(row, index_value, file):
    """Build the .wav file name for one row.

    Args:
        row: Row providing the 'from' and 'to' timestamps.
        index_value: Row index, zero-padded to four digits in the name.
        file: Stem of the source file, used as the name prefix.

    Returns:
        A name of the form '<file>_<index>_<from ms>_to_<to ms>.wav'.
    """
    start_ms = time_to_milli_seconds(row['from'])
    end_ms = time_to_milli_seconds(row['to'])
    return f"{file}_{index_value:04}_{start_ms}_to_{end_ms}.wav"

# Load every CSV and append the derived segment file name as a trailing column.
# sorted() keeps the row order independent of set iteration order, which can
# differ from one kernel session to the next.
for file in sorted(have_csv):
    # Reuse csv_path rather than spelling the directory out a second time.
    df = pd.read_csv(csv_path / f"{file}.csv")
    s = df.apply(lambda row: get_path(row, row.name, file), axis=1)
    # NOTE: with axis=1, ignore_index=True relabels the *columns* as 0..n-1.
    # The next cell selects columns 3 and 4 by those integer labels, so keep it.
    df = pd.concat([df, s], ignore_index=True, axis=1)
    dfs.append(df)

# ignore_index=True gives the combined frame unique row labels instead of
# repeating each file's own 0..k index.
df = pd.concat(dfs, ignore_index=True)

In [5]:
# Keep only integer-labelled columns 3 and 4 and give them readable names.
column_names = {3: "sentence", 4: "path"}
df = df[list(column_names)].rename(columns=column_names)

In [6]:
# Preview the first rows of the sentence/path frame.
df.head()

Unnamed: 0,sentence,path
0,སྦྱིན་བདག་ཇིན་ལགས་ད་རེས་ཡིན་ན་ཞེ་བོ་ཞིག་བཏུངས་...,STT_MV0011_0000_0_to_7575.wav
1,ཡིན་ནས་ཁྱེད་རང་གཉིས་གིས་འགོ་པ་ལེན་ལགས་ལ་རྗེས་མ...,STT_MV0011_0001_7575_to_16415.wav
2,མདའ་དཔོན་ཀྲང་ལགས་ག་ལེར་ཕེབས་གོ བཞུགས་གོ་,STT_MV0011_0002_16415_to_22496.wav
3,སྦྱིན་བདག་རྒྱུག་ཁྱི་དེ་གཉིས་རྫི་རྣོ་སེ་བྱུང་ང་...,STT_MV0011_0003_22496_to_29494.wav
4,འདུག་སེ་ལྟ་དུས་ས་ཁྲ་འདི་གནད་འགག་ཏོག་ཙི་འདུག་,STT_MV0011_0004_29494_to_33804.wav


In [7]:
import re
# Punctuation and symbols removed from transcripts. Written as a raw string so
# the backslash reaches the regex engine untouched; inside a character class
# only the hyphen needs escaping.
_CHARS_TO_IGNORE_RE = re.compile(r'[,?.!\-;:"“%‘”�/{}()༽》༼《༅༄༈༑༠]')


def clean_transcription(text):
    """Normalise one transcript sentence.

    Steps, in order:
      1. Replace newlines with spaces.
      2. Delete the ignored punctuation/symbol characters.
      3. Collapse runs of tsheg (་) and of shad (།) to a single character.
      4. Collapse whitespace runs to one space and drop whitespace before a shad.
      5. Strip the ends and append exactly one trailing space.

    Deletion happens before the collapsing steps: removing a character can
    leave two spaces or two tshegs side by side, and those must still be
    collapsed.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned string, always ending in a single space.
    """
    text = text.replace('\n', ' ')
    text = _CHARS_TO_IGNORE_RE.sub('', text)

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+།", "།", text)

    # Strip last so whitespace exposed by the deletions is removed too; the
    # single trailing space is kept as part of the output format.
    return text.strip() + " "

def check_clean(text):
    """Return True when `text` contains no ASCII letters or digits, else False."""
    has_ascii_alnum = re.search(r'[a-zA-Z0-9]+', text) is not None
    return not has_ascii_alnum

In [8]:
# Normalise every transcript sentence.
df['sentence'] = df['sentence'].apply(clean_transcription)

In [9]:
# Row count before the ASCII letter/digit filter is applied.
len(df)

98115

In [10]:
# Keep only sentences without ASCII letters or digits. The explicit copy makes
# the result an independent frame, so the later column assignment on it does
# not raise SettingWithCopyWarning.
df = df[df['sentence'].apply(check_clean)].copy()

In [11]:
# Row count after dropping sentences that contain ASCII letters or digits.
len(df)

98109

In [13]:
# Count how many rows point at a segment file that exists on disk.
segments_root = "/home/monlamai/Documents/GitHub/saymore-report-generator/data/segments/"
df['path'].map(lambda name: Path(segments_root + name).exists()).value_counts()

path
True    98109
Name: count, dtype: int64

In [41]:
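# Dependency: pyewts (version 0.2.0 was installed here). To install it into the kernel's environment, run: %pip install pyewts==0.2.0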

Collecting pyewts
  Using cached pyewts-0.2.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pyewts
  Building wheel for pyewts (setup.py) ... [?25ldone
[?25h  Created wheel for pyewts: filename=pyewts-0.2.0-py3-none-any.whl size=16846 sha256=17d4fd8cdf07c6eaed1907b9304cfdd15dcd9a8115a1c4838d29b615502d3a5e
  Stored in directory: /home/monlamai/.cache/pip/wheels/88/60/74/963829fa17f8c05336ee8e674ef0c1e32ebe8aa6f7c20dd960
Successfully built pyewts
Installing collected packages: pyewts
Successfully installed pyewts-0.2.0


In [14]:
import pyewts
# pyewts converter instance; its toWylie method is applied to every sentence below.
converter = pyewts.pyewts()

In [15]:
# Run each cleaned sentence through the converter's toWylie method.
df['sentence'] = df['sentence'].map(converter.toWylie)

In [23]:
# Shuffle all rows. The fixed seed makes the shuffle, and therefore the
# train/valid/test split derived from it, reproducible across runs.
df = df.sample(frac=1, random_state=42)

In [24]:
# Write the whole shuffled frame as a tab-separated file, without the index column.
df.to_csv("stt_mv-wylie.tsv", index=False, sep='\t')

In [25]:
# Total number of rows available for the train/validation/test split.
len(df)

98109

In [26]:
# Positional 90 % / 5 % / remainder split of the frame into train, validation and test.
total = len(df)

train_len = int(total * 0.9)
val_len = int(total * 0.05)
test_len = total - train_len - val_len  # the test split absorbs the rounding remainder
print(train_len, val_len, test_len, total == train_len + val_len + test_len)

val_end = train_len + val_len
train_df = df.iloc[:train_len]
val_df = df.iloc[train_len:val_end]
test_df = df.iloc[val_end:]

# Same numbers again, this time measured on the actual slices.
split_sizes = [len(train_df), len(val_df), len(test_df)]
print(*split_sizes, sum(split_sizes) == total)

88298 4905 4906 True
88298 4905 4906 True


In [29]:
# Write each split to its own tab-separated file, without the index column.
split_files = {
    'train-wylie.tsv': train_df,
    'test-wylie.tsv': test_df,
    'valid-wylie.tsv': val_df,
}
for out_name, split_df in split_files.items():
    split_df.to_csv(out_name, index=False, sep='\t')