In [67]:
import pandas as pd
from pathlib import Path

In [68]:
# Directory holding the per-recording transcript CSVs, relative to the notebook.
csv_path = Path('../data/csv')
# Stems (file names without extension) of the CSV files available for processing.
# Only real `.csv` files are kept: the loading loop rebuilds each name as
# "<stem>.csv", so a stray non-CSV file in the directory (e.g. `.DS_Store`)
# would otherwise surface there as a FileNotFoundError.
have_csv = {e.stem for e in csv_path.iterdir() if e.is_file() and e.suffix == '.csv'}

In [69]:
from datetime import datetime
def sec_to_millis(sec):
    """Convert a duration expressed in seconds to milliseconds.

    Args:
        sec: Duration in seconds.

    Returns:
        ``sec`` multiplied by 1000.
    """
    milliseconds = sec * 1000
    return milliseconds

def time_to_milli_seconds(time_str):
    """Convert an ``HH:MM:SS.fff`` timestamp into whole milliseconds.

    Args:
        time_str: Timestamp such as ``'00:00:44.368'``; parsed with the
            ``'%H:%M:%S.%f'`` format.

    Returns:
        The offset from ``00:00:00`` in milliseconds, rounded to the nearest
        millisecond and returned as a string (e.g. ``'44368'``).

    Raises:
        ValueError: If ``time_str`` is not a string in the expected format.
            The offending value is included in the message and the original
            parsing error is chained as ``__cause__``.
    """
    try:
        time_obj = datetime.strptime(time_str, '%H:%M:%S.%f')
    except (TypeError, ValueError) as err:
        # strptime raises TypeError for non-strings (for example the NaN that
        # pandas yields for an empty cell). Report both failure modes the same
        # way and say which value was rejected.
        raise ValueError(
            f"Invalid time format. Use '00:00:44.368' format. Got: {time_str!r}"
        ) from err

    # Integer arithmetic for the whole seconds avoids pushing the total
    # through a float before rounding.
    whole_seconds = (time_obj.hour * 3600) + (time_obj.minute * 60) + time_obj.second
    return str(whole_seconds * 1000 + round(time_obj.microsecond / 1000))

# Example usage: 44.368 seconds past 00:00:00 is expected to print 44368.
time_str = "00:00:44.368"
mills = time_to_milli_seconds(time_str)
print(mills)

44368


In [70]:
import pandas as pd
from pathlib import Path
# One DataFrame per transcript CSV is appended here by the loop in this cell.
dfs = []

def get_path(row, index_value, file):
    """Build the segment file name for one transcript row.

    Args:
        row: Row exposing ``'from'`` and ``'to'`` timestamps in the format
            accepted by ``time_to_milli_seconds``.
        index_value: Row number; rendered zero-padded to at least four digits.
        file: Stem of the source CSV, used as the file-name prefix.

    Returns:
        A name of the form ``"<file>_<index>_<from_ms>_to_<to_ms>.wav"``.
    """
    start_ms = time_to_milli_seconds(row['from'])
    end_ms = time_to_milli_seconds(row['to'])
    return f"{file}_{index_value:04}_{start_ms}_to_{end_ms}.wav"

# Load every transcript CSV and append the derived segment file name as an
# extra column. `sorted` pins the processing order: `have_csv` is a set of
# strings, whose iteration order can differ between kernel sessions, which
# would otherwise reshuffle the rows of the exported files from run to run.
for file in sorted(have_csv):
    # Reuse `csv_path` instead of repeating the directory as a string literal.
    df = pd.read_csv(csv_path / f"{file}.csv")
    s = df.apply(lambda row: get_path(row, row.name, file), axis=1)
    # ignore_index=True with axis=1 replaces the column labels by 0..n-1;
    # the next cell picks the text and path columns by those integer labels.
    df = pd.concat([df, s], ignore_index=True, axis=1)
    dfs.append(df)

# Row labels restart at 0 in every per-file frame, so give the combined frame
# a fresh, unique index instead of carrying the duplicates along.
df = pd.concat(dfs, ignore_index=True)

In [71]:
# Keep only columns 3 and 4 and label them "sentence" and "path".
df = df.loc[:, [3, 4]].rename(columns={3: "sentence", 4: "path"})

In [72]:
# Preview the first rows to check the sentence/path pairing.
df.head()

Unnamed: 0,sentence,path
0,ང་ལོ་གཅིག་ལ་བསླེབས་དུས་རྨོ་ལགས་ཀིས་རྒལ་པ་བསྣམས...,STT_MV0178_0000_22205_to_31969.wav
1,ཨ་,STT_MV0178_0001_31969_to_41873.wav
2,མཚོ་རྫས་ཕུལ།,STT_MV0178_0002_58881_to_64039.wav
3,ཀླུའི་རྒྱལ་པོ་མཆོག,STT_MV0178_0003_89009_to_95813.wav
4,ཐན་སྐྱོན་ཚབས་ཆེན་བྱུང་ནས་འབངས་མི་སེར་ཚོ་ཤི་ཁར་...,STT_MV0178_0004_95813_to_100310.wav


In [73]:
import re
def clean_transcription(text):
    """Normalise one transcription string.

    Steps, in order:
      1. Delete every character listed in ``chars_to_ignore_regex``.
      2. Collapse runs of tsheg (``་``) and runs of shad (``།``) to a single
         character each.
      3. Collapse every whitespace run (newlines included) to one space and
         drop whitespace that directly precedes a shad.
      4. Strip both ends and append exactly one trailing space.

    The ignored characters are removed first so that deleting them cannot
    leave doubled spaces, leading/trailing whitespace, or repeated tsheg/shad
    behind in the result.

    Args:
        text: Raw transcription string.

    Returns:
        The cleaned text followed by a single space (``" "`` for empty input).
    """
    # Raw string, so the character class needs no string-level escapes; only
    # '-' is escaped because it is otherwise a range operator inside [...].
    # NOTE(review): the last character looks like the Tibetan digit zero only,
    # not the other Tibetan digits — confirm that this is intended.
    chars_to_ignore_regex = r'[,?.!\-;:"“%‘”�/{}()༽》༼《༅༄༈༑༠]'
    text = re.sub(chars_to_ignore_regex, '', text)

    text = re.sub(r"་+", "་", text)
    text = re.sub(r"།+", "།", text)
    # \s+ also matches '\n', so no separate newline replacement is needed.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+།", "།", text)

    return text.strip() + " "

def check_clean(text):
    """Tell whether ``text`` is free of ASCII letters and digits.

    Args:
        text: String to inspect.

    Returns:
        ``True`` when no character in ``a-z``, ``A-Z`` or ``0-9`` occurs in
        ``text``; ``False`` as soon as one is found.
    """
    return re.search(r'[a-zA-Z0-9]+', text) is None

In [74]:
# Run clean_transcription over every entry of the sentence column.
df['sentence'] = df['sentence'].apply(clean_transcription)

In [75]:
# Row count before the check_clean filter is applied.
len(df)

97793

In [76]:
# Keep only the rows whose sentence passes check_clean (no ASCII letters or digits).
# .copy() makes the filtered frame an independent object, so the later
# assignment to df['sentence'] cannot be flagged as a write to a slice of
# the unfiltered frame (SettingWithCopyWarning).
df = df[df['sentence'].apply(check_clean)].copy()

In [77]:
# Row count after the check_clean filter; the drop from the earlier count is the number of rejected rows.
len(df)

97787

In [78]:
# Write the sentence/path pairs as a tab-separated file, without the index column.
df.to_csv('stt_mv-uni.tsv', sep='\t', index=False)

In [79]:
# Count how many of the listed segment files exist on disk.
# NOTE(review): the segment directory is an absolute, machine-specific path —
# confirm or adjust it before running on another machine.
(
    df['path']
    .map(lambda name: Path("/home/monlamai/Documents/GitHub/saymore-report-generator/data/segments/" + name).exists())
    .value_counts()
)

path
True    97787
Name: count, dtype: int64

In [41]:
!pip install pyewts

Collecting pyewts
  Using cached pyewts-0.2.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pyewts
  Building wheel for pyewts (setup.py) ... [?25ldone
[?25h  Created wheel for pyewts: filename=pyewts-0.2.0-py3-none-any.whl size=16846 sha256=17d4fd8cdf07c6eaed1907b9304cfdd15dcd9a8115a1c4838d29b615502d3a5e
  Stored in directory: /home/monlamai/.cache/pip/wheels/88/60/74/963829fa17f8c05336ee8e674ef0c1e32ebe8aa6f7c20dd960
Successfully built pyewts
Installing collected packages: pyewts
Successfully installed pyewts-0.2.0


In [80]:
import pyewts
# pyewts converter instance; its toWylie() method is applied to every sentence in the next cell.
converter = pyewts.pyewts()

In [81]:
# Replace each sentence with the result of the converter's toWylie().
df['sentence'] = df['sentence'].map(converter.toWylie)

In [82]:
# Write the converted sentence/path pairs as a second tab-separated file, without the index column.
df.to_csv("stt_mv-wylie.tsv", index=False, sep='\t')