In [None]:
import sys
sys.path.append('../util')

from util import read_spreadsheet
from util import get_max_db_id
from util import collect_segments
from util import download_audio_gdrive

In [5]:
! pip install python-docx

Collecting python-docx
  Using cached python_docx-1.1.0-py3-none-any.whl (239 kB)
Collecting lxml>=3.1.0
  Downloading lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: lxml, python-docx
Successfully installed lxml-5.1.0 python-docx-1.1.0


In [30]:
import os
import gdown
from docx import Document

def docx_to_txt(docx_path):
    """Return the paragraph texts of a .docx file, one paragraph per line.

    Args:
        docx_path: location of the .docx file; handed to ``Document`` unchanged

    Returns:
        string: the ``text`` of every entry in ``doc.paragraphs`` joined with newlines
    """
    paragraphs = Document(docx_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def download_etext(gd_url,file_name):
    """Download a Google Doc as .docx and store its text as a single-line .txt file.

    The .docx is saved to ``docx/<file_name>.docx`` and the extracted text to
    ``etexts/<file_name>.txt`` (newlines replaced by spaces). Both directories
    are created when missing. If the .txt file already exists nothing is
    downloaded.

    Args:
        gd_url (string): link to the Google Doc; its last path component
            (e.g. ``edit``) is replaced by ``export?format=docx``
        file_name (string): base name (without extension) for the saved files

    Raises:
        TypeError: if ``gd_url`` is not a string (e.g. an empty spreadsheet cell)
        RuntimeError: if the download did not produce a file
    """
    os.makedirs('etexts', exist_ok=True)
    os.makedirs('docx', exist_ok=True)

    # One path for both the cache check and the write, so they cannot drift apart.
    txt_path = f'etexts/{file_name}.txt'
    if os.path.exists(txt_path):
        return

    if not isinstance(gd_url, str):
        raise TypeError(f'gd_url for {file_name} must be a string, got {type(gd_url).__name__}')

    # URLs always use "/" as separator, so split the string directly instead of
    # going through os.path, whose separator depends on the platform.
    base_url = gd_url.rsplit('/', 1)[0]
    docx_url = f'{base_url}/export?format=docx'
    docx_path = gdown.download(docx_url, output=f'docx/{file_name}.docx', quiet=False, fuzzy=True)
    if docx_path is None:
        # Without this guard the None would be passed on to Document(), which
        # then opens python-docx's built-in empty template; an empty .txt would
        # be written and treated as "already downloaded" on every later run.
        raise RuntimeError(f'download failed for {file_name}: {docx_url}')

    # Convert the .docx file to text
    text = docx_to_txt(docx_path)
    # Save the text to a .txt file as one line
    with open(txt_path, 'w',encoding='utf-8') as f:
        f.write(text.replace('\n', ' '))

In [31]:
# Inclusive window of serial numbers (spreadsheet column 'z') to download.
FIRST_SR_NO = 595
LAST_SR_NO = 644

df = read_spreadsheet(sheet_id="1yKSzConuVWo8BuMDs2mabF5iiBKUz2wF--LIabFN6QE")

for index, row in df.iterrows():
    file_name = row['ID']
    gd_url = row['Etext link']
    Sr_no = row['z']
    # A missing serial number compares False on both sides and is skipped.
    if not (FIRST_SR_NO <= Sr_no <= LAST_SR_NO):
        continue
    # Rows without an ID or link (non-string cells) cannot be downloaded;
    # report them and carry on instead of aborting the whole batch.
    if not isinstance(gd_url, str) or not isinstance(file_name, str):
        print(f'Skipping row {index}: missing ID or Etext link')
        continue
    print(file_name, gd_url, Sr_no)
    download_etext(gd_url, file_name)


STT_AB00595 https://docs.google.com/document/d/1TqSCgaVpB2K8hi7srpOKS6M4O5uLiFH0N2KXfguZBhA/edit 595.0


Downloading...
From: https://docs.google.com/document/d/1TqSCgaVpB2K8hi7srpOKS6M4O5uLiFH0N2KXfguZBhA/export?format=docx
To: /media/monlamai/SSD/GitHub/split/STT_AB/docx/STT_AB00595.docx
9.22kB [00:00, 246kB/s]


In [81]:
from pathlib import Path

import pandas as pd
from fast_antx.core import transfer
import numpy as np


def extract_tsv_text(tsvFile, ColumnNumber):
    """Builds the newline-separated source text from one column of the tsv dataframe

    Args:
        tsvFile (Dataframe): dataframe of predicted tsv file
        ColumnNumber (integer/string): column holding the text to be extracted

    Returns:
        string: one line per non-empty cell, with the spaces inside a cell
            replaced by "_"
    """
    # Spaces inside a segment become "_" so that the whitespace split below
    # only cuts between segments, never inside one.
    # NOTE(review): every cell must be a string; other values have no .replace
    segments = [segment.replace(" ", "_") for segment in tsvFile[ColumnNumber].tolist()]
    joined = " ".join(segments)
    predictedText = "\n".join(joined.split())
    print("extracted text from tsv file..")
    return predictedText


def get_original_text(OriginalText):
    """Loads the original text file and strips the curly double quotes from it

    Args:
        OriginalText (string): location of the original text file (read as UTF-8)

    Returns:
        string: file content with every “ and ” removed
    """
    raw_text = Path(f"{OriginalText}").read_text(encoding="utf-8")
    # mapping a character to None in translate() deletes it
    cleaned = raw_text.translate({ord("“"): None, ord("”"): None})
    print("extracted text from original file..")

    return cleaned


def transfer_text(OriginalText, PredictedTSV, file_name, ColumnNumber='inference_transcript'):
    """transfers the segmentation of the predicted text onto the original text

    The rows of the tsv whose 'file_name' starts with ``file_name`` are sorted
    by 'file_name', their text is turned into a newline-separated source, the
    newline annotation is transferred onto the original text, and the
    resulting segments replace the ``ColumnNumber`` column row by row.

    Args:
        OriginalText (string): location of the original text file
        PredictedTSV (string): location of the predicted tsv file
        file_name (string): prefix of the 'file_name' values to keep
        ColumnNumber (int/string): name of the column that holds the
            transcribed text in the .tsv file

    Returns:
        tuple: (dataframe with the transferred text in ``ColumnNumber``, status)
            where status is 'Normal', 'Truncated <n>' (n surplus segments were
            dropped) or 'Padded <n>' (n missing segments were filled with NaN)
    """
    tsvFile = pd.read_csv(f"{PredictedTSV}", sep="\t")
    # Compare on the prefix's real length rather than a hard-coded 11 characters.
    belongs_to_file = tsvFile['file_name'].str[:len(file_name)] == file_name
    # Work on an independent copy so the sort and the column assignment below
    # never write into a slice of the full frame.
    tsvFile = tsvFile[belongs_to_file].copy()
    tsvFile = tsvFile.sort_values(by=['file_name'])

    source = extract_tsv_text(tsvFile, ColumnNumber)
    target = get_original_text(OriginalText)
    annotation = [["segment", "(\n)"]]
    transferedText = transfer(source, annotation, target).split("\n")

    # Measure the mismatch BEFORE resizing the list; measuring afterwards
    # always yields 0 and hides how many segments were dropped or added.
    surplus = len(transferedText) - len(tsvFile)
    if surplus > 0:
        transferedText = transferedText[:len(tsvFile)]
        status = f'Truncated {surplus}'
    elif surplus < 0:
        transferedText = transferedText + [np.nan] * (-surplus)
        status = f'Padded {-surplus}'
    else:
        status = 'Normal'
    tsvFile[ColumnNumber] = transferedText

    # returns a dataframe and the status string
    return tsvFile, status

In [82]:
# Align each recording STT_AB00595 .. STT_AB00644 with its etext and stack the results.
temp = []
for serial in range(595, 645):
    file_name = f"STT_AB00{serial}"
    transfer_text_df, status = transfer_text(
        f'etexts/{file_name}.txt',
        'stt_ab_from_gdrive.tsv',
        file_name,
    )
    temp.append(transfer_text_df)
    print(status)
df = pd.concat(temp)


extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extracted text from original file..
Normal
extracted text from tsv file..
extract

In [83]:
# Preview the first rows of the concatenated per-recording frames.
df.head()

Unnamed: 0,file_name,url,inference_transcript,audio_duration
7537,STT_AB00595_0001_21936_to_27346,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,འགྲུལ་པ་དང་ལམ། ནང་མི་ཞིག་སྒེར་གྱི་མོ་ཊ་ཁྲིད་ནས...,5.41
767,STT_AB00595_0002_28250_to_37056,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,རྒྱབ་ཏུ་ཨ་མ་བཟང་མོ། ཕྲུ་གུ་འབྲིང་བ་བུ་དོན་གྲུ...,8.806
3194,STT_AB00595_0003_38523_to_41988,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,ཕྲུ་གུ་ཚོའི་དབར་ལ་ལོ་གསུམ་རེའི་་ཁྱད་པར་ཡོད་པ་...,3.465
8450,STT_AB00595_0004_59854_to_64411,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,་ལོ་ན་བཞི་བཅུ་ཡིན་པའི་ཕ་ཉ,4.557
869,STT_AB00595_0005_77875_to_87295,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,ི་མ་མོ་ཊ་གཏོང་རྒྱུར་དབྱིངས་འཕར་ཏེ་མགྱོགས་པོ་གཏ...,9.42


In [85]:
# Write the frame to 'test.tsv' as tab-separated text, without the index column.
df.to_csv('test.tsv', sep='\t', index=False)

In [86]:
# Candidate values for the 'group_id' column; only group_ab_ga_id is used in the cells shown here.
# NOTE(review): presumably ids of groups in the target database — confirm.
group_ab_ga_id = 1
group_ab_gb_id = 2
group_ab_gc_id = 7

In [87]:
# Every row gets the same group id.
df['group_id'] = group_ab_ga_id

In [88]:
# Every row starts in the 'imported' state.
df['state'] = 'imported'

In [89]:
# Replace every missing value (e.g. the NaN padding added by transfer_text) with an empty string.
df = df.fillna('')

In [90]:
def filter_length(st, max_len=500):
    """Tell whether a text is short enough to keep.

    Args:
        st (string): text to check (anything that supports ``len``)
        max_len (int): exclusive upper bound on the length; defaults to 500

    Returns:
        bool: True when ``len(st)`` is strictly below ``max_len``
    """
    return len(st) < max_len

In [91]:
# Keep only the rows whose transcript is shorter than 500 characters.
is_short_enough = df['inference_transcript'].apply(filter_length)
df = df[is_short_enough]

In [93]:
# Number the rows consecutively, starting right after the highest id reported by get_max_db_id().
last_db_id = get_max_db_id()

id_arr = [last_db_id + offset for offset in range(1, df.shape[0] + 1)]

df['id'] = id_arr

The maximum ID in the 'Task' table is: 420928


In [94]:
# Preview the frame after the group_id, state and id columns were added.
df.head()

Unnamed: 0,file_name,url,inference_transcript,audio_duration,group_id,state,id
7537,STT_AB00595_0001_21936_to_27346,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,འགྲུལ་པ་དང་ལམ། ནང་མི་ཞིག་སྒེར་གྱི་མོ་ཊ་ཁྲིད་ནས...,5.41,1,imported,420929
767,STT_AB00595_0002_28250_to_37056,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,རྒྱབ་ཏུ་ཨ་མ་བཟང་མོ། ཕྲུ་གུ་འབྲིང་བ་བུ་དོན་གྲུ...,8.806,1,imported,420930
3194,STT_AB00595_0003_38523_to_41988,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,ཕྲུ་གུ་ཚོའི་དབར་ལ་ལོ་གསུམ་རེའི་་ཁྱད་པར་ཡོད་པ་...,3.465,1,imported,420931
8450,STT_AB00595_0004_59854_to_64411,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,་ལོ་ན་བཞི་བཅུ་ཡིན་པའི་ཕ་ཉ,4.557,1,imported,420932
869,STT_AB00595_0005_77875_to_87295,https://d38pmlk0v88drf.cloudfront.net/stt_pech...,ི་མ་མོ་ཊ་གཏོང་རྒྱུར་དབྱིངས་འཕར་ཏེ་མགྱོགས་པོ་གཏ...,9.42,1,imported,420933


In [103]:
# Leave two specific segments of STT_AB00609 out of the upload.
excluded_segments = [
    'STT_AB00609_0001_1134_to_7278',
    'STT_AB00609_0002_8643_to_11203',
]
df = df[~df['file_name'].isin(excluded_segments)]

In [104]:
# Write the frame to 'stt_ab_upload.csv' (comma-separated, without the index column).
df.to_csv('stt_ab_upload.csv', index=False)

In [107]:
# File whose lines are compared against 'file_name' further down.
# NOTE(review): absolute path into one user's home directory — the notebook only runs on that machine.
have = '/home/monlamai/_Task__202401231441.csv'

In [108]:
# Read the file line by line; the context manager closes the handle, which the bare open() never did.
with open(have) as task_export:
    have_files = task_export.read().split('\n')

In [112]:
# Drop every row whose file_name appears among the lines read into have_files.
df = df[~df['file_name'].isin(have_files)]

In [113]:
# (rows, columns) left after all filters.
df.shape

(9354, 7)

In [110]:
# Drop the first line that was read (NOTE(review): presumably the CSV header — confirm).
# This cell was executed as In [110], i.e. before the isin filter (In [112]), although it sits last
# in the notebook; each re-run removes one more entry.
have_files = have_files[1:]