In [None]:
import sys
sys.path.append('../util')

from util import read_spreadsheet
from util import collect_segments
from util import download_audio_gdrive
from util import split_audio_files
from util import get_time_span
from util import get_max_db_id


In [None]:
! pip install -U yt-dlp[default]                     # Install yt-dlp

In [None]:
df = read_spreadsheet(sheet_id = "1yKSzConuVWo8BuMDs2mabF5iiBKUz2wF--LIabFN6QE")

for index, row in df.iterrows():
    file_name = row['ID']
    gd_url = row['Audio link']
    Sr_no = row['z']
    
    # if not isinstance(gd_url, str) or not isinstance(id, str):
    #     continue
    if Sr_no >= 645 and Sr_no <= 691:
        gd_url = gd_url.split('?')[0]
        print(gd_url, file_name)
        # file_name = file_name + ".mp3"
        # download_audio_gdrive(gd_url, file_name)
        yt_downloaded = f"""yt-dlp --extract-audio --audio-quality 0 --audio-format wav --postprocessor-args "-ar 16000 -ac 1" {gd_url} -o './full_audio/{file_name}.%(ext)s'"""
        ! {yt_downloaded}



In [None]:
# Call the project helper split_audio_files with the arguments 'STT_AB' and 'wav'.
# NOTE(review): presumably this splits the downloaded recordings whose names start
# with 'STT_AB' into segments — confirm against util.split_audio_files.
split_audio_files('STT_AB', 'wav')

In [None]:
# Call the project helper collect_segments with 'STT_AB', 'after_split' and 'segments_ab'.
# NOTE(review): presumably gathers the split segments from 'after_split' into the
# 'segments_ab' directory used by the later cells — confirm against util.collect_segments.
collect_segments('STT_AB', 'after_split', 'segments_ab')

In [None]:
# Recursively copy the local segments_ab directory to the wav16k/ prefix of the
# monlam.ai.stt S3 bucket (requires the AWS CLI and credentials in the shell environment).
! aws s3 cp segments_ab  s3://monlam.ai.stt/wav16k/ --recursive

In [None]:
# Directory whose .wav segments are transcribed by the inference cell.
target_directory = "segments_ab"

In [None]:
from transformers import pipeline
from tqdm.auto import tqdm

from pathlib import Path

target_path = Path(target_directory)

# Hugging Face pipeline for the openpecha/wav2vec2_run8 checkpoint; each call
# returns a mapping whose "text" entry is the predicted transcript.
generator = pipeline(model="openpecha/wav2vec2_run8")

# List the directory exactly once: the progress-bar total and the loop then see
# the same files, and sorting makes the row order reproducible between runs.
wav_files = sorted(target_path.glob('*.wav'))

# One row per segment: [file stem, public URL, predicted transcript, time span].
rows = []
for file in tqdm(wav_files):
    inf = generator(str(file))["text"]
    rows.append([file.stem, f"https://d38pmlk0v88drf.cloudfront.net/wav16k/{file.name}", inf, get_time_span(file.name)])

In [None]:
import pandas as pd
# Column labels, in the same order as the fields appended to each entry of `rows`.
result_columns = ['file_name', 'url', 'inference_transcript', 'audio_duration']
df = pd.DataFrame(data=rows, columns=result_columns)

In [None]:
# Preview the first rows of the frame built from the inference results.
df.head()

In [None]:
# Show the transcript and URL of the first ten rows as a plain dict for a quick spot check.
preview_columns = ['inference_transcript', 'url']
df[preview_columns].head(10).to_dict()

In [None]:
# Write the inference results as a tab-separated file without the index column;
# transfer_text reads this file back later in the notebook.
df.to_csv("stt_ab_from_yt.tsv", sep="\t", index=False)

In [None]:
from pathlib import Path

import pandas as pd
from fast_antx.core import transfer
import numpy as np


def extract_tsv_text(tsvFile, ColumnNumber):
    """Flatten one text column of a dataframe into newline-separated segments.

    Spaces inside each cell are replaced with "_" so a cell is not broken apart
    at its spaces; the cells are then joined and split on whitespace, which
    drops empty cells and still splits a cell at any other whitespace (tab,
    newline) it contains.

    Args:
        tsvFile (DataFrame): dataframe holding the predicted transcripts
        ColumnNumber (integer/string): label of the column to extract

    Returns:
        string: the segments joined with "\\n"

    Raises:
        AttributeError: if a cell is not a string (for example a missing value),
            because ``.replace`` is called on every cell.
    """
    # protect in-cell spaces before the whitespace split below
    underscored = [cell.replace(" ", "_") for cell in tsvFile[ColumnNumber].tolist()]
    segments = " ".join(underscored).split()
    print("extracted text from tsv file..")
    return "\n".join(segments)


def get_original_text(OriginalText):
    """Load the original text file and strip curly double quotes from it.

    Args:
        OriginalText (string): location of the original text file (read as UTF-8)

    Returns:
        string: file content with every “ and ” removed
    """
    cleaned = Path(str(OriginalText)).read_text(encoding="utf-8")
    # drop the opening and closing curly quotes; everything else is kept as is
    for unwanted in ("“", "”"):
        cleaned = cleaned.replace(unwanted, "")
    print("extracted text from original file..")

    return cleaned


def transfer_text(OriginalText, PredictedTSV, file_name, ColumnNumber='inference_transcript'):
    """Transfer the segmentation of the predicted text onto the original text.

    The rows of the predicted TSV whose ``file_name`` starts with ``file_name``
    are selected and sorted by ``file_name``. Their transcripts are flattened to
    one segment per line, the newline segmentation is transferred onto the
    original text with ``fast_antx.core.transfer``, and the resulting segments
    replace the ``ColumnNumber`` column of the selected rows.

    Args:
        OriginalText (string): location of the original text file
        PredictedTSV (string): location of the predicted tsv file
        file_name (string): prefix identifying the recording whose segments are processed
        ColumnNumber (int/string): name of the column in which transcribed text is there in .tsv file

    Returns:
        tuple: ``(dataFrame, status)``. The dataFrame holds the selected rows with
        the transferred text. ``status`` is ``'Normal'`` when the number of
        transferred segments equals the number of rows, ``'Truncated N'`` when N
        surplus segments were dropped, and ``'Padded N'`` when N missing
        segments were filled with NaN.
    """
    predicted = pd.read_csv(f"{PredictedTSV}", sep="\t")
    # Prefix match on the recording id; rows with a missing file_name never match.
    belongs_to_file = predicted['file_name'].str.startswith(file_name, na=False)
    # sort_values returns a new frame, so the column assignment below does not
    # write into a slice of `predicted`.
    tsvFile = predicted[belongs_to_file].sort_values(by=['file_name'])

    source = extract_tsv_text(tsvFile, ColumnNumber)
    target = get_original_text(OriginalText)
    annotation = [["segment", "(\n)"]]
    transferredText = transfer(source, annotation, target).split("\n")

    # Work out the mismatch BEFORE resizing the list, otherwise the reported
    # count is always 0.
    expected = len(tsvFile)
    surplus = len(transferredText) - expected
    if surplus > 0:
        transferredText = transferredText[:expected]
        status = f'Truncated {surplus}'
    elif surplus < 0:
        transferredText = transferredText + [np.nan] * (-surplus)
        status = f'Padded {-surplus}'
    else:
        status = 'Normal'
    tsvFile[ColumnNumber] = transferredText

    # returns a dataFrame and the status string
    return tsvFile, status

In [None]:
# Run transfer_text for every recording STT_AB00645 .. STT_AB00691 and stack the
# per-recording frames into one dataframe.
temp = []
for file_name in [f"STT_AB00{x}" for x in range(645,692)]:
    transfer_text_df, status = transfer_text(f'etexts/{file_name}.txt', 'stt_ab_from_yt.tsv', file_name)
    temp.append(transfer_text_df)
    # Label the status so a 'Truncated'/'Padded' line can be traced to its recording.
    print(f"{file_name}: {status}")
df = pd.concat(temp)



In [None]:
# Preview the first rows of the concatenated per-recording frames.
df.head()

In [None]:
# Numeric ids for three groups; only group_ab_ga_id is used by the cells below.
# NOTE(review): presumably these are group ids in the target database — confirm.
group_ab_ga_id = 1
group_ab_gb_id = 2
group_ab_gc_id = 7

In [None]:
# Tag every row with the id held in group_ab_ga_id.
df['group_id'] = group_ab_ga_id

In [None]:
# Mark every row as 'transcribing' and turn missing values (such as the NaN
# padding added by transfer_text) into empty strings.
df = df.assign(state='transcribing').fillna('')
def filter_length(st, max_length=500):
    """Tell whether ``st`` is short enough to keep.

    Args:
        st: any sized value (here: a transcript string)
        max_length (int): exclusive upper bound on ``len(st)``; defaults to 500

    Returns:
        bool: True when ``len(st)`` is strictly less than ``max_length``
    """
    return len(st) < max_length

In [None]:
# Keep only the rows whose transcript is shorter than 500 characters, using the
# filter_length helper defined in the previous cell.
is_short_enough = df['inference_transcript'].apply(filter_length)
df = df[is_short_enough]

In [None]:
# Order the rows by file name and renumber the index from 0.
df = df.sort_values('file_name').reset_index(drop=True)

In [None]:
# Continue numbering right after the value returned by get_max_db_id():
# one consecutive id per row, in the current row order.
last_db_id = get_max_db_id()
first_new_id = last_db_id + 1

id_arr = list(range(first_new_id, first_new_id + df.shape[0]))

df['id'] = id_arr


In [None]:
# Preview the first rows after the 'id' column has been added.
df.head()

In [None]:
# Write the final frame to stt_ab_upload_new.csv (comma-separated, index column omitted).
df.to_csv('stt_ab_upload_new.csv', index=False)

In [None]:
# Show (row count, column count) of the exported frame as a final sanity check.
df.shape