# Data from prodigy instance

The AWS EC2 instance where we used to run Prodigy is at prodigy.bdrc.io.
The instance has all the data from before we moved to stt.pecha.tools.
Recently, only the Quality Control team has worked on the Prodigy instance, to review all tasks on Prodigy.

To load data from Prodigy, log into the server:
> ssh spsither@prodigy.bdrc.io

Run the following script to export all the data from prodigy to jsonl files
>/home/spsither/staging/export_all.sh

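Create an xz-compressed tar archive with all the jsonl files (run this from /home/spsither, so that the archive is created where the scp command below expects it):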
>tar -cJvf staging.tar staging/*.jsonl

In [None]:
# Copy the archive created on the server (see the tar command above) into the local data/ directory.
# NOTE(review): assumes data/ already exists locally and that SSH access for this user is set up — confirm.
! scp spsither@prodigy.bdrc.io:/home/spsither/staging.tar data/

In [None]:
# Unpack the archive into data/. The archive was created from staging/*.jsonl,
# so the files end up under data/staging/.
! tar -xf data/staging.tar -C data/

In [None]:
import pandas as pd
def read_jsonl(filename):
    """Load a JSON Lines file into a DataFrame with a fresh 0..n-1 index.

    Args:
        filename: Path of the ``.jsonl`` file (one JSON object per line).

    Returns:
        A pandas DataFrame with one row per line of the file.
    """
    records = pd.read_json(filename, lines=True)
    return records.reset_index(drop=True)

In [None]:
import os

jsonl_directory = "data/staging/"

# os.listdir returns entries in arbitrary order; sort them so that the order in
# which the files are later concatenated (and therefore which duplicate row
# drop_duplicates(keep="first") keeps) is the same on every run.
files = sorted(os.listdir(jsonl_directory))

# Match the real extension (with the dot) so that names that merely end in the
# letters "jsonl" are not picked up.
pattern = ".jsonl"
files_matching_pattern = [file for file in files if file.endswith(pattern)]

In [None]:
# First-round review exports: file names that mention "review", excluding the
# second-review export, which is loaded on its own further down.
reviewed_jsonl_files = [
    name
    for name in files_matching_pattern
    if 'review' in name and 'stt_second_review' not in name
]

In [None]:
# Un-reviewed (transcription-only) exports: every file whose name does not
# mention "review". That test alone already rules out 'stt_second_review',
# because that name contains "review".
transcribed_jsonl_files = [
    name for name in files_matching_pattern if 'review' not in name
]

In [None]:
! pip install tqdm
! pip install ipywidgets

In [None]:
# Concatenate all the transcribed DataFrames.
from tqdm.auto import tqdm

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on every iteration.
transcribed_frames = [
    read_jsonl(f"{jsonl_directory}/{file}") for file in tqdm(transcribed_jsonl_files)
]

# pd.concat raises on an empty list, so fall back to an empty frame. This also
# guarantees that transcribed_df is always (re)bound by this cell instead of
# being left undefined or stale when no file matched.
transcribed_df = (
    pd.concat(transcribed_frames, axis=0, ignore_index=True)
    if transcribed_frames
    else pd.DataFrame([])
)

In [None]:
# Concatenate all the reviewed DataFrames.
from tqdm.auto import tqdm

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on every iteration.
reviewed_frames = [
    read_jsonl(f"{jsonl_directory}/{file}") for file in tqdm(reviewed_jsonl_files)
]

# pd.concat raises on an empty list, so fall back to an empty frame. This also
# guarantees that reviewed_df is always (re)bound by this cell instead of being
# left undefined or stale when no file matched.
reviewed_df = (
    pd.concat(reviewed_frames, axis=0, ignore_index=True)
    if reviewed_frames
    else pd.DataFrame([])
)

In [None]:
# Second-review export (the file excluded from both file lists above), loaded on its own.
finale_df = read_jsonl(f"{jsonl_directory}/stt_second_review.jsonl")

In [None]:
# Rows without an 'id' fall back to the value in their 'text' column.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form on df['id'] is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
transcribed_df['id'] = transcribed_df['id'].fillna(transcribed_df['text'])
reviewed_df['id'] = reviewed_df['id'].fillna(reviewed_df['text'])
finale_df['id'] = finale_df['id'].fillna(finale_df['text'])

In [None]:
# Extension substrings that are stripped from the 'id' values in the next cell.
audio_extensions = ['.mp3', '.wav', '.MP3', '.WAV']

In [None]:
# Strip the audio extensions from the ids of all three frames.
# regex=False makes the match literal on every pandas version: with regex
# matching (the default before pandas 2.0) the '.' in '.mp3' matches any
# character, so e.g. 'Xmp3' inside an id would be removed as well.
for audio_extension in audio_extensions:
    transcribed_df['id'] = transcribed_df['id'].str.replace(audio_extension, '', regex=False)
    reviewed_df['id'] = reviewed_df['id'].str.replace(audio_extension, '', regex=False)
    finale_df['id'] = finale_df['id'].str.replace(audio_extension, '', regex=False)

In [None]:
# Row counts of the three frames before de-duplication.
len(transcribed_df),len(reviewed_df),len(finale_df)

In [None]:
# Number of distinct ids in each frame.
len(set(transcribed_df['id'])), len(set(reviewed_df['id'])), len(set(finale_df['id']))

In [None]:
# Percentage of transcribed rows that are surplus duplicates (100 minus the share of distinct ids).
100 - len(set(transcribed_df['id'])) / len(transcribed_df) * 100

In [None]:
# Row counts right before dropping duplicates (compare with the cell after the drop).
transcribed_df.shape[0], reviewed_df.shape[0], finale_df.shape[0]

In [None]:
# Keep only the first row seen for each id in every frame.
transcribed_df = transcribed_df.drop_duplicates(subset='id', keep="first")
reviewed_df = reviewed_df.drop_duplicates(subset='id', keep="first")
finale_df = finale_df.drop_duplicates(subset='id', keep="first")

In [None]:
# Row counts after de-duplication.
transcribed_df.shape[0], reviewed_df.shape[0], finale_df.shape[0]

In [None]:
# Same counts via len(), still after de-duplication and before the 'accept' filter.
len(transcribed_df),len(reviewed_df),len(finale_df)

In [None]:
# Keep only the rows whose 'answer' column equals 'accept'.
transcribed_df = transcribed_df.loc[transcribed_df['answer'].eq('accept')]
reviewed_df = reviewed_df.loc[reviewed_df['answer'].eq('accept')]
finale_df = finale_df.loc[finale_df['answer'].eq('accept')]

In [None]:
# Row counts after keeping only accepted rows.
len(transcribed_df),len(reviewed_df),len(finale_df)

In [None]:
# leakage check
len(transcribed_df) - len(finale_df), len(transcribed_df) - len(reviewed_df)

In [None]:
# left side intersection
intersection = reviewed_df.merge(finale_df, how='left', on='id')

In [None]:
# Default grade for every row; it is raised to 2 or 3 in the following cells.
intersection['grade'] = 1 # 1 means the task is transcribed only

In [None]:
# transcript_x is the transcript column that came from reviewed_df in the merge.
intersection.loc[~intersection['transcript_x'].isna(), 'grade'] = 2 # 2 means the task is reviewed

In [None]:
# transcript_y is the transcript column that came from finale_df (second review) in the merge.
intersection.loc[~intersection['transcript_y'].isna(), 'grade'] = 3 # 3 means the task is reviewed twice by the qc team

In [None]:
# Where there is no second-review transcript (transcript_y), fall back to the
# first-review transcript (transcript_x); existing second-review text is kept.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column, which is chained assignment and is not guaranteed to update
# `intersection` (it is a no-op under Copy-on-Write).
intersection['transcript_y'] = intersection['transcript_y'].fillna(intersection['transcript_x'])

In [None]:
# Discard rows that still have no transcript after the fallback.
intersection = intersection.dropna(subset=['transcript_y'])

In [None]:
# Keep only the columns needed downstream. Take an explicit copy so that the
# column assignments in the following cells write to an independent frame
# instead of a slice of the merged one (avoids SettingWithCopyWarning).
intersection = intersection[['transcript_y', 'id', 'grade']].copy()

In [None]:
# Build each clip's WAV URL from its id: fixed CloudFront prefix + id + '.wav'.
intersection['url'] = 'https://d38pmlk0v88drf.cloudfront.net/wav/' + intersection['id'] + '.wav'

In [None]:
# (rows, columns) of the assembled frame.
intersection.shape

In [None]:
# Spot-check one row by index label: show its transcript and its audio URL.
# NOTE(review): the index is not reset after the NaN filter above, so this
# label is not guaranteed to exist — confirm or pick an existing label.
i = 11001
intersection.loc[i, 'transcript_y'], intersection.loc[i, 'url']

In [None]:
# 'dept' is the first six characters of the id.
intersection['dept'] = intersection['id'].str[:6]

In [None]:
# Number of rows per 'dept' prefix.
intersection.groupby('dept').size()

In [None]:
# Rename to the output schema; 'url', 'dept' and 'grade' already have their final names.
intersection = intersection.rename(columns={"id": "file_name", "transcript_y": "uni"})

In [None]:
# Fix the column order of the export and preview the first rows.
last_intersection = intersection[['file_name', 'uni', 'url', 'dept', 'grade']]
last_intersection.head()

In [None]:
# Write the result as a tab-separated file, without the index column.
last_intersection.to_csv('02_prodigy.tsv', sep='\t', index=False)

In [None]:
# Look up a single clip by file name.
last_intersection[last_intersection['file_name'] == 'STT_AB00001_0029_150584_to_151748']

In [1]:
import pandas as pd
# NOTE(review): this file is not written by any cell visible above (the export
# cell writes '02_prodigy.tsv') — presumably produced by a separate step; confirm.
df = pd.read_csv('02_prodigy_finalised.csv')

In [3]:
# Number of rows in the finalised file.
len(df)

374548

In [6]:
# Preview the first rows of the finalised file.
df.head()

Unnamed: 0,id,transcript,len,dept
0,STT_TT00001_00233.450-00233.950,དེ་འདྲ་ཡིན་དུས་,15,STT_TT
1,STT_TT00001_00328.100-00329.300,ཨེ། དཔལ་མགོན་འཕགས་པ་ཀླུ་སྒྲུབ་ཀི་,33,STT_TT
2,STT_TT00001_00083.000-00083.750,ཉིན་རེ་ཉིན་རེའི།,16,STT_TT
3,STT_TT00001_00240.650-00241.550,འདིའི་གོང་ལ།,12,STT_TT
4,STT_TT00001_00102.350-00103.700,ཨེ་ནས་ཅིག་སེམས་འཁྲུགས་བསྡད་ཡ།,29,STT_TT


In [4]:
def getTimeSpan(filename):
    """Return the time span encoded in a segment file name.

    Two naming schemes are recognised:

    * ``..._<start>_to_<end>...``: the difference is divided by 1000
      (the values are presumably milliseconds -- TODO confirm).
    * ``..._<start>-<end>...``: the difference is returned as is.

    Any ``.wav``/``.WAV``/``.mp3``/``.MP3`` substring is removed before
    parsing.

    Args:
        filename: Segment file name or id (normally a ``str``).

    Returns:
        The absolute difference between end and start, or ``0`` when the
        name cannot be parsed (a message is printed in that case).
        Non-string values such as ``None`` or ``NaN`` are treated as
        unparseable instead of raising.
    """
    try:
        # Normalisation happens inside the ``try`` so that a non-string value
        # (e.g. a missing id, which pandas represents as NaN) takes the same
        # "could not parse" path instead of raising AttributeError.
        for extension in (".wav", ".WAV", ".mp3", ".MP3"):
            filename = filename.replace(extension, "")

        # The two schemes differ only in the separator and the divisor.
        if "_to_" in filename:
            separator, divisor = "_to_", 1000
        else:
            separator, divisor = "-", 1

        # Unpacking fails (and is reported below) unless the separator occurs
        # exactly once.
        start, end = filename.split(separator)
        end = float(end.split("_")[0])
        start = float(start.split("_")[-1])
        return abs(end - start) / divisor
    except Exception:
        print(f"filename is:'{filename}'. Could not parse to get time span.")
        return 0
    

# Example call with an id in the "<start>-<end>" format.
getTimeSpan("STT_TT00031_03471.850-03477.44")

5.5900000000001455

In [7]:
# Time span of every clip, derived from its id (0 for ids that cannot be parsed).
df['audio_len'] = df['id'].apply(getTimeSpan)

In [8]:
# Total of all time spans divided by 60 twice (seconds -> hours, assuming audio_len is in seconds).
df['audio_len'].sum()/60/60

307.3489561111112