# Audio Processing with silero_vad

1. **Speech Detection and Trimming:**  
   - The `silero_vad` model processes all audio files and generates timestamps indicating where speech is present.  
   - Audio files are then trimmed according to these timestamps so that the resulting audio files contain only speech (removing any empty spaces).

2. **Chunking of Audio Files:**  
   - The trimmed audio files are further segmented into 30-second chunks.  
   - These chunks are saved into folders following the structure below.

3. **File Format and Storage:**  
   - The 30-second audio chunks are stored in the following directory format:  
     **dataset/audio_chunks/audio_chunks_1/**  
     Each folder contains the respective 30-second audio chunks of the processed audio file.


In [1]:
SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint

USE_PIP = True # download model using pip package or torch.hub
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    %pip install -q onnxruntime
if USE_PIP:
  %pip install -q silero-vad
  from silero_vad import (load_silero_vad,
                          read_audio,
                          get_speech_timestamps,
                          save_audio,
                          VADIterator,
                          collect_chunks)
  model = load_silero_vad(onnx=USE_ONNX)
else:
  model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=True,
                                onnx=USE_ONNX)

  (get_speech_timestamps,
  save_audio,
  read_audio,
  VADIterator,
  collect_chunks) = utils

    torch (>=1.8.*)
           ~~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import yaml

yaml_file = "sample/text/train.yaml"

with open(yaml_file, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

# Use a list to preserve order, and a set to track seen wav values.
unique_wav_files = []
seen = set()

for entry in data:
    if "wav" in entry:
        wav_file = entry["wav"]
        if wav_file not in seen:
            seen.add(wav_file)
            unique_wav_files.append(wav_file)

# Combine each wav file with the prefix
combined_paths = ["sample/audio/" + wav for wav in unique_wav_files]

for path in combined_paths:
    print(path)

print(len(combined_paths))


sample/audio/hi106106145-1.wav
sample/audio/hi106106145-2.wav
sample/audio/hi106106145-3.wav
sample/audio/hi106106145-4.wav
sample/audio/hi106106145-5.wav
sample/audio/hi106106145-6.wav
sample/audio/hi106106145-7.wav
sample/audio/hi106106145-8.wav
sample/audio/hi106106145-9.wav
sample/audio/hi106106145-10.wav
10


In [5]:
import os
import concurrent.futures

# Create base output folder
base_output_folder = "sample/audio_chunks"
os.makedirs(base_output_folder, exist_ok=True)

def process_audio_file(idx, audio_file):
    output_folder = os.path.join(base_output_folder, f"audio_chunks_{idx}")
    os.makedirs(output_folder, exist_ok=True)
    
    print(f"\nProcessing file {idx}: {audio_file}")
    
    # Read the audio file and detect speech segments
    wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)

    # Skip processing if no speech is detected
    if not speech_timestamps:
        print(f"Skipping file {audio_file} (no speech detected).")
        return

    # Save the collected speech parts
    temp_filename = f'new_speech{idx}.wav'
    save_audio(temp_filename, collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)

    # Reload the new speech-only audio file
    wav = read_audio(temp_filename, sampling_rate=SAMPLING_RATE)

    # Define chunk duration in seconds and compute the number of samples per chunk
    chunk_duration_sec = 15
    chunk_length = SAMPLING_RATE * chunk_duration_sec

    # Calculate total number of chunks (the last one may be shorter)
    num_chunks = (len(wav) + chunk_length - 1) // chunk_length

    # Loop over the audio and save each chunk separately
    for i in range(num_chunks):
        start = i * chunk_length
        end = start + chunk_length
        audio_chunk = wav[start:end]

        base_name = os.path.splitext(os.path.basename(audio_file))[0]
        chunk_filename = os.path.join(output_folder, f"{base_name}_chunk_{i+1}.wav")

        save_audio(chunk_filename, audio_chunk, sampling_rate=SAMPLING_RATE)
        print(f"Saved {chunk_filename}")

# Main block: submit all audio file processing jobs in parallel.
if __name__ == "__main__":
    start_index = 0
    # Assuming combined_paths is defined and is a list of audio file paths
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []
        for idx, audio_file in enumerate(combined_paths[start_index:], start=start_index + 1):
            futures.append(executor.submit(process_audio_file, idx, audio_file))
        
        # Optionally, wait for all jobs to complete and catch any exceptions.
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Error processing a file: {e}")



Processing file 8: sample/audio/hi106106145-8.wav
Processing file 1: sample/audio/hi106106145-1.wav
Processing file 2: sample/audio/hi106106145-2.wav
Processing file 5: sample/audio/hi106106145-5.wav
Processing file 3: sample/audio/hi106106145-3.wav
Processing file 6: sample/audio/hi106106145-6.wav
Processing file 4: sample/audio/hi106106145-4.wav
Processing file 7: sample/audio/hi106106145-7.wav


Processing file 9: sample/audio/hi106106145-9.wav

Processing file 10: sample/audio/hi106106145-10.wav






Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_1.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_2.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_3.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_4.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_5.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_6.wav
Saved sample/audio_chunks/audio_chunks_10/hi106106145-10_chunk_7.wav
Saved

# File Structure and Transcript Data Overview

1. **Audio Files:**  
   - There are 1561 audio files, named as:  
     **audio_chunks_1, audio_chunks_2, ..., audio_chunks_1561.**

2. **Transcript Files:**  
   - There are 1561 transcript files, named as:  
     **transcripts_audio_chunks_1, transcripts_audio_chunks_2, ..., transcripts_audio_chunks_1561.**

3. **Mapping Between Audio and Transcript:**  
   - Each entry in a transcript file corresponds to a 30-second audio file from the respective folder (e.g., audio_chunks_1).

4. **Transcript Entry Structure (JSONL format):**  
   Each transcript entry is structured as follows:

   ```python
   transcript_entry = {
       "audio": {
           "path": abs_file_path,
           "array": audio_arrays.tolist(),
           "sampling_rate": sr
       },
       "sentence": transcription.get('text', '')
   }


In [None]:
import whisper
import torch

# Load the Whisper model ("turbo") and move it to GPU if available.
# turbo and large gives similar result, better using turbo for speed

model = whisper.load_model("large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

  checkpoint = torch.load(fp, map_location=device)


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [10]:
import os
import re
import librosa
import json

# Define a natural sort key to sort file names in numeric order.
def natural_sort_key(s):
    # Splits the string into numbers and text parts.
    return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]

# Create a new folder for transcript files.
transcripts_folder = "sample/transcript_files"
os.makedirs(transcripts_folder, exist_ok=True)

# Folder where the audio chunks subfolders are stored.
audio_chunks_folder = 'sample/audio_chunks'

# Process one subfolder for demonstration. For example, process "audio_chunks_1".
for i in range(1, 11):
    folder_path = os.path.join(audio_chunks_folder, f"audio_chunks_{i}")
    transcript_list = []
    
    # List all files in the current folder and sort them naturally.
    files = [f for f in os.listdir(folder_path) if f.lower().endswith('.wav')]
    files = sorted(files, key=natural_sort_key)
    
    for file in files:
        file_name = os.path.join(folder_path, file)
        abs_file_path = os.path.abspath(file_name)        
        try:
            # Load the audio file using librosa (sampling rate 16000)
            audio_arrays, sr = librosa.load(file_name, sr=16000)
        except Exception as e:
            print(f"Error loading {abs_file_path}: {e}")
            continue
        
        try:
            # Transcribe the audio file using Whisper.
            transcription = model.transcribe(file_name)
        except Exception as e:
            print(f"Error transcribing {abs_file_path}: {e}")
            continue
            
        transcript_entry = {
            "audio": {
                "path": abs_file_path,
                "array": audio_arrays.tolist(),
                "sampling_rate": sr
            },
            "sentence": transcription.get('text', '')
        }
        transcript_list.append(transcript_entry)
    
    # Create a transcript file name based on the subfolder name.
    subfolder_name = os.path.basename(folder_path)  # e.g., "audio_chunks_1"
    transcript_filename = os.path.join(transcripts_folder, f"transcripts_{subfolder_name}.jsonl")
    
    # Write all transcript entries into the JSON Lines file.
    with open(transcript_filename, "w", encoding="utf-8") as f:
        for entry in transcript_list:
            json_line = json.dumps(entry, ensure_ascii=False)
            f.write(json_line + "\n")
    
    print(f"Saved transcripts to {transcript_filename}")

Saved transcripts to sample/transcript_files/transcripts_audio_chunks_1.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_2.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_3.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_4.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_5.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_6.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_7.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_8.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_9.jsonl
Saved transcripts to sample/transcript_files/transcripts_audio_chunks_10.jsonl


In [11]:
import json
import glob
import os
import re
import string

input_folder = "/home/niramay/Desktop/BhasaAnuvaad/sample/transcript_files"
output_file = "/home/niramay/Desktop/BhasaAnuvaad/sample/eng_combined.txt"
pattern = os.path.join(input_folder, "transcripts_audio_chunks_*.jsonl")
jsonl_files = glob.glob(pattern)
def extract_number(filename):
    # This regex finds the number between "aligned_transcripts_audio_chunks_" and ".jsonl"
    match = re.search(r"transcripts_audio_chunks_(\d+)\.jsonl", filename)
    return int(match.group(1)) if match else float('inf')

# Sort the files numerically based on the number in the filename
jsonl_files.sort(key=extract_number)

In [12]:
def remove_punctuation(text):
    return re.sub(rf"[{re.escape(string.punctuation)}]", "", text)

all_sentences = []

for file_path in jsonl_files:
    with open(file_path,'r',encoding='utf-8') as infile:
            with open(file_path, "r", encoding="utf-8") as infile:
                for line in infile:
                    if not line.strip():
                        continue
                    try:
                        # Each line is a JSON string (e.g., "some text|more text|...")
                        entry = json.loads(line)
                        text=entry.get('sentence')
                        # If for some reason the line isn't a string, skip it
                        if not isinstance(text, str):
                            continue

                        # Split the text by the "|" delimiter
                        sentences = text.split(".")
                        
                        # Strip whitespace and add non-empty sentences to our list
                        for sentence in sentences:
                            stripped = sentence.strip()
                            if stripped:
                                all_sentences.append(stripped)
                    except json.JSONDecodeError as e:
                        print(f"Error decoding JSON in file {file_path}: {e}")
                        continue


with open(output_file, "w", encoding="utf-8") as outfile:
    for sentence in all_sentences:
        outfile.write(sentence + "\n")

print(f"Processed {len(jsonl_files)} files and combined {len(all_sentences)} sentences into {output_file}")

Processed 10 files and combined 1942 sentences into /home/niramay/Desktop/BhasaAnuvaad/sample/eng_combined.txt


In [13]:
import jiwer
from jiwer import wer

reference_file = '/home/niramay/Desktop/BhasaAnuvaad/sample/text/train.en'  
hypothesis_file = '/home/niramay/Desktop/BhasaAnuvaad/sample/eng_combined.txt'

with open(reference_file, 'r') as f:
    ground_truth = f.read()

with open(hypothesis_file, 'r') as f:
    hypothesis = f.read()

# Compute the Word Error Rate (WER)
error_rate = jiwer.wer(ground_truth, hypothesis)
print("WER:", error_rate)
    

WER: 0.2077811996722516


## Error
The below code is causing error in parallel processing. Showing 

Error processing /home/niramay/Desktop/BhasaAnuvaad/sample/audio_chunks/audio_chunks_1/hi106106145-1_chunk_8.wav: Expected key.size(1) == value.size(1) to be true, but got false.  (Could this error message be improved?  If so, please report an enhancement request to PyTorch.)

Error processing /home/niramay/Desktop/BhasaAnuvaad/sample/audio_chunks/audio_chunks_1/hi106106145-1_chunk_6.wav: Key and Value must have the same sequence length

In [3]:
# import os
# import re
# import whisper
# import librosa
# import json
# import torch
# import concurrent.futures
# import numpy as np

# def natural_sort_key(s):
#     return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]

# def load_and_preprocess_audio(file_path, target_sr=16000, max_duration=15):
#     """
#     Load and preprocess audio file with robust error handling
    
#     Args:
#         file_path (str): Path to audio file
#         target_sr (int): Target sampling rate
#         max_duration (float): Maximum audio duration in seconds
    
#     Returns:
#         numpy.ndarray: Preprocessed audio array
#     """
#     try:
#         # Load audio file
#         audio_arrays, sr = librosa.load(file_path, sr=target_sr)
        
#         # Check audio length
#         max_length = max_duration * target_sr
#         if len(audio_arrays) > max_length:
#             audio_arrays = audio_arrays[:max_length]
        
#         # Ensure audio is mono and float32
#         if audio_arrays.ndim > 1:
#             audio_arrays = librosa.to_mono(audio_arrays)
        
#         # Normalize audio
#         audio_arrays = librosa.util.normalize(audio_arrays)
        
#         return audio_arrays
#     except Exception as e:
#         print(f"Audio preprocessing error for {file_path}: {e}")
#         return None

# def process_audio_file(folder_path, file, model):
#     """Process one audio file: load, transcribe, and build a transcript entry."""
#     file_path = os.path.join(folder_path, file)
#     abs_file_path = os.path.abspath(file_path)
    
#     try:
#         # Preprocess audio
#         audio_arrays = load_and_preprocess_audio(file_path)
        
#         if audio_arrays is None:
#             print(f"Skipping {abs_file_path} due to preprocessing failure")
#             return None
        
#         # Ensure audio is float32 and compatible with Whisper
#         audio_arrays = audio_arrays.astype(np.float32)
        
#         # Transcribe the audio file using Whisper
#         transcription = model.transcribe(audio_arrays)
        
#         transcript_entry = {
#             "audio": {
#                 "path": abs_file_path,
#                 "array": audio_arrays.tolist(),
#                 "sampling_rate": 16000
#             },
#             "sentence": transcription.get('text', '').strip()
#         }
        
#         return transcript_entry
    
#     except Exception as e:
#         print(f"Error processing {abs_file_path}: {e}")
#         return None

# def main():
#     # Load the Whisper model with specific configuration
#     model = whisper.load_model("base")  # Consider using "base" for less memory
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
    
#     # Create folders
#     transcripts_folder = "sample/transcripts_files"
#     audio_chunks_folder = 'sample/audio_chunks'
#     os.makedirs(transcripts_folder, exist_ok=True)
    
#     # Process subfolders
#     for i in range(1, 11):
#         folder_path = os.path.join(audio_chunks_folder, f"audio_chunks_{i}")
#         transcript_list = []
        
#         # List and sort wav files
#         files = [f for f in os.listdir(folder_path) if f.lower().endswith('.wav')]
#         files = sorted(files, key=natural_sort_key)
        
#         # Process files concurrently
#         with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
#             futures = [executor.submit(process_audio_file, folder_path, file, model) for file in files]
#             for future in concurrent.futures.as_completed(futures):
#                 result = future.result()
#                 if result is not None:
#                     transcript_list.append(result)
        
#         # Create transcript file
#         subfolder_name = os.path.basename(folder_path)
#         transcript_filename = os.path.join(transcripts_folder, f"transcripts_{subfolder_name}.jsonl")
        
#         # Write transcripts
#         with open(transcript_filename, "w", encoding="utf-8") as f:
#             for entry in transcript_list:
#                 json_line = json.dumps(entry, ensure_ascii=False)
#                 f.write(json_line + "\n")
        
#         print(f"Saved transcripts to {transcript_filename}")

# if __name__ == "__main__":
#     main()

# Transcript Alignment Process

For each transcript file, we follow these steps:

1. **Sentence Splitting:**  
   - Each line from the English transcript (generated by the Whisper model) is split into individual sentences using the full stop (`.`) as a delimiter.

2. **Embedding Generation and Similarity Calculation:**  
   - Generate sentence-level embeddings for the English sentences.  
   - Utilize the provided Hindi sentence embeddings from the translation file.  
   - Calculate the cosine similarity between each English sentence embedding and the Hindi sentence embeddings to find the best matching pair.

3. **Alignment and Combination:**  
   - Align the corresponding English and Hindi sentences based on the cosine similarity scores.  
   - Combine the aligned sentence pairs to reconstruct the full translated sentences for each transcript line.


# Combined Transcript Entry Structure

Each combined transcript entry is structured as follows:

```python
transcript_entry = {
    "audio": {
        "path": abs_file_path,
        "array": audio_arrays.tolist(),
        "sampling_rate": sr
    },
    "translation": translation_text, 
    "sentence": transcription.get('text', '')
}


In [1]:
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pathlib import Path
import glob
import json
import os
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity

output_dir = "transcripts_folder"
os.makedirs(output_dir, exist_ok=True)

def natural_sort_key(s):
    # Splits the string into numbers and text parts.
    return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]


# Get a list of all English transcript files.
transcript_files = [f for f in os.listdir('transcripts_files') if f.lower().endswith('.jsonl')]
transcript_files = sorted(transcript_files,key=natural_sort_key)
transcript_files=transcript_files[1066:]

# Define a helper function to align one English transcript using the current local Hindi subset.
def align_transcript(english_transcript):
    # Split transcript into sentences using period as a separator;
    # keep sentences that have more than one word.
    sentences = [s.strip() for s in english_transcript.split('.') if len(s.split()) > 1]
    if not sentences:
        return ""
    # Encode English sentences.
    eng_embeddings = emb_model.encode(sentences)
    # Normalize embeddings.
    eng_embeddings = torch.nn.functional.normalize(torch.tensor(eng_embeddings), dim=-1).numpy()
    # Compute cosine similarity with precomputed Hindi embeddings.
    sims = cosine_similarity(eng_embeddings, hindi_embeddings)
    # For each English sentence, pick the Hindi sentence with maximum similarity.
    aligned = [hindi_sentences[np.argmax(sim)] for sim in sims]

    unique_aligned = []
    for sentence in aligned:
        # Use .strip() to avoid differences due to leading/trailing whitespace.
        if sentence.strip() not in [s.strip() for s in unique_aligned]:
            unique_aligned.append(sentence)

    # Join the best matching Hindi sentences to form a single aligned transcript.
    return "| ".join(unique_aligned)

# Process each transcript file.
for transcript_file in transcript_files:
    path = os.path.abspath(transcript_file)
    # Extract the relevant part from the path to determine the matching audio file.
    aud_path = path.split(os.sep)[5]  # Adjust the index based on your folder structure.
    result = aud_path.removeprefix('transcripts_').removesuffix('.jsonl')
    
    # Build the corresponding folder path for audio chunks.
    folder_path = os.path.join('dataset/audio_chunks', result)
    # Get the first file from the folder.
    first_file = next((file for file in sorted(Path(folder_path).iterdir()) if file.is_file()), None)
    first_file = str(first_file).split(os.sep)[3]
    # Process the file name to obtain a base audio file name (without chunk info).
    jsonl_path = first_file.removeprefix(f'dataset/audio_chunks/{result}/').removesuffix('.wav')
    jsonl_path = jsonl_path.split("_chunk_")[0] + ".wav"
    
    # Determine the indices in train.yaml that correspond to this audio file.
    matching_indices = []
    with open("dataset/text/txt/train.yaml", "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if "wav:" in line and jsonl_path in line:
                matching_indices.append(i)
    
    # From train.hi, extract only the Hindi translations at those matching indices.
    train_hi_path = "dataset/text/txt/train.hi"  # Path to the file with Hindi translations.
    hindi_sentences = []
    with open(train_hi_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=0):
            if (i + 1) in matching_indices:  # YAML indices started at 1.
                hindi_sentences.append(line.strip())

    
    # Compute embeddings for the local Hindi sentences.
    hindi_embeddings = emb_model.encode(hindi_sentences)
    # Read English transcripts from the JSONL file.
    english_transcripts = []
    with open('transcripts_files/'+transcript_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            # Expect each entry to have a "sentence" field.
            english_transcripts.append(entry.get("sentence", ""))
    
    # Align each English transcript to Hindi.
    hindi_transcripts = []
    for transcript in english_transcripts:
        aligned_text = align_transcript(transcript)
        hindi_transcripts.append(aligned_text)

    # Write the aligned Hindi transcripts to a new JSONL file.
    base_name = os.path.basename(transcript_file)
    aligned_file = os.path.join('transcripts_folder', f"aligned_{base_name}")
    with open(aligned_file, "w", encoding="utf-8") as f:
        for line in hindi_transcripts:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
    print(f"Saved aligned transcripts to {aligned_file}")
    
    # Combine the English and Hindi transcripts.
    combined_file = os.path.join('transcripts_folder', f"combined_{base_name}")
    with open('transcripts_files/'+transcript_file, 'r', encoding='utf-8') as eng_f, \
         open(aligned_file, 'r', encoding='utf-8') as hin_f, \
         open(combined_file, 'w', encoding='utf-8') as out_f:
        for eng_line, hin_line in zip(eng_f, hin_f):
            eng_entry = json.loads(eng_line)
            hindi_text = json.loads(hin_line)  # Plain string.
            eng_entry["translation"] = hindi_text
            out_f.write(json.dumps(eng_entry, ensure_ascii=False) + "\n")
    print(f"Saved combined transcripts to {combined_file}")

Saved aligned transcripts to transcripts_folder/aligned_transcripts_audio_chunks_1067.jsonl
Saved combined transcripts to transcripts_folder/combined_transcripts_audio_chunks_1067.jsonl
Saved aligned transcripts to transcripts_folder/aligned_transcripts_audio_chunks_1068.jsonl
Saved combined transcripts to transcripts_folder/combined_transcripts_audio_chunks_1068.jsonl
Saved aligned transcripts to transcripts_folder/aligned_transcripts_audio_chunks_1069.jsonl
Saved combined transcripts to transcripts_folder/combined_transcripts_audio_chunks_1069.jsonl
Saved aligned transcripts to transcripts_folder/aligned_transcripts_audio_chunks_1070.jsonl
Saved combined transcripts to transcripts_folder/combined_transcripts_audio_chunks_1070.jsonl
Saved aligned transcripts to transcripts_folder/aligned_transcripts_audio_chunks_1071.jsonl
Saved combined transcripts to transcripts_folder/combined_transcripts_audio_chunks_1071.jsonl
Saved aligned transcripts to transcripts_folder/aligned_transcripts_au

In [10]:
import json

# Define the JSONL file path
file_path = "transcripts_folder/combined_transcripts_audio_chunks_1560.jsonl"

# Open and read the first line
with open(file_path, 'r', encoding='utf-8') as file:
    first_line = file.readline()  # Read the first JSON object (line)

    # Parse JSON
    first_entry = json.loads(first_line)

    # Extract 'translation' and 'sentence' values
    audio=first_entry.get('audio')
    path=audio.get('path')
    translation = first_entry.get("translation")
    sentence = first_entry.get("sentence")

    # Print the results
    print("Audio:", path)
    print("Translation:", translation)
    print("Sentence:", sentence)

Audio: /home/niramay/Desktop/BhasaAnuvaad/dataset/audio_chunks/audio_chunks_1560/hi127108005-19_chunk_1.wav
Translation: इंजीनियरिंग में शिक्षण और सीखना टेल प्रो एन जे राव इलेक्ट्रॉनिक्स सिस्टम इंजीनियरिंग विभाग भारतीय विज्ञान संस्थान बैंगलोर व्याख्यान 19 सीओ की प्राप्ति मॉड्यूल 1 की इकाई 19 में आपका स्वागत है और अभिवादन| यहाँ हम पाठ्यक्रम परिणामों की प्राप्ति पर चर्चा करेंगे| पिछली इकाई में हमने समझा कि पाठ्यक्रम के परिणाम कैसे लिखें और प्रत्येक पाठ्यक्रम के परिणाम को संबंधित पीओ, पीएसओ, संज्ञानात्मक स्तर, ज्ञान श्रेणियों और कक्षा सत्रों की संख्या के साथ टैग करें, जिनका उपयोग आप उस विशेष पीओ को संबोधित करने के लिए कर सकते हैं।| तो अब हमारे पास एक कोर्स के लिए COs की एक पूरी तस्वीर लिखने की एक पूरी तस्वीर है
Sentence:  Greetings and welcome to unit 19 of module 1. Here we address the attainment of course outcomes. In the previous unit, we understood how to write course outcomes and tagging each course outcome with corresponding POs, PSOs, cognitive level, knowledge, categories and numb