# S2S

### Import libraries

In [None]:
# Import libraries
import json
import time
from collections import deque
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional

import torch
import whisper
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

### Prepare dataset

In [None]:
# Read the English transcript table: a headerless, tab-separated file.
eng_train_df = pd.read_csv(
    os.path.join(extract_directory_src, 'train.tsv'),
    sep='\t',
    header=None,
)

# The file carries no header row, so name the two columns explicitly.
eng_train_df.columns = ['audio', 'sentence']

# Display the first rows together with the total number of rows.
eng_train_df.head(), len(eng_train_df)

(                          audio                                      sentence
 0  common_voice_cy_17301829.mp3                            i need soap please
 1  common_voice_cy_17301830.mp3                       what time will he start
 2  common_voice_cy_17301835.mp3                             where do you live
 3  common_voice_cy_17301844.mp3  ifan huw dafydd iwan llwyd john pierce jones
 4  common_voice_cy_17301845.mp3                                 what is islam,
 1241)

In [None]:
def make_parallel_dataset(eng_df : pd.DataFrame, eng_audio_path : str, cy_audio_path : str,
                          max_samples : Optional[int] = 20, target_sr : int = 16000) -> List:
  """Make parallel dataset from source (English) and target (Welsh) audio.

  Every row of ``eng_df`` names a Welsh clip in its ``audio`` column. The Welsh
  recording is read from ``<cy_audio_path>/clips/<audio>`` and the English
  recording from ``<eng_audio_path>/train/<audio>.wav``. A row is skipped when
  either of the two files does not exist.

  Args:
    eng_df: Table with an ``audio`` column (Welsh clip file name) and,
      optionally, a ``sentence`` column (English transcript).
    eng_audio_path: Directory that contains the ``train`` folder with the
      English audio files.
    cy_audio_path: Directory that contains the Welsh ``clips`` folder and a
      ``train.tsv`` (or, if that is absent, ``validated.tsv``) with ``path``
      and ``sentence`` columns.
    max_samples: Only the first ``max_samples`` rows of ``eng_df`` are
      considered. Pass ``None`` to use every row. The default of 20 keeps the
      previous hard-coded limit.
    target_sr: Sampling rate (Hz) both recordings are resampled to on load.

  Returns:
    A list of dicts, one per pair, with the keys ``id`` (row index in
    ``eng_df``), ``eng_audio``, ``eng_transcript``, ``cy_audio``,
    ``cy_transcript`` (``''`` when the clip has no entry in the Welsh
    transcript file), ``src`` (English file name) and ``tgt`` (Welsh file name).
  """

  # 1. Make path for audio and transcripts
  eng_clips = os.path.join(eng_audio_path, 'train')
  cy_clips = os.path.join(cy_audio_path, 'clips')

  # 2. Load transcript for welsh (fall back to validated.tsv when train.tsv is absent)
  cy_transcript = os.path.join(cy_audio_path, 'train.tsv')
  if not os.path.exists(cy_transcript):
    cy_transcript = os.path.join(cy_audio_path, 'validated.tsv')

  welsh_df = pd.read_csv(cy_transcript, sep = '\t')
  # Column-wise mapping clip file name -> sentence; avoids a Python-level row loop.
  metadata = dict(zip(welsh_df['path'], welsh_df['sentence']))
  print(f'Loaded {len(metadata)} welsh audio + transcripts')

  # 3. Create parallel dataset
  rows = eng_df if max_samples is None else eng_df.head(max_samples)
  parallel_dataset = []
  missing_transcripts = 0

  for i, row in rows.iterrows():
    cy_audio_name = row['audio']
    cy_audio_path_file = os.path.join(cy_clips, cy_audio_name)

    # The English file is named after the Welsh clip with '.wav' appended.
    eng_audio_name = f'{cy_audio_name}.wav'
    eng_audio_path_file = os.path.join(eng_clips, eng_audio_name)

    # Check both sides before decoding anything, so no audio is decoded for
    # a pair that is going to be dropped anyway.
    if not (os.path.exists(cy_audio_path_file) and os.path.exists(eng_audio_path_file)):
      continue

    ## Load welsh and english audio, resampled to the common rate
    y_cy, _ = librosa.load(cy_audio_path_file, sr = target_sr)
    y_en, _ = librosa.load(eng_audio_path_file, sr = target_sr)

    if cy_audio_name not in metadata:
      missing_transcripts += 1
    cy_text = metadata.get(cy_audio_name, '')
    eng_text = row.get('sentence', '')

    ## Make parallel dataset
    parallel_dataset.append({
      'id' : i,
      'eng_audio' : y_en,
      'eng_transcript' : eng_text,
      'cy_audio' : y_cy,
      'cy_transcript' : cy_text,
      'src' : eng_audio_name,
      'tgt' : cy_audio_name
    })

  print(f'Loaded {len(parallel_dataset)} parallel pairs')
  if missing_transcripts:
    # Surface the silent '' fallback so empty Welsh transcripts do not go unnoticed.
    print(f'Warning: {missing_transcripts} of {len(parallel_dataset)} pairs have no welsh transcript in {cy_transcript}')
  return parallel_dataset


In [None]:
# Pair the English recordings with their Welsh counterparts.
parallel_dataset = make_parallel_dataset(
    eng_df=eng_train_df,
    eng_audio_path=extract_directory_src,
    cy_audio_path=audio_path_tgt,
)

Loaded 8014 welsh audio + transcripts
Loaded 20 parallel pairs


In [None]:
# Inspect the first three parallel pairs (audio arrays, transcripts and file names).
parallel_dataset[:3]

[{'id': 0,
  'eng_audio': array([ 8.3589659e-04,  1.1703324e-03,  1.1525394e-03, ...,
          5.8207661e-11, -5.8207661e-11,  1.1641532e-10], dtype=float32),
  'eng_transcript': 'i need soap please',
  'cy_audio': array([-1.2732926e-11, -4.5474735e-12,  1.4097168e-11, ...,
         -4.6382229e-07,  5.4650445e-07,  9.3370363e-07], dtype=float32),
  'cy_transcript': '',
  'src': 'common_voice_cy_17301829.mp3.wav',
  'tgt': 'common_voice_cy_17301829.mp3'},
 {'id': 1,
  'eng_audio': array([ 7.5494684e-04,  1.0414989e-03,  1.0550146e-03, ...,
          1.2098171e-06, -1.1129887e-06,  1.1659868e-06], dtype=float32),
  'eng_transcript': 'what time will he start',
  'cy_audio': array([-6.8212103e-12, -2.7284841e-12,  2.7284841e-12, ...,
         -8.0095524e-06,  5.0972621e-06, -9.4979041e-06], dtype=float32),
  'cy_transcript': '',
  'src': 'common_voice_cy_17301830.mp3.wav',
  'tgt': 'common_voice_cy_17301830.mp3'},
 {'id': 2,
  'eng_audio': array([ 1.0495777e-03,  1.1955972e-03,  1.1615595

### Audio preprocessing (making chunks)

In [None]:
# Create audio chunks

@dataclass
class AudioChunk:
    """Single fixed-length audio window with its position in the recording."""
    audio: np.ndarray            # samples of the window; AudioChunker zero-pads short windows
    chunk_id: int                # 0-based index of the window within its recording
    start_time: float            # window start, in seconds
    end_time: float              # end of the real (unpadded) audio in the window, in seconds
    sampling_rate: int = 16000   # samples per second of `audio`

class AudioChunker:
    """
    Cuts audio into fixed-length, overlapping windows for streaming simulation.

    A new window starts every ``hop_duration`` seconds and spans
    ``chunk_duration`` seconds, so consecutive windows overlap by
    ``chunk_duration - hop_duration`` seconds (1.5 s with the defaults of
    2 s windows and a 0.5 s hop).
    """

    def __init__(self, chunk_duration=2.0, hop_duration=0.5, sr=16000):
        """
        Args:
            chunk_duration: Window length in seconds.
            hop_duration: Distance between consecutive window starts in seconds.
            sr: Sampling rate of the audio in Hz.

        Raises:
            ValueError: If ``sr`` is not positive, or if the window or the hop
                is shorter than one sample.
        """
        self.chunk_duration = chunk_duration
        self.hop_duration = hop_duration
        self.sr = sr
        self.chunk_samples = int(chunk_duration * sr)
        self.hop_samples = int(hop_duration * sr)

        # Fail early with a clear message: a zero hop would otherwise surface
        # as a cryptic range() error inside create_chunks, and a negative hop
        # would silently yield no chunks at all.
        if sr <= 0:
            raise ValueError(f'sr must be positive, got {sr}')
        if self.chunk_samples <= 0:
            raise ValueError(
                f'chunk_duration={chunk_duration} at sr={sr} is shorter than one sample')
        if self.hop_samples <= 0:
            raise ValueError(
                f'hop_duration={hop_duration} at sr={sr} is shorter than one sample')

    def create_chunks(self, audio: np.ndarray) -> List[AudioChunk]:
        """
        Create overlapping chunks from audio array.

        One chunk is produced for every hop-aligned start position inside the
        audio. Windows that run past the end are zero-padded on the right to
        ``chunk_samples``, so chunks near the end of a recording can consist
        mostly of padding. Empty audio yields an empty list.

        Args:
            audio: 1-D array of samples at ``self.sr``.

        Returns:
            List of AudioChunk, each holding exactly ``chunk_samples`` samples.
            ``end_time`` is clipped to the real audio length, so it excludes
            any padding.
        """
        total_samples = len(audio)
        chunks = []

        for chunk_id, start in enumerate(range(0, total_samples, self.hop_samples)):
            end = start + self.chunk_samples
            chunk_audio = audio[start:end]

            # Pad the window on the right when the recording ends inside it.
            shortfall = self.chunk_samples - len(chunk_audio)
            if shortfall > 0:
                chunk_audio = np.pad(chunk_audio, (0, shortfall))

            chunks.append(AudioChunk(
                audio=chunk_audio,
                chunk_id=chunk_id,
                start_time=start / self.sr,
                end_time=min(end, total_samples) / self.sr,
                sampling_rate=self.sr
            ))

        return chunks

In [None]:
# Initialize chunker: 2 s windows, a new window every 0.5 s, audio sampled at 16000 Hz
chunker = AudioChunker(chunk_duration=2.0, hop_duration=0.5, sr=16000)

In [None]:
# Creating context buffer
class ContextBuffer:
    """
    Maintains recent chunks for context used during training to simulate streaming with context.

    Only the ``max_chunks`` most recently added chunks are kept; adding to a
    full buffer drops the oldest chunk.
    """

    def __init__(self, max_chunks=4, min_chunks=2):
        """
        Args:
            max_chunks: Maximum number of chunks held at once.
            min_chunks: Number of buffered chunks required before
                ``is_ready()`` reports True. If it exceeds ``max_chunks``
                the buffer never becomes ready.
        """
        self.max_chunks = max_chunks
        self.min_chunks = min_chunks
        self.buffer = deque(maxlen=max_chunks)

    def add(self, chunk_audio: np.ndarray):
        """Add chunk to buffer, evicting the oldest one when the buffer is full"""
        self.buffer.append(chunk_audio)

    def get_context(self) -> Optional[np.ndarray]:
        """Get concatenated audio from buffer (oldest chunk first), or None when the buffer is empty"""
        if not self.buffer:
            return None
        return np.concatenate(list(self.buffer))

    def is_ready(self) -> bool:
        """Check if buffer has minimum context (at least ``min_chunks`` chunks)"""
        return len(self.buffer) >= self.min_chunks

    def reset(self):
        """Clear buffer"""
        self.buffer.clear()

In [None]:
# Create dataset with chunking
class ChunkedS2STDataset(Dataset):
    """
    Dataset of parallel English/Welsh recordings, pre-split into chunks.

    1. Takes parallel English to Welsh data
    2. Chunks both audio streams with the supplied chunker
    3. Serves, per sample, the two chunk lists together with both transcripts
    """

    def __init__(self, parallel_data, chunker, max_chunks_per_sample=None):
        """
        Args:
            parallel_data: List of parallel pairs from make_parallel_dataset()
            chunker: AudioChunker instance (any object with a
                ``create_chunks(audio)`` method returning a list)
            max_chunks_per_sample: Max chunks to use per audio (None = all).
                0 keeps no chunks.

        Raises:
            ValueError: If ``max_chunks_per_sample`` is negative.
        """
        # A negative limit would silently drop trailing chunks through slice
        # semantics, which is never what a "maximum" is meant to do.
        if max_chunks_per_sample is not None and max_chunks_per_sample < 0:
            raise ValueError(
                f'max_chunks_per_sample must be None or >= 0, got {max_chunks_per_sample}')

        self.parallel_data = parallel_data
        self.chunker = chunker
        self.max_chunks_per_sample = max_chunks_per_sample

        # Pre-compute all chunks for faster training
        self.chunked_samples = []

        for sample in parallel_data:
            # Source is the English audio, target the Welsh audio. The two
            # lists are chunked independently and may differ in length.
            source_chunks = self.chunker.create_chunks(sample['eng_audio'])
            target_chunks = self.chunker.create_chunks(sample['cy_audio'])

            # Limit number of chunks if specified. Compare against None so an
            # explicit limit of 0 is honoured rather than treated as "no limit".
            if max_chunks_per_sample is not None:
                source_chunks = source_chunks[:max_chunks_per_sample]
                target_chunks = target_chunks[:max_chunks_per_sample]

            self.chunked_samples.append({
                'id': sample['id'],
                'source_chunks': source_chunks,      # English chunks
                'target_chunks': target_chunks,      # Welsh chunks
                'source_text': sample['eng_transcript'],  # English text
                'target_text': sample['cy_transcript'],   # Welsh text
                'source_file': sample['src'],
                'target_file': sample['tgt']
            })

        print(f"Pre-computed chunks for {len(self.chunked_samples)} samples")

    def __len__(self):
        """Number of parallel samples (not the number of chunks)."""
        return len(self.chunked_samples)

    def __getitem__(self, idx):
        """
        Returns one training sample with:
        1. Source audio chunks (English)
        2. Target audio chunks (Welsh)
        3. Source text (English)
        4. Target text (Welsh)
        5. The number of source and target chunks

        The source/target file names stored at construction time are not
        part of the returned dict.
        """
        sample = self.chunked_samples[idx]

        return {
            'id': sample['id'],
            'source_chunks': sample['source_chunks'],
            'target_chunks': sample['target_chunks'],
            'source_text': sample['source_text'],
            'target_text': sample['target_text'],
            'num_source_chunks': len(sample['source_chunks']),
            'num_target_chunks': len(sample['target_chunks'])
        }

In [None]:
# Create dataset: chunk every parallel pair, keeping at most 10 chunks per recording
train_dataset = ChunkedS2STDataset(parallel_data=parallel_dataset, chunker=chunker, max_chunks_per_sample=10)
print(f'Training dataset created with {len(train_dataset)} samples')

Pre-computed chunks for 20 samples
Training dataset created with 20 samples


In [None]:
# Inspect the first sample: its source/target chunk lists, transcripts and chunk counts.
train_dataset[0]

{'id': 0,
 'source_chunks': [AudioChunk(audio=array([0.0008359 , 0.00117033, 0.00115254, ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=0, start_time=0.0, end_time=1.3375, sampling_rate=16000),
  AudioChunk(audio=array([-0.01307321,  0.02549626, -0.01774494, ...,  0.        ,
          0.        ,  0.        ], dtype=float32), chunk_id=1, start_time=0.5, end_time=1.3375, sampling_rate=16000),
  AudioChunk(audio=array([0.0122569 , 0.01207083, 0.01178565, ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=2, start_time=1.0, end_time=1.3375, sampling_rate=16000)],
 'target_chunks': [AudioChunk(audio=array([-1.2732926e-11, -4.5474735e-12,  1.4097168e-11, ...,
          8.1197126e-04,  7.2016800e-04,  5.6324806e-04], dtype=float32), chunk_id=0, start_time=0.0, end_time=2.0, sampling_rate=16000),
  AudioChunk(audio=array([1.0184628e-04, 1.8466072e-04, 2.0494346e-05, ..., 7.7348419e-02,
         7.1228340e-02, 6.6100359e-02], dtype=float32

In [None]:
# Create collate function
def collate_chunked_batch(batch):
    """Collate for single-sample batches: hand back the one sample unchanged.

    Samples carry a variable number of chunks, and merging several of them
    would need padding that is not implemented. Any batch that does not hold
    exactly one sample is therefore rejected with NotImplementedError.
    """
    # Guard first: only a batch of exactly one sample can be passed through.
    if len(batch) != 1:
        raise NotImplementedError('Batch size > 1 not yet supported')

    return batch[0]

In [None]:
# Create DataLoader
# batch_size stays at 1 because collate_chunked_batch raises NotImplementedError
# for any other batch size.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_chunked_batch, num_workers=0)

# Get one batch (shuffle=True, so which sample comes first varies between runs)
batch = next(iter(train_loader))
batch

{'id': 17,
 'source_chunks': [AudioChunk(audio=array([0.00225219, 0.00283589, 0.00261182, ..., 0.00253015, 0.00149526,
         0.00111748], dtype=float32), chunk_id=0, start_time=0.0, end_time=2.0, sampling_rate=16000),
  AudioChunk(audio=array([0.02935768, 0.0112407 , 0.022212  , ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=1, start_time=0.5, end_time=2.1125, sampling_rate=16000),
  AudioChunk(audio=array([0.02652753, 0.03057722, 0.02021658, ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=2, start_time=1.0, end_time=2.1125, sampling_rate=16000),
  AudioChunk(audio=array([0.09374263, 0.09572086, 0.10089844, ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=3, start_time=1.5, end_time=2.1125, sampling_rate=16000),
  AudioChunk(audio=array([0.00172371, 0.00146554, 0.0012197 , ..., 0.        , 0.        ,
         0.        ], dtype=float32), chunk_id=4, start_time=2.0, end_time=2.1125, sampling_rate=160