In [None]:
pip install librosa pandas torch torchaudio transformers tqdm numpy

In [None]:
import os
import subprocess

def convert_audio(input_dir, output_dir):
    """Convert every ``.wav`` file under ``input_dir`` to 16 kHz, 16-bit signed PCM.

    The directory layout below ``input_dir`` is mirrored under ``output_dir``
    and each converted file keeps its original file name. The conversion is
    delegated to the external ``sox`` command-line tool.

    Args:
        input_dir: Root directory walked recursively for ``.wav`` files.
        output_dir: Root directory receiving the converted files. It is
            created when missing, even if nothing ends up being converted.

    Raises:
        subprocess.CalledProcessError: If ``sox`` exits with a non-zero status.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(input_dir):
        wav_files = [name for name in files if name.endswith(".wav")]
        if not wav_files:
            # Folders without audio are not mirrored in the output tree.
            continue

        # Preserve the sub-directory layout relative to input_dir; the folder
        # is created once per directory rather than once per file.
        relative_path = os.path.relpath(root, input_dir)
        output_folder = os.path.join(output_dir, relative_path)
        os.makedirs(output_folder, exist_ok=True)

        for name in wav_files:
            input_path = os.path.join(root, name)
            output_path = os.path.join(output_folder, name)

            # Resample / re-encode with SoX
            command = [
                "sox", input_path,
                "-r", "16000",           # 16 kHz sample rate
                "-e", "signed-integer",  # signed-integer encoding
                "-b", "16",              # 16 bits per sample
                output_path,
            ]

            print(f"Converting: {input_path} -> {output_path}")
            subprocess.run(command, check=True)

# Source folders and the folders that receive the converted audio
train_input_dir, train_output_dir = "train", "train_converted"
test_input_dir, test_output_dir = "test", "test_converted"

# Convert the training audio first, then the test audio
for src_dir, dst_dir in (
    (train_input_dir, train_output_dir),
    (test_input_dir, test_output_dir),
):
    convert_audio(src_dir, dst_dir)

In [None]:
import os
import csv
import librosa
import numpy as np
from sklearn.model_selection import train_test_split

def load_lexicon(lexicon_path):
    """Read a pronunciation lexicon into a ``{word: [phone, ...]}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the word
    (e.g. ``ba``) and the remaining fields are its phones (e.g. ``b a``).
    When a word appears on several lines, the last occurrence wins.

    Args:
        lexicon_path: Path of the UTF-8 encoded lexicon file.

    Returns:
        dict mapping each word to the list of its phone strings (possibly
        empty when the line holds only the word).
    """
    lex_map = {}
    with open(lexicon_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Blank or whitespace-only line
                continue
            word, *phones = fields
            lex_map[word] = phones
    return lex_map

In [None]:
def text_to_phoneme_sequence(text, lex_map):
    """Expand a whitespace-separated transcript into a flat phoneme list.

    Args:
        text: Transcript such as ``"li be e mih kiann"``; tokens are
            separated by whitespace.
        lex_map: Mapping from a token to its phone sequence.

    Returns:
        list of phones. A token missing from ``lex_map`` is kept verbatim as
        a single symbol, so an incomplete lexicon silently introduces
        word-level tokens into the output.
    """
    phoneme_seq = []
    for token in text.split():
        # Unknown tokens fall back to themselves as a one-element sequence
        phoneme_seq.extend(lex_map.get(token, [token]))
    return phoneme_seq

In [None]:
def spec_augment(log_mel_spec, time_mask_num=1, freq_mask_num=1, time_mask_size=20, freq_mask_size=10):
    """Zero out random frequency bands and time spans of a spectrogram.

    Args:
        log_mel_spec: 2-D array unpacked as ``(n_mels, T)``. It is modified
            in place and also returned.
        time_mask_num: Number of time masks to apply.
        freq_mask_num: Number of frequency masks to apply.
        time_mask_size: Exclusive upper bound on the width of one time mask.
        freq_mask_size: Exclusive upper bound on the width of one frequency mask.

    Returns:
        The same array object, with the masked regions set to 0.
    """
    n_mels, T = log_mel_spec.shape

    def _random_span(axis_len, mask_size):
        # Clamp the width bound to the axis length so that short inputs (or a
        # mask size of 0) no longer make np.random.randint raise ValueError.
        max_width = min(mask_size, axis_len)
        if max_width <= 0:
            return 0, 0
        width = np.random.randint(0, max_width)
        start = np.random.randint(0, axis_len - width)
        return start, width

    # Frequency masks
    for _ in range(freq_mask_num):
        f_start, f = _random_span(n_mels, freq_mask_size)
        log_mel_spec[f_start:f_start + f, :] = 0

    # Time masks
    for _ in range(time_mask_num):
        t_start, t = _random_span(T, time_mask_size)
        log_mel_spec[:, t_start:t_start + t] = 0

    return log_mel_spec

In [None]:
def load_audio_features(wav_path, sr=16000, n_mels=80, frame_length=0.025, frame_shift=0.01):
    """Load an audio file and compute its log-Mel filterbank features.

    Args:
        wav_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        n_mels: Number of Mel bands.
        frame_length: Analysis window length in seconds.
        frame_shift: Hop between consecutive frames in seconds.

    Returns:
        Natural log of the Mel spectrogram, floored at 1e-10 before the log.
        Callers in this notebook treat it as shape (n_mels, T).
    """
    waveform, sr = librosa.load(wav_path, sr=sr)

    # Window and hop in samples; the FFT size equals the window length
    win_length = int(sr * frame_length)
    hop_length = int(sr * frame_shift)

    mel_kwargs = dict(
        y=waveform,
        sr=sr,
        n_fft=win_length,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmin=20,
        fmax=sr/2,
    )
    mel_spec = librosa.feature.melspectrogram(**mel_kwargs)

    # Floor the energies so that log(0) never occurs
    floored = np.maximum(mel_spec, 1e-10)
    return np.log(floored)

In [None]:
def preprocess_data(entries, wav_dir, lexicon_map):
    """Turn CSV rows into parallel lists of acoustic features and phoneme labels.

    Args:
        entries: Iterable of mappings with an ``'id'`` and a ``'text'`` key.
        wav_dir: Directory expected to contain ``<id>.wav`` for each row.
        lexicon_map: Lexicon passed on to ``text_to_phoneme_sequence``.

    Returns:
        ``(features_list, labels_list)``. Rows whose wav file does not exist
        are reported with a warning and left out of both lists.
    """
    features_list, labels_list = [], []
    for row in entries:
        utt_id, text = row['id'], row['text']
        wav_path = os.path.join(wav_dir, utt_id + '.wav')
        if os.path.exists(wav_path):
            feats = load_audio_features(wav_path)
            phoneme_seq = text_to_phoneme_sequence(text, lexicon_map)
            features_list.append(feats)
            labels_list.append(phoneme_seq)
        else:
            print(f"Warning: {wav_path} not found.")
    return features_list, labels_list

In [None]:
if __name__ == "__main__":
    lexicon_path = 'lexicon.txt'
    train_csv_path = 'train-toneless.csv'
    wav_dir = 'train_converted'

    # Load the pronunciation lexicon
    lex_map = load_lexicon(lexicon_path)

    # Read the whole training CSV (newline='' is what the csv module expects)
    with open(train_csv_path, 'r', encoding='utf-8', newline='') as f:
        entries = list(csv.DictReader(f))

    # Split: 90% for training, 10% for validation
    train_entries, valid_entries = train_test_split(entries, test_size=0.1, random_state=42)

    # Pre-process the training and validation subsets
    train_features, train_labels = preprocess_data(train_entries, wav_dir, lex_map)
    valid_features, valid_labels = preprocess_data(valid_entries, wav_dir, lex_map)

    # Build the phoneme -> id table from every phoneme seen in either subset.
    # Id 0 is reserved for the CTC blank: later cells in this notebook use
    # nn.CTCLoss(blank=0), pad label batches with 0 and decode with blank_id=0,
    # so a real phoneme must never be assigned id 0.
    blank_token = "<blank>"
    all_phones = set(p for seq in train_labels + valid_labels for p in seq)
    all_phones.discard(blank_token)
    phone2id = {blank_token: 0}
    phone2id.update({p: i for i, p in enumerate(sorted(all_phones), start=1)})

    # Convert the training / validation labels to id sequences
    train_labels_id = [[phone2id[p] for p in seq] for seq in train_labels]
    valid_labels_id = [[phone2id[p] for p in seq] for seq in valid_labels]

    # At this point the following are available for model training and validation:
    #   train_features, train_labels_id
    #   valid_features, valid_labels_id
    #   phone2id (includes the blank entry, so len(phone2id) is the number of output classes)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# 假設您已經由前處理程式碼取得:
# train_features, train_labels_id, valid_features, valid_labels_id, phone2id

class SpeechDataset(Dataset):
    """Pairs each feature entry with the label entry at the same index."""

    def __init__(self, features, labels):
        # Both sequences are stored as given; no copy is made
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset size is driven by the number of feature entries
        return len(self.features)

    def __getitem__(self, idx):
        # Returns a (feature, label) tuple for the requested index
        return self.features[idx], self.labels[idx]

def collate_fn(batch):
    """Zero-pad a list of ``(feature, label)`` pairs into batch tensors.

    Args:
        batch: List of ``(feat, label)`` where ``feat`` is a 2-D array indexed
            as (n_mels, T) and ``label`` is a sequence of integer ids.

    Returns:
        Tuple ``(feats_padded, labels_padded, input_lengths, target_lengths)``:
        float32 features of shape (B, 1, n_mels, max_T) with a singleton
        channel axis for the 2-D convolutions (frequency on axis 2, time on
        axis 3), int32 labels of shape (B, max_label_len), and the int32
        unpadded lengths of each feature / label.
    """
    feats, labels = zip(*batch)
    frame_counts = [f.shape[1] for f in feats]
    label_counts = [len(l) for l in labels]

    batch_size = len(batch)
    n_mels = feats[0].shape[0]

    feats_padded = np.zeros((batch_size, n_mels, max(frame_counts)), dtype=np.float32)
    labels_padded = np.zeros((batch_size, max(label_counts)), dtype=np.int32)

    # Copy every item into the top-left corner of its zero-filled slot
    for row, (f, l, n_frames, n_labels) in enumerate(zip(feats, labels, frame_counts, label_counts)):
        feats_padded[row, :, :n_frames] = f
        labels_padded[row, :n_labels] = l

    return (
        torch.from_numpy(feats_padded).unsqueeze(1),  # (B, 1, n_mels, T)
        torch.from_numpy(labels_padded),
        torch.tensor(frame_counts, dtype=torch.int32),
        torch.tensor(label_counts, dtype=torch.int32),
    )

# Wrap the pre-processed features and id-encoded labels
train_dataset = SpeechDataset(features=train_features, labels=train_labels_id)
valid_dataset = SpeechDataset(features=valid_features, labels=valid_labels_id)

# Only the training loader shuffles; both pad their batches through collate_fn
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

class DeepSpeech2(nn.Module):
    """Simplified DeepSpeech2-style acoustic model: 2-D conv front-end, GRU stack, linear output.

    Args:
        input_freq_dim: Size of the frequency axis of the input (number of Mel bands).
        num_classes: Number of output classes per frame.
        rnn_hidden_size: Hidden size of each GRU layer (per direction).
        rnn_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
    """

    def __init__(self, input_freq_dim, num_classes, rnn_hidden_size=512, rnn_layers=5, bidirectional=True):
        super().__init__()

        # Convolutional front-end. The original paper uses large kernels such
        # as (41, 11); this simplified version uses two 3x3 layers with
        # stride 2, each halving (rounding up) both frequency and time.
        # Input shape: (B, 1, n_mels, T)
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3,3), stride=(2,2), padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=(2,2), padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )

        # Probe the conv stack with a dummy tensor to obtain the reduced
        # frequency size. The probe runs in eval mode: in training mode the
        # BatchNorm layers would fold the all-zero dummy batch into their
        # running statistics (and bump num_batches_tracked) before any real
        # data has been seen.
        dummy_input = torch.zeros(1, 1, input_freq_dim, 200)  # T=200 is arbitrary
        was_training = self.conv.training
        self.conv.eval()
        with torch.no_grad():
            conv_out = self.conv(dummy_input)
        self.conv.train(was_training)
        _, c_out, f_out, _ = conv_out.shape

        # The GRU consumes (B, T, feature); channels and frequency are merged
        # into the feature axis.
        rnn_input_dim = c_out * f_out

        self.rnn = nn.GRU(input_size=rnn_input_dim, hidden_size=rnn_hidden_size,
                          num_layers=rnn_layers, batch_first=True,
                          bidirectional=bidirectional)

        factor = 2 if bidirectional else 1
        self.fc = nn.Linear(rnn_hidden_size * factor, num_classes)

    def forward(self, x):
        """Map features of shape (B, 1, F, T) to log-probabilities of shape (B, T', num_classes).

        T' is the time length after the two strided convolutions. Padded
        frames are processed like real ones; no length masking is applied.
        """
        x = self.conv(x)  # (B, c_out, f_out, t_out)

        # Move time next to batch, then merge channel and frequency:
        # (B, C, F, T) -> (B, T, C, F) -> (B, T, C*F)
        batch, channels, freq, steps = x.size()
        x = x.permute(0, 3, 1, 2).contiguous()
        x = x.view(batch, steps, channels * freq)

        x, _ = self.rnn(x)  # (B, T, hidden * factor)

        logits = self.fc(x)  # (B, T, num_classes)

        # CTC loss expects log-probabilities
        return nn.functional.log_softmax(logits, dim=-1)

n_mels = train_features[0].shape[0]
num_phones = len(phone2id)
# CTC blank index.
# NOTE(review): this assumes id 0 in phone2id is reserved for the blank symbol
# and not assigned to a real phoneme - confirm against the cell building phone2id.
blank_id = 0

# Fall back to CPU when no GPU is available instead of failing inside .cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DeepSpeech2(input_freq_dim=n_mels, num_classes=num_phones, rnn_hidden_size=512, rnn_layers=5, bidirectional=True)
model = model.to(device)

criterion = nn.CTCLoss(blank=blank_id, zero_infinity=True)
optimizer = optim.Adam(model.parameters(), lr=1e-3)


def conv_output_lengths(conv_stack, lengths):
    """Return the time lengths left after every Conv2d layer in ``conv_stack``.

    Applies the Conv2d output-size formula along the time axis (last axis of
    the (B, C, F, T) input) using each layer's own kernel, stride, padding and
    dilation. For the kernel-3 / stride-2 / padding-1 layers used here this
    yields ceil(T / 4), whereas ``T // 4`` drops a valid frame whenever T is
    not a multiple of 4.
    """
    for layer in conv_stack:
        if isinstance(layer, nn.Conv2d):
            kernel = layer.kernel_size[1]
            stride = layer.stride[1]
            padding = layer.padding[1]
            dilation = layer.dilation[1]
            lengths = torch.div(
                lengths + 2 * padding - dilation * (kernel - 1) - 1,
                stride,
                rounding_mode="floor",
            ) + 1
    return lengths


def run_epoch(loader, training):
    """Run one pass over ``loader`` and return the mean per-batch CTC loss.

    With ``training=True`` the model is put in train mode and the optimizer is
    stepped after each batch; otherwise the model is evaluated without gradients.
    """
    model.train(training)
    total_loss = 0
    with torch.set_grad_enabled(training):
        for feats_padded, labels_padded, input_lengths, target_lengths in loader:
            # feats_padded: (B, 1, n_mels, T)
            feats_padded = feats_padded.to(device)
            labels_padded = labels_padded.to(device)
            input_lengths = input_lengths.to(device)
            target_lengths = target_lengths.to(device)

            if training:
                optimizer.zero_grad()

            log_probs = model(feats_padded)  # (B, T', num_classes)

            # The convolutions shorten the time axis, so the CTC input lengths
            # must be the post-convolution lengths.
            new_input_lengths = conv_output_lengths(model.conv, input_lengths).to(torch.int32)

            # CTCLoss expects (T, B, C)
            log_probs = log_probs.transpose(0, 1)
            loss = criterion(log_probs, labels_padded, new_input_lengths, target_lengths)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

    return total_loss / len(loader)


num_epochs = 10
for epoch in range(num_epochs):
    train_loss = run_epoch(train_loader, training=True)
    valid_loss = run_epoch(valid_loader, training=False)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}")

# Save the model weights once training has finished
torch.save(model.state_dict(), "deepspeech2_model.pth")


In [None]:
import os
import csv
import torch
import torch.nn as nn
import numpy as np
import librosa

# Expected to exist from earlier cells:
#   model    - the DeepSpeech2 network (its trained weights are loaded at the end of this cell)
#   phone2id - {phone_str: id_int}
# NOTE(review): decoding below treats id 0 as the CTC blank; confirm that
# phone2id does not assign id 0 to a real phoneme.
# `device` (a torch.device) is defined at the end of this cell.
id2phone = dict((idx, phone) for phone, idx in phone2id.items())

def load_audio_features(wav_path, sr=16000, n_mels=80, frame_length=0.025, frame_shift=0.01):
    """Load an audio file and compute its log-Mel filterbank features.

    NOTE(review): this duplicates the ``load_audio_features`` defined in an
    earlier cell of this notebook; the two must stay identical so training
    and inference see the same features.

    Args:
        wav_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.
        n_mels: Number of Mel bands.
        frame_length: Analysis window length in seconds.
        frame_shift: Hop between consecutive frames in seconds.

    Returns:
        Natural log of the Mel spectrogram, floored at 1e-10 before the log.
        The caller below treats it as shape (n_mels, T).
    """
    waveform, sr = librosa.load(wav_path, sr=sr)

    # Window and hop in samples; the FFT size equals the window length
    win_length = int(sr * frame_length)
    hop_length = int(sr * frame_shift)

    mel_kwargs = dict(
        y=waveform,
        sr=sr,
        n_fft=win_length,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmin=20,
        fmax=sr/2,
    )
    mel_spec = librosa.feature.melspectrogram(**mel_kwargs)

    # Floor the energies so that log(0) never occurs
    floored = np.maximum(mel_spec, 1e-10)
    return np.log(floored)

def ctc_greedy_decode(logits, blank_id=0):
    """Greedy (best-path) CTC decoding.

    Args:
        logits: Tensor of shape (T, batch, output_dim) holding per-frame scores.
        blank_id: Index of the CTC blank class.

    Returns:
        One list of int ids per batch item: the per-frame argmax with
        consecutive repeats collapsed and blanks removed.
    """
    best_path = torch.argmax(logits, dim=-1).transpose(0, 1)  # (batch, T)

    results = []
    for frame_ids in best_path:
        ids = frame_ids.tolist()
        # Pair each frame with its predecessor; the first frame is compared
        # against the blank.
        previous = [blank_id] + ids[:-1]
        results.append([
            cur for prev, cur in zip(previous, ids)
            if cur != blank_id and cur != prev
        ])
    return results

def inference(model, test_wav_dir, sample_csv_path, output_csv_path):
    """Decode every utterance listed in a CSV and write the predictions to a CSV.

    Relies on the notebook-level ``load_audio_features``, ``ctc_greedy_decode``
    and ``id2phone``.

    Args:
        model: Network mapping (1, 1, n_mels, T) features to (B, T', C)
            log-probabilities. It is switched to eval mode.
        test_wav_dir: Directory expected to contain ``<id>.wav`` files.
        sample_csv_path: Input CSV with an ``id`` column.
        output_csv_path: Output CSV written with the columns ``id`` and ``text``.
            Utterances whose wav file is missing get an empty ``text``.
    """
    with open(sample_csv_path, 'r', encoding='utf-8') as f:
        test_entries = list(csv.DictReader(f))

    # Run on the device the model actually lives on rather than on a global
    # `device` that may not exist yet or may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    predictions = []
    with torch.no_grad():
        for entry in test_entries:
            utt_id = entry["id"]
            wav_path = os.path.join(test_wav_dir, utt_id + ".wav")
            if not os.path.exists(wav_path):
                print(f"Warning: {wav_path} not found.")
                predictions.append({"id": utt_id, "text": ""})
                continue

            feats = load_audio_features(wav_path)  # (n_mels, T)
            # Add batch and channel axes: (n_mels, T) -> (1, 1, n_mels, T)
            feats_t = torch.tensor(feats, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(model_device)

            # The model's forward already applies log_softmax, so its output
            # is used as log-probabilities directly: (B, T', C)
            log_probs = model(feats_t)

            # The decoder expects (T', B, C)
            log_probs = log_probs.transpose(0, 1)
            pred_ids = ctc_greedy_decode(log_probs, blank_id=0)[0]

            # "iNULL" is dropped from the emitted text
            pred_phones = [id2phone[i] for i in pred_ids if id2phone[i] != "iNULL"]
            pred_text = " ".join(pred_phones)
            predictions.append({"id": utt_id, "text": pred_text})

    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["id", "text"])
        writer.writeheader()
        writer.writerows(predictions)

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the trained weights saved by the training cell
checkpoint = torch.load("deepspeech2_model.pth", map_location=device)
model.load_state_dict(checkpoint)
model.to(device)

# Run inference over the test set and write the predictions CSV
inference(
    model,
    "test_converted",
    "sample.csv",
    "sample_DeepSpeech2.csv",
)
