In [None]:
!mkdir -p ~/.kaggle

# Move kaggle.json to the correct folder
!mv kaggle.json ~/.kaggle/
!kaggle competitions download -c audio-understanding
!mkdir -p /content/kaggle_competition

In [None]:
# Quietly (-q) extract the competition archive into the data folder created above.
!unzip -q /content/audio-understanding.zip -d /content/kaggle_competition

In [None]:
!pip install librosa soundfile noisereduce torchaudio datasets evaluate transformers \
    speechbrain git+https://github.com/snakers4/silero-vad.git plotly PyWavelets scipy
!pip install --upgrade voicefixer
!pip install jiwer
!pip install voicefixer
!pip install webrtcvad
!pip install polars

In [1]:
# import webrtcvad

import collections
import os
import random
import string
from pathlib import Path

import jiwer
import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import polars as pl
import pywt
import torch
import torchaudio
from numpy.fft import fft, fftfreq
from scipy.signal import butter, lfilter
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

# Order-sensitive imports: two pairs of libraries export the same name and the
# later import wins. The relative order is kept exactly as before, and an
# unambiguous alias is provided for each side of both clashes.
from IPython.display import display, Audio
from IPython.display import Audio as IPyAudio  # `Audio` is re-bound by `datasets` below
from datasets import load_dataset, Audio, ClassLabel, DatasetDict
from speechbrain.pretrained import SpectralMaskEnhancement
from speechbrain.pretrained import SepformerSeparation as separator
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)
from transformers import pipeline as hf_pipeline  # `pipeline` is re-bound by modelscope below
import speechbrain
from modelscope.pipelines import pipeline
from modelscope.pipelines import pipeline as ms_pipeline
from modelscope.utils.constant import Tasks
from transformers import AutoModelForCausalLM, AutoTokenizer
import dolphin


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data

## EDA

In [None]:
# Choose one random test clip to inspect.
folder = "/content/kaggle_competition/speechs/speechs/test"
wav_files = list(filter(lambda name: name.endswith(".wav"), os.listdir(folder)))
random_file = random.choice(wav_files)

print(random_file)

In [None]:
# Load the chosen clip and bring it to the 16 kHz rate used throughout the notebook.
waveform, native_sample_rate = torchaudio.load(os.path.join(folder, random_file))
resampler = torchaudio.transforms.Resample(orig_freq=native_sample_rate, new_freq=16000)
waveform = resampler(waveform)
# `sample_rate` must describe the waveform that later cells actually use.
# It previously kept the file's native rate, which skewed frequency axes and
# filter designs whenever the file was not already at 16 kHz.
sample_rate = 16000

In [None]:
# Display the resampled tensor next to the current value of `sample_rate`.
waveform, sample_rate

In [None]:
# Plot the first channel against time in seconds (the clip was resampled to 16 kHz).
wave = waveform[0].numpy()
x = np.arange(len(wave)) / 16000  # time axis in seconds

fig = go.Figure(data=go.Scatter(x=x, y=wave, mode='lines', name='Waveform'))
fig.update_layout(
    title='Audio Waveform',
    xaxis_title='Time (s)',
    yaxis_title='Amplitude',
)
fig.show()

In [None]:
# Play the clip. The bare name `Audio` refers to `datasets.Audio` after the
# imports cell, so the IPython player is used through its explicit alias.
IPyAudio(waveform.numpy(), rate=16000)

In [None]:
# Visualization
def visualize_sound_3in1(waveform, waveclean, sample_rate=16000):
    """Compare an original and a processed clip: waveform plot, spectrum, playback.

    Args:
        waveform: Original audio as a torch tensor or NumPy array. Singleton
            dimensions are squeezed away.
        waveclean: Processed audio in either format. It may have a different
            length from ``waveform``.
        sample_rate: Sampling rate in Hz, used for both axes and for playback.
    """
    def _as_mono(signal):
        # The denoisers in this notebook return a mix of NumPy arrays and
        # (possibly GPU) tensors; bring both to a squeezed float32 CPU array.
        if isinstance(signal, torch.Tensor):
            signal = signal.detach().cpu().numpy()
        return np.asarray(signal, dtype=np.float32).squeeze()

    original = _as_mono(waveform)
    cleaned = _as_mono(waveclean)

    # 1. Time domain comparison (each trace gets its own axis: lengths may differ)
    t_original = np.arange(len(original)) / sample_rate
    t_cleaned = np.arange(len(cleaned)) / sample_rate
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=t_original, y=original, mode='lines', name='Original'))
    fig.add_trace(go.Scatter(x=t_cleaned, y=cleaned, mode='lines', name='Filtered'))
    fig.update_layout(title='Filtered Audio (Time Domain)', xaxis_title='Time (s)', yaxis_title='Amplitude')
    fig.show()

    # 2. Frequency domain (processed signal only)
    magnitude_spectrum = np.abs(np.fft.rfft(cleaned))
    frequencies = np.fft.rfftfreq(len(cleaned), d=1 / sample_rate)
    fig = go.Figure(data=go.Scatter(x=frequencies, y=magnitude_spectrum, mode='lines'))
    fig.update_layout(
        title='FFT Magnitude Spectrum (Filtered)',
        xaxis_title='Frequency (Hz)',
        yaxis_title='Magnitude',
        hovermode='x unified',
        template='plotly_white'
    )
    fig.show()

    # 3. Playback (only works in Jupyter/IPython). `IPyAudio` is the IPython
    # player; the bare name `Audio` may refer to `datasets.Audio`.
    display(IPyAudio(original, rate=sample_rate))
    display(IPyAudio(cleaned, rate=sample_rate))

### FFT

In [None]:
# Work on a squeezed copy: re-binding `waveform` itself would turn it into a
# 1-D tensor and break later cells that index the first channel as `waveform[0]`.
waveform_mono = waveform.squeeze()

fft_output = torch.fft.rfft(waveform_mono)

magnitude_spectrum = torch.abs(fft_output)

num_bins = len(magnitude_spectrum)
# The clip was resampled to 16 kHz above, so the spectrum spans 0..8 kHz.
frequencies = torch.linspace(0, 16000 / 2, num_bins)

# Convert to numpy for Plotly (Plotly can handle tensors too, but numpy is common)
frequencies_np = frequencies.numpy()
magnitude_spectrum_np = magnitude_spectrum.numpy()

# Plotting with Plotly
fig = go.Figure(data=go.Scatter(x=frequencies_np, y=magnitude_spectrum_np, mode='lines'))

fig.update_layout(
    title='FFT Magnitude Spectrum',
    xaxis_title='Frequency (Hz)',
    yaxis_title='Magnitude',
    hovermode='x unified', # Shows a tooltip across the x-axis for all traces
    template='plotly_white' # A clean white background template
)

fig.show()

## Signal

### Bandpass

In [None]:
# Band-pass helpers
def get_dominant_frequency(signal, sample_rate=16000):
    """Return the frequency (Hz) of the strongest bin in the positive half of the FFT.

    Args:
        signal: 1-D sequence of samples.
        sample_rate: Sampling rate of ``signal`` in Hz.

    Returns:
        The frequency whose FFT magnitude is largest among the first
        ``len(signal) // 2`` bins (bin 0, the DC component, is included).
    """
    half = len(signal) // 2
    magnitudes = np.abs(fft(signal))[:half]
    bin_frequencies = fftfreq(len(signal), 1 / sample_rate)[:half]
    return bin_frequencies[np.argmax(magnitudes)]

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sampling rate in Hz.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        ``(b, a)`` numerator and denominator coefficient arrays.

    Raises:
        ValueError: If the band is empty once the edges are clamped.
    """
    nyquist = 0.5 * fs
    # scipy requires 0 < Wn < 1 for digital filters. Callers derive the band
    # from a measured spectral peak, so an edge can land on or past Nyquist;
    # clamp it slightly inside the valid range instead of failing.
    margin = 1e-3
    low = max(lowcut / nyquist, margin)
    high = min(highcut / nyquist, 1.0 - margin)
    if low >= high:
        raise ValueError(
            f"Empty pass band: lowcut={lowcut} Hz, highcut={highcut} Hz at fs={fs} Hz"
        )
    b, a = butter(order, [low, high], btype='band')
    return b, a

def apply_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Filter ``data`` with a Butterworth band-pass between ``lowcut`` and ``highcut`` Hz.

    The coefficients come from ``butter_bandpass`` and are applied with a
    single forward pass of ``scipy.signal.lfilter``.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(*coefficients, data)

In [None]:
# Band-pass around the strongest spectral peak: 1 kHz below it (never under
# 20 Hz) up to 2 kHz above it.
wavelet_fft = waveform.squeeze()

highest_freq = get_dominant_frequency(wavelet_fft, 16000)
lowcut = max(20, highest_freq - 1000)
wavelet_clean = apply_bandpass_filter(
    data=wavelet_fft,
    lowcut=lowcut,
    highcut=highest_freq + 2000,
    fs=sample_rate,
)

visualize_sound_3in1(wavelet_fft, wavelet_clean)

### NoiseReducer

In [None]:
# Noise reduction with noisereduce in non-stationary mode on the squeezed clip.
wavelet_fft = waveform.squeeze()

wavelet_clean = nr.reduce_noise(
    y=wavelet_fft,
    sr=sample_rate,
    prop_decrease=0.95,
    stationary=False,
    use_tqdm=False,
    n_fft=2048,
    win_length=None,
    hop_length=512,
)
visualize_sound_3in1(wavelet_fft, wavelet_clean)

### Wavelet

In [None]:
def wavelet_denoise(data, wavelet='db4', level=1):
    """Denoise a signal by soft-thresholding its wavelet detail coefficients.

    Args:
        data: Samples as a NumPy array or CPU tensor; the transform runs along
            the last axis.
        wavelet: Wavelet name passed to PyWavelets.
        level: Which coefficient band, counted from the end of the
            decomposition, is used to estimate the noise level. The
            decomposition depth itself is left to PyWavelets.

    Returns:
        NumPy array with the same number of samples as ``data``.
    """
    samples = np.asarray(data)
    n_samples = samples.shape[-1]
    coeff = pywt.wavedec(samples, wavelet, mode="per")

    # Noise estimate from the median absolute value of the selected band.
    sigma = np.median(np.abs(coeff[-level])) / 0.6745
    # Universal threshold scaled by 0.8; uses the sample count, not len(data),
    # so a (channels, samples) input is thresholded like a 1-D one.
    uthresh = sigma * np.sqrt(2 * np.log(n_samples)) * 0.8

    # Leave the approximation band (coeff[0]) untouched.
    coeff[1:] = [pywt.threshold(band, value=uthresh, mode='soft') for band in coeff[1:]]
    # Periodization pads odd-length inputs by one sample; trim the
    # reconstruction back to the input length.
    return pywt.waverec(coeff, wavelet, mode="per")[..., :n_samples]

# Run the wavelet denoiser on the squeezed clip and compare before/after.
waveform_fft = waveform.squeeze()

wavelet_clean = wavelet_denoise(data=waveform_fft)
visualize_sound_3in1(waveform=waveform_fft, waveclean=wavelet_clean)

### Mel-Spectrogram

In [None]:
import librosa.display
import matplotlib.pyplot as plt

# Magnitude STFT of the first channel, in dB relative to its own peak.
S = librosa.stft(waveform[0].numpy())
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# Before
plt.figure(figsize=(12, 4))
librosa.display.specshow(S_db, sr=16000, x_axis='time', y_axis='hz')
plt.title('Original Spectrogram')
plt.colorbar()
plt.show()

In [None]:
import librosa.display
import matplotlib.pyplot as plt

# `waveform_clean` was never defined anywhere; the denoised signal produced by
# the cells above is `wavelet_clean`, which may be a NumPy array or a tensor.
if isinstance(wavelet_clean, torch.Tensor):
    clean_signal = wavelet_clean.detach().cpu().numpy()
else:
    clean_signal = np.asarray(wavelet_clean)
clean_signal = np.squeeze(clean_signal).astype(np.float32)

S = librosa.stft(clean_signal)
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# After
plt.figure(figsize=(12, 4))
librosa.display.specshow(S_db, sr=16000, x_axis='time', y_axis='hz')
plt.title('Denoised Spectrogram')
plt.colorbar()
plt.show()

In [None]:
# Mel spectrogram of the denoised clip. `waveform_clean` was never defined;
# the denoised signal produced above is `wavelet_clean` (array or tensor).
if isinstance(wavelet_clean, torch.Tensor):
    clean_signal = wavelet_clean.detach().cpu().numpy()
else:
    clean_signal = np.asarray(wavelet_clean)
clean_signal = np.squeeze(clean_signal).astype(np.float32)

S = librosa.feature.melspectrogram(y=clean_signal, sr=16000, n_fft=2048, hop_length=512, n_mels=128)
# melspectrogram returns power, so convert with power_to_db exactly once.
S_db = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(12, 4))
# Rows are mel bands, so the y axis must be 'mel' rather than linear Hz.
librosa.display.specshow(S_db, sr=16000, hop_length=512, x_axis='time', y_axis='mel')
plt.title('Mel Spectrogram (Denoised)')
plt.colorbar(format='%+2.0f dB')
plt.show()

## Speechmodel

### Denoising

### Speechbrain

In [None]:
# Check the shape of the current waveform tensor.
waveform.shape

In [None]:
# Load one fixed test clip and bring it to 16 kHz.
waveform, native_sample_rate = torchaudio.load('/content/kaggle_competition/speechs/speechs/test/682ab42b-441c-4089-a508-96fda0104e35.wav')
resampler = torchaudio.transforms.Resample(orig_freq=native_sample_rate, new_freq=16000)
waveform = resampler(waveform)
# Keep `sample_rate` in step with the resampled waveform; it previously
# retained the file's native rate.
sample_rate = 16000

In [None]:
# Load the pretrained enhancement model; files are cached under `savedir`.
enhancer = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="tmpdir_metricgan"
)
# Enhance
# Assumes waveform has shape [1, time] (one clip in the batch) — TODO confirm for stereo files
lengths = torch.tensor([1.0])  # Relative length 1.0: the full clip is used

# Enhance the audio
wavelet_clean = enhancer.enhance_batch(waveform, lengths)


# Compare the input against the enhanced output.
visualize_sound_3in1(waveform, wavelet_clean)


### VoiceFixer

In [None]:
# VoiceFixer is not part of the notebook's import cell, so import it here.
from voicefixer import VoiceFixer

# Initialize the model.
# NOTE(review): the VoiceFixer constructor is understood to load the pretrained
# weights itself (the class has no `load_model()` method) — confirm against the
# installed version.
voicefixer = VoiceFixer()

# Set your file paths
input_path = "thai_input.wav"      # Path to your Thai audio
output_path = "thai_output.wav"    # Path to save enhanced audio

# Run inference (restore)
voicefixer.restore(input_path, output_path, cuda=False)

# Preprocess

### function

In [2]:
# Wavelet function
def wavelet_denoise(data, wavelet='db4', level=1):
    """Denoise a signal by soft-thresholding its wavelet detail coefficients.

    Args:
        data: Samples as a NumPy array or CPU tensor; the transform runs along
            the last axis.
        wavelet: Wavelet name passed to PyWavelets.
        level: Which coefficient band, counted from the end of the
            decomposition, is used to estimate the noise level. The
            decomposition depth itself is left to PyWavelets.

    Returns:
        NumPy array with the same number of samples as ``data``.
    """
    samples = np.asarray(data)
    n_samples = samples.shape[-1]
    coeff = pywt.wavedec(samples, wavelet, mode="per")

    # Noise estimate from the median absolute value of the selected band.
    sigma = np.median(np.abs(coeff[-level])) / 0.6745
    # Universal threshold scaled by 0.8; uses the sample count, not len(data),
    # so a (channels, samples) input is thresholded like a 1-D one.
    uthresh = sigma * np.sqrt(2 * np.log(n_samples)) * 0.8

    # Leave the approximation band (coeff[0]) untouched.
    coeff[1:] = [pywt.threshold(band, value=uthresh, mode='soft') for band in coeff[1:]]
    # Periodization pads odd-length inputs by one sample; trim the
    # reconstruction back to the input length.
    return pywt.waverec(coeff, wavelet, mode="per")[..., :n_samples]

In [3]:
# Band-pass helpers
def get_dominant_frequency(signal, sample_rate=16000):
    """Return the frequency (Hz) of the strongest bin in the positive half of the FFT.

    Args:
        signal: 1-D sequence of samples.
        sample_rate: Sampling rate of ``signal`` in Hz.

    Returns:
        The frequency whose FFT magnitude is largest among the first
        ``len(signal) // 2`` bins (bin 0, the DC component, is included).
    """
    half = len(signal) // 2
    magnitudes = np.abs(fft(signal))[:half]
    bin_frequencies = fftfreq(len(signal), 1 / sample_rate)[:half]
    return bin_frequencies[np.argmax(magnitudes)]

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz.
        fs: Sampling rate in Hz.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        ``(b, a)`` numerator and denominator coefficient arrays.

    Raises:
        ValueError: If the band is empty once the edges are clamped.
    """
    nyquist = 0.5 * fs
    # scipy requires 0 < Wn < 1 for digital filters. Callers derive the band
    # from a measured spectral peak, so an edge can land on or past Nyquist;
    # clamp it slightly inside the valid range instead of failing.
    margin = 1e-3
    low = max(lowcut / nyquist, margin)
    high = min(highcut / nyquist, 1.0 - margin)
    if low >= high:
        raise ValueError(
            f"Empty pass band: lowcut={lowcut} Hz, highcut={highcut} Hz at fs={fs} Hz"
        )
    b, a = butter(order, [low, high], btype='band')
    return b, a

def apply_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Filter ``data`` with a Butterworth band-pass between ``lowcut`` and ``highcut`` Hz.

    The coefficients come from ``butter_bandpass`` and are applied with a
    single forward pass of ``scipy.signal.lfilter``.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(*coefficients, data)

In [4]:
# Load the enhancement model from the local checkpoint directory onto whichever
# device is actually available. A hard-coded "cuda" fails on CPU-only machines
# even though the rest of the notebook falls back to the CPU.
enhancer = SpectralMaskEnhancement.from_hparams(
    source="/project/ai901504-ai0004/501641_Big/week4/metricgan-plus-voicebank",
    savedir="/project/ai901504-ai0004/501641_Big/week4/tmpdir_metricgan",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"}
)

def deep_speech(waveform):
    """Enhance one clip with the module-level ``enhancer`` model.

    Args:
        waveform: NumPy array or torch tensor. A 1-D input is treated as a
            single clip and given a leading batch dimension.

    Returns:
        The tensor returned by ``enhancer.enhance_batch`` for the clip.
    """
    batch = torch.from_numpy(waveform) if isinstance(waveform, np.ndarray) else waveform
    if batch.ndim == 1:
        batch = batch.unsqueeze(0)  # [N] -> [1, N]
    batch = batch.float().to(device)  # float32 on the module-level device

    # Relative length 1.0: the full clip is used.
    full_length = torch.tensor([1.0]).to(device)

    return enhancer.enhance_batch(batch, full_length)

In [5]:
from scipy.io.wavfile import write

def voice_fixer(waveform):
    """Restore a 16 kHz clip with VoiceFixer and return it as a 16 kHz tensor.

    Args:
        waveform: NumPy array or torch tensor of float samples at 16 kHz,
            either 1-D or shaped [channels, samples].

    Returns:
        Restored audio as a [channels, samples] tensor at 16 kHz.

    Raises:
        TypeError: If ``waveform`` is neither a NumPy array nor a tensor.
    """
    # Local imports: neither name is part of the notebook's import cell.
    import tempfile
    from voicefixer import VoiceFixer

    if isinstance(waveform, np.ndarray):
        waveform = torch.from_numpy(waveform)
    elif not isinstance(waveform, torch.Tensor):
        raise TypeError("Input waveform must be a NumPy array or PyTorch tensor")

    # torchaudio.save expects a 2-D [channels, samples] CPU tensor.
    waveform = waveform.detach().cpu().float()
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    # Build the model once and reuse it across calls.
    if not hasattr(voice_fixer, "_model"):
        voice_fixer._model = VoiceFixer()

    # Write and read through one temporary directory so both paths always
    # agree. The old code wrote a relative "output.wav" but read
    # "/content/output.wav", which only matched when the cwd was /content.
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, "output.wav")
        restored_path = os.path.join(workdir, "process.wav")
        torchaudio.save(source_path, waveform, sample_rate=16000)
        voice_fixer._model.restore(source_path, restored_path, cuda=False)
        restored, restored_rate = torchaudio.load(restored_path)

    # The restored file is not guaranteed to be at 16 kHz
    # (NOTE(review): VoiceFixer is believed to write 44.1 kHz — confirm).
    # Bring it back to the rate the rest of the notebook assumes.
    if restored_rate != 16000:
        restored = torchaudio.functional.resample(restored, restored_rate, 16000)

    return restored

In [6]:
from IPython.display import Audio, display

# Visualization
def visualize_sound_3in1(waveform, waveclean, sample_rate=16000):
    """Compare an original and a processed clip: waveform plot, spectrum, playback.

    Args:
        waveform: Original audio as a torch tensor or NumPy array. Singleton
            dimensions are squeezed away.
        waveclean: Processed audio in either format. It may have a different
            length from ``waveform``.
        sample_rate: Sampling rate in Hz, used for both axes and for playback.
    """
    def _as_mono(signal):
        # The denoisers in this notebook return a mix of NumPy arrays and
        # (possibly GPU) tensors; calling `.numpy()` directly fails on both.
        # Bring everything to a squeezed float32 CPU array first.
        if isinstance(signal, torch.Tensor):
            signal = signal.detach().cpu().numpy()
        return np.asarray(signal, dtype=np.float32).squeeze()

    original = _as_mono(waveform)
    cleaned = _as_mono(waveclean)

    # 1. Time domain comparison (each trace gets its own axis: lengths may differ)
    x1 = np.arange(len(original)) / sample_rate
    x2 = np.arange(len(cleaned)) / sample_rate
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x1, y=original, mode='lines', name='Original'))
    fig.add_trace(go.Scatter(x=x2, y=cleaned, mode='lines', name='Filtered'))
    fig.update_layout(title='Filtered Audio (Time Domain)', xaxis_title='Time (s)', yaxis_title='Amplitude')
    fig.show()

    # 2. Frequency domain (processed signal only)
    magnitude_spectrum = np.abs(np.fft.rfft(cleaned))
    frequencies = np.fft.rfftfreq(len(cleaned), d=1 / sample_rate)
    fig = go.Figure(data=go.Scatter(x=frequencies, y=magnitude_spectrum, mode='lines'))
    fig.update_layout(
        title='FFT Magnitude Spectrum (Filtered)',
        xaxis_title='Frequency (Hz)',
        yaxis_title='Magnitude',
        hovermode='x unified',
        template='plotly_white'
    )
    fig.show()

    # 3. Playback (only works in Jupyter/IPython)
    display(Audio(original, rate=sample_rate))
    display(Audio(cleaned, rate=sample_rate))


In [7]:
def normalize_waveform(waveform):
    """Scale a waveform so that its largest absolute sample equals 1.

    Args:
        waveform: Torch tensor, or anything ``torch.tensor`` can convert.

    Returns:
        The scaled tensor. An all-zero input is returned unscaled.
    """
    signal = waveform if isinstance(waveform, torch.Tensor) else torch.tensor(waveform)

    peak = signal.abs().max()
    # A silent clip has no peak to divide by; hand it back untouched.
    return signal if peak == 0 else signal / peak

In [8]:
def noise_reduce(waveform, sr=None):
  """Classical denoising chain: wavelet -> band-pass -> noisereduce -> normalize.

  Args:
    waveform: Audio tensor; singleton dimensions are squeezed away.
    sr: Sampling rate of ``waveform`` in Hz. When omitted, the module-level
      ``sample_rate`` is used, which is what this function always read
      implicitly before.

  Returns:
    Peak-normalized tensor produced by ``normalize_waveform``.
  """
  if sr is None:
    # Backward-compatible fallback to the notebook-level variable.
    sr = sample_rate

  # Change to the right format
  wavelet_clean = waveform.squeeze()

  # 1. Start with wavelet thresholding
  wavelet_clean = wavelet_denoise(wavelet_clean)

  # 2. Band-pass around the dominant frequency. The same rate is now used for
  # peak detection and for the filter design (peak detection used a hard-coded
  # 16000 before), and the upper edge is kept below Nyquist so the filter
  # design cannot fail for high-frequency peaks.
  highest_freq = get_dominant_frequency(wavelet_clean, sr)
  lowcut = max(300, highest_freq - 1000)
  highcut = min(max(4000, highest_freq + 1000), 0.49 * sr)
  wavelet_clean = apply_bandpass_filter(wavelet_clean, lowcut, highcut, sr)

  # 3. deep_speech (disabled)
  # wavelet_clean = deep_speech(wavelet_clean)

  # 4. Stationary noise reduction
  wavelet_clean = nr.reduce_noise(y=wavelet_clean, sr=sr, prop_decrease=0.6, stationary=True, use_tqdm=False, n_fft=2048, win_length=1536, hop_length=512)

  # 5. Normalize
  wavelet_clean = normalize_waveform(wavelet_clean)

  return wavelet_clean

In [9]:
def deep_noise_reduce(waveform):
  """Denoising chain: wavelet thresholding -> neural enhancement -> normalize.

  Args:
    waveform: Audio tensor; singleton dimensions are squeezed away.

  Returns:
    The normalized output of ``deep_speech``.
  """
  # 1. Wavelet thresholding on the squeezed signal
  denoised = wavelet_denoise(waveform.squeeze())

  # 2. Neural enhancement
  enhanced = deep_speech(denoised)

  # 3. Peak normalization
  return normalize_waveform(enhanced)

In [None]:
# Disabled example kept as a bare string literal: it is evaluated as an
# expression and never executed as code.
'''from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='/project/ai901504-ai0004/501641_Big/week4/speech_dfsmn_ans_psm_48k_causal')
result = ans(
    '/content/speechs/speechs/test/1460a0f9-fbb6-4db1-a3af-582b93e79e6d.wav',
    output_path='output.wav')'''

### sampling

In [None]:
# waveform, sample_rate = torchaudio.load('/content/kaggle_competition/speechs/speechs/test/1460a0f9-fbb6-4db1-a3af-582b93e79e6d.wav')
# resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
# waveform = resampler(waveform)

In [None]:
folder = "/project/ai901504-ai0004/501641_Big/week4/speechs/speechs/test"
wav_files = [f for f in os.listdir(folder) if f.endswith(".wav")]
random_file = random.choice(wav_files)

print(random_file)

# The random clip used to be loaded and then immediately overwritten by the
# fixed clip below, so only the fixed clip is loaded now.
# Set `sample_file = random_file` to audition the random pick instead.
sample_file = '4a656e78-62b6-4277-a30b-9cfedbc3a184.wav'

waveform, native_sample_rate = torchaudio.load(os.path.join(folder, sample_file))
resampler = torchaudio.transforms.Resample(orig_freq=native_sample_rate, new_freq=16000)
waveform = resampler(waveform)
# Keep `sample_rate` in step with the resampled waveform.
sample_rate = 16000

# ASR

In [None]:
# Load the Dolphin model from the earlier file (the local checkpoint directory below).
import dolphin
import IPython.display as ipd
import librosa
import os
# NOTE(review): this re-binds `tqdm` from the notebook widget to the plain text bar.
from tqdm import tqdm
from pathlib import Path

# Arguments: model size, local model directory, device.
model = dolphin.load_model("small",
                           "/project/ai901504-ai0004/501641_Big/week4/DataoceanAI-dolphin-small",
                           "cuda")


In [None]:
MODEL_NAME = "/project/ai901504-ai0004/501641_Big/week4/faster-whisper-large-v2-th"
# NOTE(review): `lang` is not passed to the pipeline — confirm whether the
# decoding language should be forced.
lang = "th"

# Keep the pipeline's device spec in its own name. Re-binding the global
# `device` would silently change what deep_speech() moves tensors to.
asr_device = 0 if torch.cuda.is_available() else "cpu"

# The bare name `pipeline` refers to modelscope's factory after the imports
# cell; the transformers factory is reached through its alias.
pipe = hf_pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=20,
    device=asr_device,
    batch_size=8,
)

In [13]:
# Acoustic noise suppression model (48 kHz, per its directory name). Built
# through the modelscope alias so it keeps working if `pipeline` is re-bound
# to the transformers factory.
ans = ms_pipeline(
    Tasks.acoustic_noise_suppression,
    model='/project/ai901504-ai0004/501641_Big/week4/speech_dfsmn_ans_psm_48k_causal')

In [None]:
!rm -rf /project/ai901504-ai0004/501641_Big/week4/denoise
!mkdir /project/ai901504-ai0004/501641_Big/week4/denoise
!mkdir /project/ai901504-ai0004/501641_Big/week4/denoise/train
!mkdir /project/ai901504-ai0004/501641_Big/week4/denoise/test
!mkdir /project/ai901504-ai0004/501641_Big/week4/denoise/resample_train
!mkdir /project/ai901504-ai0004/501641_Big/week4/denoise/resample_test

In [10]:
# Label tables for both splits.
train_df = pl.read_csv('/project/ai901504-ai0004/501641_Big/week4/Human_Labor_train.csv')
# The path previously started with a doubled slash ('//project').
test_df = pl.read_csv('/project/ai901504-ai0004/501641_Big/week4/test.csv')

# Split that the cells below process.
predict_data = 'test'
predict_df = test_df
# Expected number of audio files; only used as the progress-bar total.
num_files = 300

In [11]:
# Materialize the glob as a sorted list. A bare generator is exhausted after
# the first loop, so any later loop over `input_paths` would process nothing.
input_paths = sorted(Path(f'/project/ai901504-ai0004/501641_Big/week4/speechs/speechs/{predict_data}').glob('*.wav'))
# input_paths = sorted(Path(f'/project/ai901504-ai0004/501641_Big/week4/noise_augment').glob('*.wav'))
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/{predict_data}')

In [None]:
# Neural denoising of every input file.
for path in tqdm(input_paths, total=num_files):
  waveform, sample_rate = torchaudio.load(str(path))
  # The enhancement model works on 16 kHz audio (per its model card), and
  # enhance_batch does not resample, so convert files recorded at another rate.
  if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    sample_rate = 16000
  waveform = deep_noise_reduce(waveform)
  waveform = waveform.cpu()
  torchaudio.save(str(output_path / path.name), waveform, sample_rate)

In [None]:
# Classical signal-processing denoising of every input file.
for path in tqdm(input_paths, total=num_files):
  waveform, sample_rate = torchaudio.load(str(path))
  # noise_reduce() analyses the signal assuming 16 kHz, so bring every file to
  # that rate and keep `sample_rate` in step with the data.
  if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    sample_rate = 16000
  waveform = noise_reduce(waveform)
  # Add the channel dimension; cast to float32 because the scipy/noisereduce
  # stages may hand back float64.
  waveform = waveform.unsqueeze(0).float()
  torchaudio.save(str(output_path / path.name), waveform, sample_rate)

In [14]:
input_paths = sorted(Path(f'/project/ai901504-ai0004/501641_Big/week4/speechs/speechs/{predict_data}').glob('*.wav'))
resample_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/resample_{predict_data}')
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/{predict_data}')
resample_path.mkdir(parents=True, exist_ok=True)
output_path.mkdir(parents=True, exist_ok=True)

# Step 1: Resample to 48kHz (the rate the suppression model is named for)
for path in tqdm(input_paths, total=len(input_paths)):
    waveform, sample_rate = torchaudio.load(str(path))
    waveform = torchaudio.functional.resample(waveform, sample_rate, 48000)
    torchaudio.save(str(resample_path / path.name), waveform, 48000)

# Step 2: Noise reduction, overwriting each 48 kHz file in place.
# The return value is discarded so the ASR variable `result` is not clobbered.
resample_paths = sorted(resample_path.glob('*.wav'))
for path in tqdm(resample_paths, total=len(resample_paths)):
    _ = ans(str(path), output_path=str(path))

# Step 3: Resample back to 16kHz
for path in tqdm(resample_paths, total=len(resample_paths)):
    waveform, sample_rate = torchaudio.load(str(path))
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    torchaudio.save(str(output_path / path.name), waveform, 16000)

In [None]:
# Gather the denoised files: stems are the ids, string paths feed the ASR pipeline.
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/{predict_data}')
# output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/noise_augment')

output_paths = list(output_path.glob('*.wav'))

file_names = list(map(lambda p: p.stem, output_paths))
out_paths = list(map(str, output_paths))

In [18]:
# Compare one random clip before and after denoising.
# folder_base = Path(f'/project/ai901504-ai0004/501641_Big/week4/speechs/speechs/{predict_data}')
folder_base = Path(f'/project/ai901504-ai0004/501641_Big/week4/noise_augment')
# folder_denoise = Path("/project/ai901504-ai0004/501641_Big/week4/noise_augment")
folder_denoise = Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/{predict_data}')
# The file name is drawn from folder_base only.
# NOTE(review): nothing checks that the same name exists in folder_denoise.
wav_files = [f for f in os.listdir(folder_base) if f.endswith(".wav")]
random_file = random.choice(wav_files)

print(random_file)

# `sample_rate` ends up holding the rate of the denoised file (second load).
waveform_base, sample_rate = torchaudio.load(os.path.join(folder_base, random_file))
waveform_denoise, sample_rate = torchaudio.load(os.path.join(folder_denoise, random_file))

# waveform_base, sample_rate = torchaudio.load('/project/ai901504-ai0004/501641_Big/week4/noise_augment/33466244-2697-4de2-81b1-196a56ffd6bb.wav')
# waveform_denoise, sample_rate = torchaudio.load(f'/project/ai901504-ai0004/501641_Big/week4/denoise/train/33466244-2697-4de2-81b1-196a56ffd6bb.wav')

visualize_sound_3in1(waveform_base, waveform_denoise)

In [None]:
def noise_augment(waveform, sample_rate=16000, noise_factor=0.005):
    """Add scaled Gaussian noise to a clip and clamp the result to [-1, 1].

    Args:
        waveform: Tensor shaped [N] or [channels, N]; for 2-D input only the
            first row is used.
        sample_rate: Accepted for interface compatibility; not used.
        noise_factor: Multiplier applied to the unit-variance noise.

    Returns:
        Tensor shaped [1, N].
    """
    mono = waveform[0] if waveform.ndim == 2 else waveform  # [1, N] -> [N]
    noisy = mono + noise_factor * torch.randn_like(mono)
    return noisy.clamp(-1.0, 1.0).unsqueeze(0)  # [1, N]


# Materialize the file list up front: the loop rewrites files inside the very
# directory it iterates over.
input_path = sorted(Path(f'/project/ai901504-ai0004/501641_Big/week4/denoise/{predict_data}').glob('*.wav'))

# NOTE(review): every file is overwritten in place with its noisy version, so
# re-running this cell adds noise again.
for path in tqdm(input_path, total=len(input_path)):
  waveform, sample_rate = torchaudio.load(str(path))
  waveform = noise_augment(waveform, sample_rate=sample_rate)
  # Save with the rate the file was loaded at. A hard-coded 16000 would
  # mislabel any file stored at a different rate.
  torchaudio.save(str(path), waveform, sample_rate=sample_rate)

In [None]:
# Transcribe every collected file in one call to the ASR pipeline.
result = pipe(out_paths)

In [None]:
# Drop the reference to the ASR pipeline so its memory can be reclaimed.
# NOTE(review): later cells (result0 / result1 / result2) call pipe(...) again;
# re-create the pipeline before running them.
del pipe

In [None]:
import gc

# Collect unreachable Python objects first so the tensors they hold are
# actually released, then return the freed blocks from PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show the first ten transcription results.
result[:10]

In [None]:
# Pair each transcript with the file stem it was produced from. The pairing is
# positional, so refuse to continue if the two lists went out of sync.
if len(result) != len(file_names):
  raise ValueError(
      f"Got {len(result)} transcripts for {len(file_names)} files; "
      "re-run the collection and transcription cells together."
  )

result_dict = {name: item['text'] for name, item in zip(file_names, result)}

# NOTE(review): depending on the polars version, ids missing from `result_dict`
# may keep their original id value in `denoised_text` — confirm all ids are covered.
predict_df = predict_df.with_columns(
    pl.col("id").replace(result_dict).alias("denoised_text")
)

In [None]:
# Preview the table with the new `denoised_text` column.
predict_df.head()

## Eval

In [None]:
references = predict_df['Gemini_Transcript ไว้แก้'].to_list()
hypothesis = predict_df['denoised_text'].to_list()
# jiwer.wer() exists in every jiwer release. compute_measures() was deprecated
# in jiwer 3 and is no longer available in jiwer 4, which an unpinned install
# pulls in.
# NOTE(review): WER splits on whitespace; for unsegmented Thai text confirm
# that the transcripts are word-tokenized, or consider jiwer.cer() instead.
wer_score = jiwer.wer(references, hypothesis)
wer_score

In [None]:
##Thonburian
#DeepSpeech
#0.95

#Signal Processing
#0.91
#0.944640753828033

#Normal
#1.07

#Chinoise
#0.9358068315665489

#############################################
##Pathumma
#DeepSpeech
#2.5

#Signal Processing
#3.6

#Normal
#0.95

#Chinese
#0.9567137809187279
#0.9584805653710248

#Chinoise
#

#################################################
##Biodatlab
#Normal
#1.0032391048292109

################################################
##Monsoon
#Normal
#1.0

#Signal Processing
#1.0

#Deepspeech
#1.0
################################################
##whisper-th-large-combined
#Deepspeech
#0.9829210836277974

#Signal Processing
#0.9234393404004712

#Normal
#0.9511189634864546

#Chinese
#0.9390459363957597

## Test inference

In [None]:
!rm -rf /ai901504-ai0004/501641_Big/week4/inference
!mkdir /ai901504-ai0004/501641_Big/week4/inference
!mkdir /ai901504-ai0004/501641_Big/week4/inference/Normal

In [None]:
!rm -rf /ai901504-ai0004/501641_Big/week4/inference/Deepspeech
!mkdir /ai901504-ai0004/501641_Big/week4/inference/Deepspeech

In [None]:
input_dir = Path("/project/ai901504-ai0004/501641_Big/week4/inference/Normal")
output_dir = Path("/project/ai901504-ai0004/501641_Big/week4/inference/Deepspeech")
output_dir.mkdir(parents=True, exist_ok=True)

wav_paths = sorted(input_dir.glob("*.wav"))
# The progress total follows the real number of files, not a fixed 5.
for path in tqdm(wav_paths, total=len(wav_paths)):
    try:
        waveform, sample_rate = torchaudio.load(str(path))
        # The enhancement model works on 16 kHz audio (per its model card).
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
            sample_rate = 16000
        waveform = deep_noise_reduce(waveform)
        waveform = waveform.cpu()
        torchaudio.save(str(output_dir / path.name), waveform, sample_rate)
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"❌ Error processing {path.name}: {e}")

In [None]:
!rm -rf /ai901504-ai0004/501641_Big/week4/inference/Signalprocessing
!mkdir /ai901504-ai0004/501641_Big/week4/inference/Signalprocessing

In [None]:
input_dir = Path("/project/ai901504-ai0004/501641_Big/week4/inference/Normal")
output_dir = Path("/project/ai901504-ai0004/501641_Big/week4/inference/Signalprocessing")
output_dir.mkdir(parents=True, exist_ok=True)

# Process audio files
wav_paths = sorted(input_dir.glob("*.wav"))
# The progress total follows the real number of files, not a fixed 5.
for path in tqdm(wav_paths, total=len(wav_paths)):
    try:
        waveform, sample_rate = torchaudio.load(str(path))
        # noise_reduce() analyses the signal assuming 16 kHz; keep the data
        # and `sample_rate` consistent with that.
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
            sample_rate = 16000
        waveform = noise_reduce(waveform)
        # Ensure shape (1, N) and float32 for saving
        waveform = waveform.unsqueeze(0).float()
        torchaudio.save(str(output_dir / path.name), waveform, sample_rate)
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"❌ Error processing {path.name}: {e}")

In [None]:
input_paths = sorted(Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/Normal').glob('*.wav'))
resample_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/resample')
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/denoise')
# No other cell creates these two folders.
resample_path.mkdir(parents=True, exist_ok=True)
output_path.mkdir(parents=True, exist_ok=True)

# Step 1: Resample to 48kHz (the rate the suppression model is named for)
for path in tqdm(input_paths, total=len(input_paths)):
    waveform, sample_rate = torchaudio.load(str(path))
    waveform = torchaudio.functional.resample(waveform, sample_rate, 48000)
    torchaudio.save(str(resample_path / path.name), waveform, 48000)

# Step 2: Noise reduction, overwriting each 48 kHz file in place.
# The return value is discarded so the ASR variable `result` is not clobbered.
resample_paths = sorted(resample_path.glob('*.wav'))
for path in tqdm(resample_paths, total=len(resample_paths)):
    _ = ans(str(path), output_path=str(path))

# Step 3: Resample back to 16kHz
for path in tqdm(resample_paths, total=len(resample_paths)):
    waveform, sample_rate = torchaudio.load(str(path))
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    torchaudio.save(str(output_path / path.name), waveform, 16000)

In [None]:
# Compare one random inference clip before and after noise suppression.
folder_base = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/Normal')

folder_noise = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/denoise')

# The file name is drawn from folder_base only.
# NOTE(review): nothing checks that the same name exists in folder_noise.
wav_files = [f for f in os.listdir(folder_base) if f.endswith(".wav")]
random_file = random.choice(wav_files)

print(random_file)

# `sample_rate` ends up holding the rate of the second file loaded.
waveform_base, sample_rate = torchaudio.load(os.path.join(folder_base, random_file))
waveform_noise, sample_rate = torchaudio.load(os.path.join(folder_noise, random_file))

visualize_sound_3in1(waveform_base, waveform_noise)

In [None]:
# Transcribe the untouched inference clips.
# NOTE(review): `pipe` is removed by the earlier `del pipe` cell; re-create the
# ASR pipeline before running this on a fresh kernel.
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/Normal')
output_paths = list(output_path.glob('*.wav'))

file_names = list(map(lambda p: p.stem, output_paths))
out_paths = list(map(str, output_paths))

result0 = pipe(out_paths)

In [None]:
# Transcripts of the unprocessed clips.
result0

In [None]:
# Transcribe the clips cleaned by the signal-processing chain.
# NOTE(review): `pipe` is removed by the earlier `del pipe` cell; re-create the
# ASR pipeline before running this on a fresh kernel.
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/Signalprocessing')
output_paths = list(output_path.glob('*.wav'))

file_names = list(map(lambda p: p.stem, output_paths))
out_paths = list(map(str, output_paths))

result1 = pipe(out_paths)

In [None]:
# Transcripts of the signal-processing output.
result1

In [None]:
# Transcribe the clips cleaned by the neural denoiser.
# NOTE(review): `pipe` is removed by the earlier `del pipe` cell; re-create the
# ASR pipeline before running this on a fresh kernel.
output_path = Path(f'/project/ai901504-ai0004/501641_Big/week4/inference/Deepspeech')
output_paths = list(output_path.glob('*.wav'))

file_names = list(map(lambda p: p.stem, output_paths))
out_paths = list(map(str, output_paths))

result2 = pipe(out_paths)

In [None]:
# Transcripts of the neural-denoiser output.
result2

In [None]:
import librosa.display
import matplotlib.pyplot as plt

# Magnitude STFT of the unprocessed clip, in dB relative to its own peak.
S = librosa.stft(waveform_base[0].numpy())
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# Before
plt.figure(figsize=(12, 4))
librosa.display.specshow(S_db, sr=16000, x_axis='time', y_axis='hz')
plt.title('Original Spectrogram')
plt.colorbar()
plt.show()

In [None]:
import librosa.display
import matplotlib.pyplot as plt

# NOTE(review): `waveform_denoise` comes from the earlier comparison cell, while
# the inference comparison above loads `waveform_noise` — confirm which clip
# this plot is meant to show.
S = librosa.stft(waveform_denoise[0].numpy())
S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# After
plt.figure(figsize=(12, 4))
librosa.display.specshow(S_db, sr=16000, x_axis='time', y_axis='hz')
# This is the processed clip; the title previously said 'Original Spectrogram'.
plt.title('Denoised Spectrogram')
plt.colorbar()
plt.show()

In [None]:
# NOTE(review): `waveform_denoise` comes from the earlier comparison cell, while
# the inference comparison above loads `waveform_noise` — confirm which clip
# this plot is meant to show.
S = librosa.feature.melspectrogram(y=waveform_denoise[0].numpy(), sr=16000, n_fft=2048, hop_length=512, n_mels=128)
# melspectrogram returns power, so convert with power_to_db exactly once.
S_db = librosa.power_to_db(S, ref=np.max)

plt.figure(figsize=(12, 4))
# Rows are mel bands, so the y axis must be 'mel' rather than linear Hz.
librosa.display.specshow(S_db, sr=16000, hop_length=512, x_axis='time', y_axis='mel')
plt.title('Mel Spectrogram (Denoised)')
plt.colorbar(format='%+2.0f dB')
plt.show()