In [1]:
import numpy as np
import scipy.signal as signal
import soundfile as sf
import librosa
import random
from scipy.signal import butter, lfilter
from audiomentations import BandPassFilter


def downsample_and_normalize(audio, target_rate, original_rate):
    """Resample ``audio`` to ``target_rate`` and scale it by its peak.

    Parameters
    ----------
    audio : np.ndarray
        Input samples.
    target_rate : int
        Sample rate to convert to.
    original_rate : int
        Sample rate of ``audio``.

    Returns
    -------
    np.ndarray
        The resampled signal divided by its largest absolute sample. An
        empty or all-zero signal is returned unscaled, because there is no
        peak to divide by.
    """
    if target_rate == original_rate:
        # Rates already match, so there is nothing to resample.
        resampled = np.asarray(audio)
    else:
        resampled = librosa.resample(audio, orig_sr=original_rate, target_sr=target_rate)

    # Guard the peak: np.max raises on empty input and a zero peak would
    # turn the whole signal into NaN.
    peak = np.max(np.abs(resampled)) if resampled.size else 0.0
    if peak == 0:
        return resampled
    return resampled / peak

def irs_filter(audio):
    """Rough stand-in for IRS filtering.

    Runs ``audio`` through a 4th-order Butterworth filter with a normalized
    cutoff of 0.2 (``signal.butter``'s default low-pass design) and returns
    the filtered samples.
    """
    numerator, denominator = signal.butter(N=4, Wn=0.2)
    filtered = signal.lfilter(numerator, denominator, audio)
    return filtered

def line_filter(audio):
    """Rough stand-in for a line filter.

    Runs ``audio`` through a 4th-order Butterworth high-pass filter with a
    normalized cutoff of 0.3 and returns the filtered samples.
    """
    numerator, denominator = signal.butter(N=4, Wn=0.3, btype='high')
    filtered = signal.lfilter(numerator, denominator, audio)
    return filtered

def pcm_encode_decode(audio):
    """Quantize ``audio`` to integer codes and map the codes back to floats.

    Samples are multiplied by 255, rounded to the nearest integer and divided
    by 255 again, so the output lies on a grid with step 1/255. Input in
    [-1, 1] therefore produces codes in [-255, 255].
    """
    scale = 255
    codes = np.rint(audio * scale).astype(int)
    return codes.astype(float) / scale

def add_echo(audio, delay_samples, attenuation):
    """Mix one delayed, attenuated copy of ``audio`` into the signal.

    Parameters
    ----------
    audio : np.ndarray
        Input samples; the delay is applied along the first axis.
    delay_samples : int
        Echo delay in samples. ``0`` adds the attenuated signal with no
        delay; a delay at least as long as the signal adds nothing.
    attenuation : float
        Gain applied to the delayed copy.

    Returns
    -------
    np.ndarray
        ``audio`` plus its echo, with the same length as the input.

    Raises
    ------
    ValueError
        If ``delay_samples`` is negative.
    """
    audio = np.asarray(audio)
    if delay_samples < 0:
        raise ValueError("delay_samples must be non-negative")

    # Floating-point buffer so integer input does not truncate the echo.
    echo = np.zeros(audio.shape, dtype=np.result_type(audio.dtype, np.float32))
    # Use an explicit stop index: ``audio[:-0]`` is an empty slice, which
    # made a zero delay fail with a shape mismatch.
    kept = max(len(audio) - delay_samples, 0)
    echo[delay_samples:] = audio[:kept] * attenuation
    return audio + echo

def add_noise(audio, noise_type='white'):
    """Add simulated noise to ``audio``.

    For ``noise_type == 'white'`` a Gaussian sequence (mean 0, standard
    deviation 0.1) of the same length is added. Any other ``noise_type``
    returns ``audio`` untouched.
    """
    if noise_type != 'white':
        return audio
    gaussian = np.random.normal(0, 0.1, len(audio))
    return audio + gaussian

def add_white_noise(audio, noise_level=0.001):
    """Return ``audio`` plus standard-normal noise scaled by ``noise_level``."""
    unit_noise = np.random.randn(len(audio))
    scaled_noise = noise_level * unit_noise
    return audio + scaled_noise

def add_impulse_noise(audio, probability=0.001, intensity=0.5):
    """Return a copy of ``audio`` with random samples replaced by impulses.

    Parameters
    ----------
    audio : np.ndarray
        Input samples; the caller's array is left unmodified.
    probability : float
        Per-sample chance of being replaced.
    intensity : float
        Magnitude of an impulse. A replaced sample becomes
        ``intensity * sign(sample)``, so samples that are exactly zero
        stay zero.

    Returns
    -------
    np.ndarray
        A new floating-point array containing the impulses.
    """
    source = np.asarray(audio)
    # Work on a copy: writing into ``audio`` directly altered the caller's
    # data. Promote integer input so the impulses are not truncated.
    noisy = np.array(source, dtype=np.result_type(source.dtype, np.float32))
    impulse_mask = np.random.uniform(size=len(noisy)) < probability
    noisy[impulse_mask] = intensity * np.sign(noisy[impulse_mask])
    return noisy


def augment_audio(path_input_audio, path_save_audio, target_rate=16000):
    """Apply band-pass, echo, normalization and light noise to one file.

    Parameters
    ----------
    path_input_audio : str
        Audio file to read.
    path_save_audio : str
        Where the augmented audio is written.
    target_rate : int, optional
        Sample rate used for processing and for the written file. It must
        be high enough for the band-pass centre frequencies (up to 6000 Hz).

    The function writes the result to ``path_save_audio`` and returns nothing.
    """
    audio_data, sample_rate = sf.read(path_input_audio)
    # Keep only the first channel of multi-channel files.
    if audio_data.ndim == 2:
        audio_data = audio_data[:, 0]

    # Resample up front so every later stage really runs at ``target_rate``.
    # The band-pass used to be told the audio was 16 kHz regardless of the
    # file's actual rate, which misplaces the band for other rates.
    if sample_rate != target_rate:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=target_rate)

    # Alternative setting tried earlier:
    # transform = BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=1.0)
    transform = BandPassFilter(min_center_freq=1000, max_center_freq=6000, p=1.0)
    audio_data = transform(audio_data, sample_rate=target_rate)

    # Echo delay in seconds, picked at random per file.
    delay_seconds = random.choice([0.03, 0.04, 0.05])
    attenuation = 0.6
    delay_samples = int(delay_seconds * target_rate)
    audio_data = add_echo(audio_data, delay_samples, attenuation)

    # The signal is already at ``target_rate``; this call now only normalizes.
    audio_data = downsample_and_normalize(audio_data, target_rate, target_rate)
    # Disabled alternatives:
    # audio_data = add_white_noise(audio_data)
    # audio_data = add_impulse_noise(audio_data, probability=0.001, intensity=random.choice([0.1,0.2,0.3]))

    # Gaussian noise whose amplitude is at most 0.5% of the signal maximum.
    noise_amp = 0.005 * np.random.uniform() * np.amax(audio_data)
    audio_data = audio_data + noise_amp * np.random.normal(size=audio_data.shape[0])

    sf.write(path_save_audio, audio_data, target_rate)

# Smoke-test the augmentation on the demo recording.
augment_audio(
    path_input_audio="/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/audio_demo.wav",
    path_save_audio="/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/audio_demo_real_1.wav",
)

LibsndfileError: Error opening '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/audio_demo.wav': System error.

# Proposed augmentation method

In [2]:
from audiomentations import AddShortNoises, PolarityInversion
import scipy.signal as signal
import soundfile as sf
import librosa
import random
from scipy.signal import butter, lfilter

# Overlay short noise bursts from a noise recording onto the augmented demo file.
# NOTE(review): with noise_rms="absolute" the SNR bounds below may not be the
# parameters that take effect -- confirm against the audiomentations docs.
transform = AddShortNoises(
    sounds_path="/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/03_tieng-on_noise_16k-2.wav",
    min_snr_in_db=50.0,
    max_snr_in_db=70.0,
    noise_rms="absolute",
    min_time_between_sounds=0.1,
    max_time_between_sounds=0.5,
    noise_transform=PolarityInversion(),
    p=1.0
)
path_input_audio = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/audio_demo_real_1.wav"
my_waveform_ndarray, sample_rate = sf.read(path_input_audio)
# Use the rate read from the file: the result is written back with that same
# rate, so a hard-coded 16000 here would be inconsistent for any other input.
augmented_sound = transform(my_waveform_ndarray, sample_rate=sample_rate)
output_path = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/augment_2.wav"
sf.write(output_path, augmented_sound, sample_rate)

LibsndfileError: Error opening '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/audio_demo_real_1.wav': System error.

In [10]:
import subprocess

def convert_m4a_to_wav(input_file, output_file, overwrite=False):
    """Convert an audio file to 16 kHz mono by calling ``ffmpeg``.

    Parameters
    ----------
    input_file : str
        Source audio path (any format ffmpeg can read).
    output_file : str
        Destination path; ffmpeg picks the container from its extension.
    overwrite : bool, optional
        When True an existing ``output_file`` is replaced (``-y``). When
        False (default) ffmpeg is told never to overwrite (``-n``) and is
        expected to fail if the file already exists.

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status.
    """
    command = [
        'ffmpeg',
        # Answer the "overwrite?" question up front so ffmpeg never waits
        # for input, which a notebook kernel cannot provide.
        '-y' if overwrite else '-n',
        '-i', input_file,
        '-ar', '16000',  # output sample rate
        '-ac', '1',      # mono
        output_file,
    ]

    subprocess.run(command, check=True)

# Use the function to convert the file (the input here is a FLAC file,
# despite the "m4a" in the function's name).
input_file = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/AddShortNoises_transformed.flac'
output_file = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/noise.wav'
convert_m4a_to_wav(input_file, output_file)

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

In [2]:
import pandas as pd
metadata_train = "/home/lmnguyen/Projects/cmc_stt/finetune-fast-conformer/notebooks/results_csv/total_data_rms_filter_train.csv"
# Shuffle with a fixed seed: the index assigned here is later used to name
# the augmented wav files, so an unseeded shuffle would map each index to a
# different source row on every run.
train_df = (
    pd.read_csv(metadata_train)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(train_df.shape)

(2467111, 3)


In [3]:
# Substrings looked for in the "file" path column, applied in this order.
search_augment = "augment"
search_bud500 = "BUD500"
search_trung = "trung"
search_bac = "bac"
search_nam = "nam"

# Each stage peels the matching rows off whatever is left from the previous
# stage, so the six resulting frames are disjoint and together cover train_df.
is_augment = train_df["file"].str.contains(search_augment)
train_augment = train_df[is_augment]
print(train_augment.shape)
train_filter_augment = train_df[~is_augment]

is_bud500 = train_filter_augment["file"].str.contains(search_bud500)
bud_500_data = train_filter_augment[is_bud500]
train_filter_bud500_augment = train_filter_augment[~is_bud500]
print(bud_500_data.shape)

is_trung = train_filter_bud500_augment["file"].str.contains(search_trung)
search_trung_filter = train_filter_bud500_augment[is_trung]
print(search_trung_filter.shape)
train_filter_bud500_augment_trung = train_filter_bud500_augment[~is_trung]

is_bac = train_filter_bud500_augment_trung["file"].str.contains(search_bac)
search_bac_filter = train_filter_bud500_augment_trung[is_bac]
train_filter_bud500_augment_trung_bac = train_filter_bud500_augment_trung[~is_bac]
print(search_bac_filter.shape)

is_nam = train_filter_bud500_augment_trung_bac["file"].str.contains(search_nam)
search_nam_filter = train_filter_bud500_augment_trung_bac[is_nam]
train_filter_bud500_augment_trung_bac_nam = train_filter_bud500_augment_trung_bac[~is_nam]
print(search_nam_filter.shape)
print(train_filter_bud500_augment_trung_bac_nam.shape)

# Sanity check: the parts should add up to the size of train_df.
print(sum(
    len(part)
    for part in (
        train_augment,
        bud_500_data,
        search_trung_filter,
        search_bac_filter,
        search_nam_filter,
        train_filter_bud500_augment_trung_bac_nam,
    )
))

(246454, 3)
(558502, 3)
(168536, 3)
(167519, 3)
(205047, 3)
(1121053, 3)
2467111


In [4]:
# Draw a seeded sample from every group: 20% of each of the three "bac" /
# "trung" / "nam" groups, 5% of the BUD500 rows and 2% of the remainder.
df_bac, df_trung, df_nam = (
    group.sample(frac=0.2, random_state=1)
    for group in (search_bac_filter, search_trung_filter, search_nam_filter)
)
df_bud500 = bud_500_data.sample(frac=0.05, random_state=1)
random_data = train_filter_bud500_augment_trung_bac_nam.sample(frac=0.02, random_state=1)
print(sum(map(len, (df_bac, df_trung, df_nam, df_bud500, random_data))))

158566


In [5]:
# Stack the five samples and shuffle them with a fixed seed. The index is not
# reset here, so each row keeps the label it had in the frame it was sampled from.
df_augment = pd.concat([df_bac,df_trung,df_nam,df_bud500,random_data]).sample(frac=1, random_state=1)
# Display the (rows, columns) of the combined frame.
df_augment.shape

(158566, 3)

In [55]:
# Spot-check: run the augmentation on a single file and write the result
# to a scratch location for listening.
path_test = "/home/ntdong/Data/STT_dataset/Tiktok_20_12_2023/Tiktok/giong_nam/2023/12/20/audio/7016160944707390747/34_1ch.wav"
path_save = "/home/lmnguyen/Projects/cmc_stt/finetune-fast-conformer/check_augment/aaaaa.wav"
augment_audio(path_input_audio=path_test,path_save_audio=path_save)



In [None]:
import csv

import tqdm

dir_save_audio = "/home/ntdong/Data/STT_dataset/61.Augment2same_telesales/wavs"
path_save_csv = "/home/ntdong/Data/STT_dataset/61.Augment2same_telesales/metadata.csv"

# Augment every selected file and record it in a metadata CSV.
# csv.writer quotes fields for us: rows used to be assembled with an f-string,
# so any comma inside the transcript shifted the columns.
with open(path_save_csv, 'w', encoding='utf8', newline='') as fp:
    writer = csv.writer(fp, lineterminator='\n')
    writer.writerow(['file', 'text', 'duration'])
    # itertuples yields (index, col0, col1, col2); this avoids the deprecated
    # positional ``row[0]`` lookup on a label-indexed Series.
    rows = df_augment.itertuples(index=True, name=None)
    for index, source_wav, text, duration in tqdm.tqdm(rows, total=len(df_augment)):
        path_save_wav = f"{dir_save_audio}/{index}.wav"
        augment_audio(source_wav, path_save_wav)
        writer.writerow([path_save_wav, text, duration])

# Proposed data augmentation method to reproduce the failure cases of the recurrent model

## Separate the augmented data from the clean (original) data

In [1]:
# import pandas as pd
# metadata_train = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/results_csv/train_2023_2024.csv"
# train_df = pd.read_csv(metadata_train)
# train_df = train_df.sample(frac=1).reset_index(drop=True)
# print(train_df.shape)

(2742568, 3)


In [78]:
# non_augment_df = train_df[~train_df['file'].str.contains('augment')] #data origin
# non_augment_df = non_augment_df[~non_augment_df['file'].str.contains('_bg_noise')] #data origin
# non_augment_df = non_augment_df[~non_augment_df['file'].str.contains('short_noise')] #data origin
# non_augment_df = non_augment_df[~non_augment_df['file'].str.contains('denoise_dns48')]
# non_augment_df = non_augment_df[~non_augment_df['file'].str.contains('compresison_aac')]#new_bg
# non_augment_df = non_augment_df[~non_augment_df['file'].str.contains('new_bg')]
# # non_augment_df = non_augment_df.sample(frac=1).reset_index(drop=True)
# # denoise_dns48
# #60.BUD500_15_05_2024

# augment_df = train_df[~train_df.index.isin(non_augment_df.index)]
# augment_df = augment_df[~augment_df['file'].str.contains('60.BUD500_15_05_2024')]
# print(len(augment_df),len(non_augment_df))
# columns_to_save = ["file", "text", "duration"]
# non_augment_df.to_csv('/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/results_csv/2023_2024_origin_data_clean.csv', columns=columns_to_save, index=False, encoding='utf-8')
# augment_df.to_csv('/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/results_csv/2023_2024_augment_data_clean.csv', columns=columns_to_save, index=False, encoding='utf-8')

In [76]:
# Scratch arithmetic on two row counts. 1549074 equals the origin-data row
# count reported elsewhere in this notebook; what 923545 counts is not
# shown -- TODO confirm.
1549074 + 923545

2472619

In [64]:
# Combine a sample of the augmented data with the original data first (the note says 10%, but the code below uses 0.4, i.e. 40%)
# augment_sample_size = int(len(augment_df) * 0.4)
# augment_sample_df = augment_df.sample(n=augment_sample_size, random_state=42)
# combined_train_df = pd.concat([non_augment_df, augment_sample_df])
# print(len(combined_train_df))

1765558


## Functions to increase audio speed

In [3]:
import pandas as pd
metadata_origindata = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/results_csv/2023_2024_origin_data_clean.csv"
metadata_augment = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/notebooks/results_csv/2023_2024_augment_data_clean.csv"

# Shuffle both frames with a fixed seed: the index assigned here ends up in
# the names of the generated wav files, so an unseeded shuffle would make
# those names refer to different source rows on every run.
original_df = (
    pd.read_csv(metadata_origindata)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
augment_df = (
    pd.read_csv(metadata_augment)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(original_df.shape)
print(augment_df.shape)
# Total number of rows across both files.
original_df.shape[0] + augment_df.shape[0]

(1549074, 3)
(1193494, 3)


2742568

In [4]:
from pydub import AudioSegment
import os
import soundfile as sf
import random
import librosa

def change_speed(audio_path, output_path, speed=None):
    """Speed up an audio file, which also raises its pitch, and save it as WAV.

    :param audio_path: Path to the source audio file.
    :param output_path: Path where the sped-up WAV file is written.
    :param speed: Speed-up factor (1.3 means 30% faster). When omitted, one
        of 1.3, 1.4 or 1.5 is picked at random, which is what every call
        used to do because the argument was overwritten.
    :return: None; the result is written to ``output_path``.
    """
    if speed is None:
        speed = random.choice([1.3, 1.4, 1.5])
    audio = AudioSegment.from_file(audio_path)
    # Re-label the same raw samples with a higher frame rate (faster and
    # higher-pitched playback), then convert back to the source frame rate.
    new_audio = audio._spawn(audio.raw_data, overrides={
        "frame_rate": int(audio.frame_rate * speed)
    }).set_frame_rate(audio.frame_rate)
    new_audio.export(output_path, format="wav")

# input_path = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/chunk_5_normalized.wav'
# output_path = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/cut_audio_to_check/faster_5.wav'

# faster_audio = change_speed(input_path)
# faster_audio.export(output_path, format="wav")


In [6]:
def change_speed_without_pitch(audio_path, output_path, speed=None):
    """Speed up an audio file with time stretching and save the result.

    :param audio_path: Path to the source audio file.
    :param output_path: Path where the new audio file is written.
    :param speed: Speed-up factor (1.3 means 30% faster). When omitted, one
        of 1.3, 1.4 or 1.5 is picked at random, which is what every call
        used to do because the argument was overwritten.
    """
    if speed is None:
        speed = random.choice([1.3, 1.4, 1.5])
    # Load at the file's native sample rate.
    y, sr = librosa.load(audio_path, sr=None)

    # Time-stretch by the chosen factor.
    y_fast = librosa.effects.time_stretch(y, rate=speed)

    # Save the new audio at the same sample rate.
    sf.write(output_path, y_fast, sr)

# input_path = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/chunk_5_normalized.wav'
# output_path = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/cut_audio_to_check/faster_5_1.wav'
# change_speed_without_pitch(input_path, output_path, speed=1.3)
# print(f"Audio đã được tăng tốc và lưu tại {output_path}")

In [7]:
# input_audio_path = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/0_1ch.wav"  # Replace with your actual path
# output_audio_path = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/cut_audio_to_check/file_check.wav"  # Replace with your desired path

def increase_lound(input_path, output_path):
    """Scale a file's samples by a randomly chosen gain and save the result.

    The gain is drawn from 0.05-0.25, so despite the function's name the
    output is quieter than the input. The sample rate is kept.
    """
    samples, rate = sf.read(input_path)
    gain = random.choice([0.05, 0.1, 0.15, 0.2, 0.25])
    sf.write(output_path, samples * gain, rate)

In [8]:
def change_speed_increase_lound(audio_path, output_path):
    """Time-stretch a file by a random factor, scale it by a random gain, save it.

    The stretch rate is drawn from 1.3-1.5 and the gain from 0.05-0.25 (so
    the output is quieter than the input). The file is loaded and written
    at its native sample rate.
    """
    rate_factor = random.choice([1.3, 1.4, 1.5])
    gain = random.choice([0.05, 0.1, 0.15, 0.2, 0.25])
    samples, native_sr = librosa.load(audio_path, sr=None)
    stretched = librosa.effects.time_stretch(samples, rate=rate_factor)
    sf.write(output_path, stretched * gain, native_sr)

# input_audio_path = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/0_1ch.wav"  # Replace with your actual path
# output_audio_path = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/infer_N/cut_audio_to_check/file_check_222.wav"  # Replace with your desired path

# change_speed_increase_lound(input_audio_path,output_audio_path)


## Data augmentation processing

In [9]:
sample_size = int(len(original_df) * 0.10)

# Every subset is drawn from the same, unmodified ``original_df`` with its own
# seed. The earlier version reshuffled and re-indexed ``original_df`` in place
# (unseeded) between draws, so re-running the cell changed which rows each
# seeded sample picked and what their index labels meant.
augment_increase_speed = original_df.sample(n=sample_size, random_state=42)
# NOTE(review): this shares random_state=42 with augment_increase_speed, so the
# 5% draw appears to be a subset of that 10% draw -- confirm the overlap is intended.
change_speed_pitch = original_df.sample(n=int(len(original_df) * 0.05), random_state=42)
augment_reduce_lound = original_df.sample(n=sample_size, random_state=15)
augment_increase_speed_reduce_lound = original_df.sample(n=sample_size, random_state=30)

# Print one example path from each 10% subset as a sanity check.
for subset in (augment_increase_speed, augment_reduce_lound, augment_increase_speed_reduce_lound):
    for example_path in subset["file"].head(1):
        print(example_path)

/mnt/driver/STT_data/STT_dataset/13.VLSP-ASR-2021/labeled_trainingset/wav/269-00011981-00012321.wav
/mnt/driver/STT_data/STT_dataset/10.telesale/audio_telesale/tmduy/1578277341.201708_16k.wav/split_1/split_1.wav
/mnt/driver/STT_data/STT_dataset/2.tts_big_vinno/wav/06834_new.wav


In [10]:
# Report the (rows, columns) of each augmentation subset, one per line.
print(
    augment_increase_speed.shape,
    augment_reduce_lound.shape,
    augment_increase_speed_reduce_lound.shape,
    sep="\n",
)

(154907, 3)
(154907, 3)
(154907, 3)


In [14]:
import csv

import tqdm

out_put_wav = "/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/63.Augment_speed_lound/wavs"
out_put_csv = '/home/pdnguyen/fast_confomer_finetun/finetune-fast-conformer/63.Augment_speed_lound/augment_increase_speed_reduce_lound.csv'


def _augment_rows(rows_df, suffix, augment_fn, writer):
    """Run ``augment_fn`` on every row of ``rows_df`` and log each output.

    :param rows_df: Frame with "file" and "text" columns.
    :param suffix: Tag appended to the output name, ``{index}_{suffix}.wav``.
    :param augment_fn: Callable taking ``(input_path, output_path)`` that
        writes the augmented audio.
    :param writer: ``csv.writer`` receiving one ``file, text, duration`` row
        per generated file.
    """
    for index, row in tqdm.tqdm(rows_df.iterrows(), total=len(rows_df)):
        output_file = f'{out_put_wav}/{index}_{suffix}.wav'
        augment_fn(row["file"], output_file)
        # Measure the duration of the file just written: the speed-changing
        # augmentations shorten the audio, so the source row's duration
        # would be stale.
        # NOTE(review): assumes the metadata "duration" column is in seconds,
        # which is what sf.info reports -- confirm.
        duration = sf.info(output_file).duration
        writer.writerow([output_file, row["text"], duration])


# csv.writer quotes fields as needed; rows used to be assembled with an
# f-string, so a comma inside the transcript shifted the columns.
with open(out_put_csv, 'w', encoding='utf8', newline='') as fp:
    writer = csv.writer(fp, lineterminator='\n')
    writer.writerow(['file', 'text', 'duration'])
    # Other variants, previously kept as commented-out copies of the loop:
    # _augment_rows(augment_increase_speed, 'augment_increase_speed', change_speed_without_pitch, writer)
    # _augment_rows(change_speed_pitch, 'change_speed_pitch', change_speed, writer)
    # _augment_rows(augment_reduce_lound, 'augment_reduce_lound', increase_lound, writer)
    _augment_rows(
        augment_increase_speed_reduce_lound,
        'augment_increase_speed_reduce_lound',
        change_speed_increase_lound,
        writer,
    )

16980it [2:02:25,  2.23it/s]