In [None]:
import numpy as np
import librosa

def spectral_subtraction(noisy_audio, sr, noise_sample, alpha=.1):
  """Denoise a signal with power spectral subtraction.

  The average noise power per frequency bin is estimated from `noise_sample`
  and subtracted (scaled by `alpha`) from the power spectrogram of
  `noisy_audio`. The phase of the noisy signal is reused for resynthesis.

  Args:
    noisy_audio: 1-D numpy array holding the noisy waveform.
    sr: Sample rate of the audio. Unused; kept for interface compatibility.
    noise_sample: numpy array holding a noise-only waveform at the same
      sample rate as `noisy_audio`.
    alpha: Over-subtraction factor applied to the noise power estimate.

  Returns:
    numpy array with the denoised waveform, same length as `noisy_audio`.
  """
  n_fft = 2048
  hop_length = 512

  noisy_spectrogram = librosa.stft(noisy_audio, n_fft=n_fft, hop_length=hop_length)
  noisy_mag = np.abs(noisy_spectrogram)
  noisy_phase = np.angle(noisy_spectrogram)

  # Mean noise power per frequency bin; keepdims lets it broadcast over frames.
  noise_spectrogram = librosa.stft(noise_sample, n_fft=n_fft, hop_length=hop_length)
  noise_power_spectrum = np.mean(np.abs(noise_spectrogram) ** 2, axis=-1, keepdims=True)

  # Subtract power from power (not from magnitude) and floor at zero so the
  # square root below stays real.
  clean_power = np.maximum(noisy_mag ** 2 - alpha * noise_power_spectrum, 0)
  clean_mag = np.sqrt(clean_power)

  # Rebuild a complex spectrogram: the phase must enter as exp(j*phase);
  # multiplying by the raw angle would scale the magnitudes by radians.
  clean_spectrogram = clean_mag * np.exp(1j * noisy_phase)

  # Invert with the analysis hop and trim/pad to the input length.
  denoised_audio = librosa.istft(
      clean_spectrogram, hop_length=hop_length, length=noisy_audio.shape[-1])

  return denoised_audio

# Load the noisy recording and a separate noise recording. Neither call passes
# sr=..., so both are resampled to the same librosa default rate.
# NOTE(review): the noise estimate comes from a different file rather than a
# noise-only segment of the recording itself — confirm it is representative.
noisy_audio, sr = librosa.load("/content/gm1_02_033.wav")
noise_sample, _ = librosa.load("/content/cabin-pressure_76bpm_F_minor.wav")


# Denoise using the function's default alpha.
denoised_audio = spectral_subtraction(noisy_audio, sr, noise_sample)


from soundfile import write

# Persist the result (relative path) so later cells can reload it from disk.
write("denoised_audio.wav", denoised_audio, sr)

In [None]:
import numpy as np
import librosa
from soundfile import write

def spectral_subtraction(noisy_audio, sr, noise_sample, alpha=1):
    """Denoise a signal with power spectral subtraction.

    The average noise power per frequency bin is estimated from
    `noise_sample` and subtracted (scaled by `alpha`) from the power
    spectrogram of `noisy_audio`. The noisy phase is reused for resynthesis.

    Args:
        noisy_audio: 1-D numpy array holding the noisy waveform.
        sr: Sample rate of the audio. Unused; kept for interface
            compatibility.
        noise_sample: numpy array holding a noise-only waveform. It may be
            shorter or longer than `noisy_audio`; only its time-averaged
            power spectrum is used.
        alpha: Over-subtraction factor applied to the noise power estimate.

    Returns:
        numpy array with the denoised waveform, same length as `noisy_audio`.
    """
    n_fft = 2048
    hop_length = 512

    noisy_spectrogram = librosa.stft(noisy_audio, n_fft=n_fft, hop_length=hop_length)
    noisy_mag = np.abs(noisy_spectrogram)
    noisy_phase = np.angle(noisy_spectrogram)

    # If the noise sample carries an extra leading (channel) axis, collapse it
    # so its spectrogram has the same rank as the noisy one.
    if noise_sample.ndim > noisy_audio.ndim:
        noise_sample = np.mean(noise_sample, axis=0)

    # No zero-padding of the noise sample: the estimate is averaged over time
    # and broadcast, so lengths need not match, and padded silence would
    # bias the estimate towards zero.
    noise_spectrogram = librosa.stft(noise_sample, n_fft=n_fft, hop_length=hop_length)
    noise_power_spectrum = np.mean(np.abs(noise_spectrogram) ** 2, axis=-1, keepdims=True)

    # Subtract power from power (not from magnitude) and floor at zero so the
    # square root below stays real.
    clean_power = np.maximum(noisy_mag ** 2 - alpha * noise_power_spectrum, 0)
    clean_mag = np.sqrt(clean_power)

    # The phase must enter as exp(j*phase); multiplying by the raw angle
    # would scale the magnitudes by radians instead of restoring the phase.
    clean_spectrogram = clean_mag * np.exp(1j * noisy_phase)

    # Invert with the analysis hop and trim/pad to the input length.
    denoised_audio = librosa.istft(
        clean_spectrogram, hop_length=hop_length, length=noisy_audio.shape[-1])

    return denoised_audio

def evaluate_noise_reduction(noisy_audio, denoised_audio, sr):
    """Estimate an SNR (dB) from the energy removed by denoising.

    The denoised signal's mean frame power is treated as the signal power and
    the drop in mean frame power from noisy to denoised as the noise power.

    Args:
        noisy_audio: Waveform before denoising.
        denoised_audio: Waveform after denoising.
        sr: Sample rate. Unused; kept for interface compatibility.

    Returns:
        SNR in dB, or NaN when the denoised signal does not carry less power
        than the noisy one (the estimate is undefined in that case).
    """
    import warnings

    noisy_rms = librosa.feature.rms(y=noisy_audio)[0]
    denoised_rms = librosa.feature.rms(y=denoised_audio)[0]

    signal_power = np.mean(denoised_rms ** 2)
    noise_power = np.mean(noisy_rms ** 2) - signal_power

    # A non-positive difference means no energy was removed. Clamping it to a
    # tiny epsilon would fabricate a very large SNR, so report it as
    # undefined instead.
    if noise_power <= 0:
        warnings.warn(
            "Denoised signal power is not lower than noisy signal power; "
            "SNR estimate is undefined.",
            RuntimeWarning,
        )
        return float("nan")

    snr = 10 * np.log10(signal_power / noise_power)

    return snr


# Reload the original recording and the denoised file from disk; the
# spectral_subtraction function defined in this cell is not called here.
# NOTE(review): the denoised file was written with a relative path earlier —
# this assumes the working directory is /content; confirm.
noisy_audio, sr_noisy = librosa.load("/content/gm1_02_033.wav")
denoised_audio, sr_denoised = librosa.load("/content/denoised_audio.wav")


# evaluate_noise_reduction returns a single SNR value despite the plural name.
evaluation_metrics = evaluate_noise_reduction(noisy_audio, denoised_audio, sr_noisy)
print("SNR:", evaluation_metrics)


SNR: 53.64247438063657


In [None]:
from transformers import AutoModelForCTC, Wav2Vec2Processor
import torch
import torchaudio
import string
def transcribe_with_wav2vec(file_path):
    """Transcribe one audio file with the ATC fine-tuned Wav2Vec 2.0 model.

    The model and processor are loaded on every call. The audio is resampled
    to 16 kHz if needed and downmixed to mono before greedy CTC decoding.

    Args:
        file_path: Path to an audio file readable by torchaudio.

    Returns:
        The decoded transcription string.
    """
    model_name = "Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim"
    model = AutoModelForCTC.from_pretrained(model_name)
    processor = Wav2Vec2Processor.from_pretrained(model_name)

    waveform, file_sampling_rate = torchaudio.load(file_path)
    if file_sampling_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=file_sampling_rate, new_freq=16000)(waveform)

    # torchaudio returns (channels, time). Averaging over channels yields a
    # 1-D mono signal; for single-channel input this equals squeeze(0), while
    # multi-channel input is no longer mistaken for a batch of utterances.
    waveform = waveform.mean(dim=0)

    input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most likely token per frame, collapsed by the processor.
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)

    return transcription[0]
def customcorrection(predicttext,text):
    """Return the fraction of reference words recovered by the prediction.

    Both texts are lower-cased and stripped of ASCII punctuation. Words are
    matched as a multiset (order-insensitive), so a repeated predicted word
    only counts as often as it occurs in the reference and the score stays
    within [0, 1].

    Args:
        predicttext: Predicted transcription.
        text: Reference transcription.

    Returns:
        Matched words divided by the number of reference words, or 0.0 when
        the reference contains no words.
    """
    from collections import Counter

    strip_punctuation = str.maketrans('', '', string.punctuation)
    reference_words = text.lower().translate(strip_punctuation).split()
    predicted_words = predicttext.lower().translate(strip_punctuation).split()

    # An empty reference would otherwise divide by zero.
    if not reference_words:
        return 0.0

    # Multiset intersection keeps min(count_in_prediction, count_in_reference).
    matched = Counter(predicted_words) & Counter(reference_words)
    n_correct = sum(matched.values())

    return n_correct / len(reference_words)
if __name__ == "__main__":
    audio_path = '/content/gm1_02_033.wav'
    # Reference transcription for the file above.
    reference_text = "sabena nine seven zero climb to flight level three four zero"

    transcription_wav2vec = transcribe_with_wav2vec(audio_path)
    print("Transcription (Wav2Vec 2.0):", transcription_wav2vec)
    # Reported once; the original printed the identical accuracy line twice.
    print("ACC (Wav2Vec 2.0):", customcorrection(transcription_wav2vec, reference_text))

Some weights of the model checkpoint at Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.

Transcription (Wav2Vec 2.0): sabena nine seven zero climb to flight level three four zero
ACC (Wav2Vec 2.0): 1.0
ACC (Wav2Vec 2.0): 1.0


In [None]:
from google.colab import drive
import os
from transformers import AutoModelForCTC, Wav2Vec2Processor
import torch
import torchaudio
import string

def transcribe_with_wav2vec(model, processor, file_path):
    """Transcribe one audio file with an already-loaded CTC model.

    The audio is resampled to 16 kHz if needed and downmixed to mono before
    greedy CTC decoding.

    Args:
        model: CTC model whose forward output exposes `.logits`.
        processor: Matching processor providing feature extraction and
            `batch_decode`.
        file_path: Path to an audio file readable by torchaudio.

    Returns:
        The decoded transcription string.
    """
    waveform, file_sampling_rate = torchaudio.load(file_path)
    if file_sampling_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=file_sampling_rate, new_freq=16000)(waveform)

    # torchaudio returns (channels, time). Averaging over channels yields a
    # 1-D mono signal; for single-channel input this equals squeeze(0), while
    # multi-channel input is no longer mistaken for a batch of utterances.
    waveform = waveform.mean(dim=0)

    input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most likely token per frame, collapsed by the processor.
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)

    return transcription[0]

def _word_edit_distance(reference_words, hypothesis_words):
    """Return the word-level Levenshtein distance between two word lists."""
    previous_row = list(range(len(hypothesis_words) + 1))
    for i, ref_word in enumerate(reference_words, start=1):
        current_row = [i]
        for j, hyp_word in enumerate(hypothesis_words, start=1):
            substitution = previous_row[j - 1] + (ref_word != hyp_word)
            deletion = previous_row[j] + 1
            insertion = current_row[j - 1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]


def calculate_metrics(predicted_text, actual_text):
    """Score a predicted transcription against its reference.

    Both texts are lower-cased, stripped of ASCII punctuation and split on
    whitespace before scoring.

    Args:
        predicted_text: Predicted transcription.
        actual_text: Reference transcription.

    Returns:
        Tuple `(accuracy, precision, recall, f1_score, wer, ser)`:
        - accuracy / recall: matched words over reference words.
        - precision: matched words over predicted words.
        - f1_score: harmonic mean of precision and recall.
        - wer: word-level edit distance (substitutions + deletions +
          insertions) over reference length; 0 for an empty reference.
        - ser: 1 if the word sequences differ, else 0.
        Ratios with an empty denominator are reported as 0.
    """
    from collections import Counter

    strip_punctuation = str.maketrans('', '', string.punctuation)
    predicted_words = predicted_text.lower().translate(strip_punctuation).split()
    actual_words = actual_text.lower().translate(strip_punctuation).split()

    # Multiset intersection: a repeated predicted word counts only as often as
    # it occurs in the reference, so false negatives can never go negative
    # and precision/recall stay within [0, 1].
    overlap = Counter(predicted_words) & Counter(actual_words)
    true_positives = sum(overlap.values())
    false_positives = len(predicted_words) - true_positives
    false_negatives = len(actual_words) - true_positives

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    accuracy = true_positives / len(actual_words) if len(actual_words) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # WER is order-sensitive: use the alignment-based edit distance rather than
    # a bag-of-words count, which double-counts substitutions and ignores
    # word order.
    wer = _word_edit_distance(actual_words, predicted_words) / len(actual_words) if len(actual_words) > 0 else 0
    # Compare token sequences so whitespace differences are not sentence errors.
    ser = int(predicted_words != actual_words)

    return accuracy, precision, recall, f1_score, wer, ser


# Mount Google Drive at /content/drive; the dataset directories used below
# live under /content/drive/MyDrive. Runs at import time, outside the
# __main__ guard.
drive.mount('/content/drive')

if __name__ == "__main__":
    # Audio files and reference transcriptions are stored in parallel
    # directory trees: <audio_dir>/x/y.wav  <->  <transcription_dir>/x/y.txt
    main_audio_dir = '/content/drive/MyDrive/wave_corrected'
    main_transcription_dir = '/content/drive/MyDrive/TXTdata'

    # Load the model once and reuse it for every file.
    model = AutoModelForCTC.from_pretrained("Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim")
    processor = Wav2Vec2Processor.from_pretrained("Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim")

    # Running sums of the per-file metrics; averaged after the walk.
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    total_wer = 0
    total_ser = 0
    num_files = 0

    for root, dirs, files in os.walk(main_audio_dir):
        # Sort in place so os.walk descends, and files are processed, in a
        # reproducible order.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith('.wav'):
                continue

            audio_path = os.path.join(root, filename)
            # Swap only the file extension; a plain str.replace would also
            # rewrite any '.wav' occurring elsewhere in the relative path.
            relative_stem = os.path.splitext(os.path.relpath(audio_path, main_audio_dir))[0]
            transcription_path = os.path.join(main_transcription_dir, relative_stem + '.txt')

            if not os.path.exists(transcription_path):
                print(f"Transcription file not found for {filename} at {transcription_path}. Skipping.")
                continue

            with open(transcription_path, 'r') as f:
                actual_transcription = f.read().strip()

            predicted_transcription = transcribe_with_wav2vec(model, processor, audio_path)
            accuracy, precision, recall, f1_score, wer, ser = calculate_metrics(predicted_transcription, actual_transcription)

            print(f"File: {filename}")
            print(f"Actual Transcription: {actual_transcription}")
            print(f"Predicted Transcription: {predicted_transcription}")
            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1_score}")
            print(f"WER: {wer}")
            print(f"SER: {ser}\n")

            total_accuracy += accuracy
            total_precision += precision
            total_recall += recall
            total_f1_score += f1_score
            total_wer += wer
            total_ser += ser
            num_files += 1

    # Macro averages: every scored file contributes equally.
    if num_files > 0:
        overall_accuracy = total_accuracy / num_files
        overall_precision = total_precision / num_files
        overall_recall = total_recall / num_files
        overall_f1_score = total_f1_score / num_files
        overall_wer = total_wer / num_files
        overall_ser = total_ser / num_files
        print(f"Overall Accuracy: {overall_accuracy}")
        print(f"Overall Precision: {overall_precision}")
        print(f"Overall Recall: {overall_recall}")
        print(f"Overall F1 Score: {overall_f1_score}")
        print(f"Overall WER: {overall_wer}")
        print(f"Overall SER: {overall_ser}")
    else:
        print("No .wav files found in the specified directory.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of the model checkpoint at Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Jzuluaga/wav2vec2-large-960h-lv60-self-en-atc-uwb-atcc-and-atcosim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
File: sm4_03_090.wav
Actual Transcription: tag alitalia two nine one identified
Predicted Transcription: tag alitalia two nine one identified
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
WER: 0.0
SER: 0

File: sm4_03_096.wav
Actual Transcription: tag nato ten identified
Predicted Transcription: tag nato ten identified
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
WER: 0.0
SER: 0

File: sm4_03_097.wav
Actual Transcription: and nato one zero proceed direct to norvenich november oscar romeo sierra
Predicted Transcription: and nato one zero proceed direct to norvenich november oscar romeo sierra
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
WER: 0.0
SER: 0

File: sm4_03_105.wav
Actual Transcription: tag gulf air five identified
Predicted Transcription: tag gulf air five identified
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
WER: 0.0
SER: 0

File: sm4_03_100.wav
Actual Transcription: tag 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (uses the google.colab import above).
drive.mount('/content/drive')

In [None]:
import numpy as np
import librosa
from scipy.signal import spectrogram
from scipy.linalg import norm

def compute_metrics(audio1_path, audio2_path):
    """Compare two audio files in the spectrogram domain.

    Both files are loaded at 16 kHz, converted to spectrograms with
    `scipy.signal.spectrogram`, and truncated to a common number of frames.
    The first file acts as the reference for the SNR and spectral
    convergence ratios.

    Args:
        audio1_path: Path to the reference audio file.
        audio2_path: Path to the audio file compared against the reference.

    Returns:
        Dict with keys 'MSE', 'SNR' (dB) and 'Spectral Convergence'.

    Raises:
        ValueError: If the two loaded sample rates differ.
    """
    target_sr = 16000
    reference_wave, reference_sr = librosa.load(audio1_path, sr=target_sr)
    other_wave, other_sr = librosa.load(audio2_path, sr=target_sr)

    if reference_sr != other_sr:
        raise ValueError("Sample rates do not match.")

    # Only the spectrogram matrices are needed; frequency/time axes are dropped.
    _, _, reference_spec = spectrogram(reference_wave, reference_sr)
    _, _, other_spec = spectrogram(other_wave, other_sr)

    # Align on the shorter recording by trimming trailing frames.
    n_frames = min(reference_spec.shape[1], other_spec.shape[1])
    reference_spec = reference_spec[:, :n_frames]
    other_spec = other_spec[:, :n_frames]

    difference = reference_spec - other_spec
    squared_error = difference ** 2

    return {
        'MSE': np.mean(squared_error),
        'SNR': 10 * np.log10(np.sum(reference_spec ** 2) / np.sum(squared_error)),
        'Spectral Convergence': norm(difference) / norm(reference_spec),
    }

# The denoised file is passed first, so compute_metrics uses it as the
# reference spectrogram and measures the noisy original against it.
metrics = compute_metrics( '/content/denoised_audio.wav','/content/gm1_02_033.wav')
print(metrics)


{'MSE': 3.1334737e-12, 'SNR': 5.813494920730591, 'Spectral Convergence': 0.5120652}


In [None]:
import numpy as np
import librosa

# Load both recordings at a common 16 kHz rate.
noisy_audio, sr_noisy = librosa.load("/content/gm1_02_033.wav", sr=16000)
denoised_audio, sr_denoised = librosa.load("/content/denoised_audio.wav", sr=16000)

# Guard against mismatched rates before any sample-wise arithmetic.
if sr_noisy != sr_denoised:
    raise ValueError("Sample rates of the noisy and denoised audio files do not match")


# Trim both signals to the shorter length so they can be subtracted.
min_length = min(len(noisy_audio), len(denoised_audio))
noisy_audio = noisy_audio[:min_length]
denoised_audio = denoised_audio[:min_length]


# Everything the denoiser removed is treated as noise.
# NOTE(review): sample-wise subtraction assumes the two files are
# time-aligned — confirm.
noise = noisy_audio - denoised_audio


# Mean power of the kept signal and of the removed residual.
signal_power = np.mean(denoised_audio ** 2)
noise_power = np.mean(noise ** 2)


# SNR in dB; undefined (division by zero) if the two signals are identical.
snr = 10 * np.log10(signal_power / noise_power)

print(f"SNR: {snr:.2f} dB")


SNR: -2.56 dB
