In [14]:
import pydub
import numpy as np
import os
import sounddevice as sd
from pesq import pesq
from copy import deepcopy
import librosa
import plotly.graph_objects as go 

In [2]:
audios=[]
def find_(name:str):
    result=[]
    for idx,ch in enumerate(name):
        if ch=="_":
            result.append(idx)
    return result
for dirpath, dirnames, filenames in os.walk("./audio1"):
    for file_name in filenames:
        if file_name==".DS_Store":
            continue
        audio_data={}
        audio_data["format"]=file_name[-3:]
        i_=find_(file_name)
        audio_data["type"]=file_name[:i_[0]]
        audio_data["bits"]=file_name[i_[-1]+1:-4]
        full_path=os.path.join(dirpath, file_name)
        dub_temp=pydub.AudioSegment.from_file(full_path, format=audio_data["format"]) 
        samples=dub_temp.get_array_of_samples()
        sample_rate=dub_temp.frame_rate
        audio_data["freq"]=sample_rate
        audio_data["data"]=samples
        audio_data["size"]=os.path.getsize(full_path)
        audios.append(audio_data)

In [3]:
for audio in audios:
    if audio["type"]=="原始音乐":
        music_origin=audio
    elif audio["type"]=="原始语音":
        speech_origin=audio

In [10]:
def compare_with_mel(ref:dict, deg:dict, n_mels=128, frame_len=0.025, hop_len=0.01, eps=1e-10):
    ref_data=np.array(ref["data"], dtype=np.float64)
    deg_data=np.array(deg["data"], dtype=np.float64)
    ref_data=ref_data/np.max(np.abs(ref_data))
    deg_data=deg_data/np.max(np.abs(deg_data)) # 归一化
    if ref["freq"]!=deg["freq"]:
        deg_data=librosa.resample(deg_data, orig_sr=deg["freq"], target_sr=ref["freq"]) # 把采样率设置为相同
    min_len=min(len(ref["data"]), len(deg["data"]))
    ref_data=ref_data[:min_len] # 保证长度相同
    deg_data=deg_data[:min_len]
    n_fft=int(frame_len*ref["freq"])
    hop=int(hop_len*ref["freq"])
    S_orig=librosa.feature.melspectrogram(y=deg_data, sr=ref["freq"], n_mels=n_mels, n_fft=n_fft, hop_length=hop)
    S_deg=librosa.feature.melspectrogram(y=deg_data, sr=deg["freq"], n_mels=n_mels, n_fft=n_fft, hop_length=hop)
    los_S_orig=np.log(S_orig+eps)
    log_S_deg=np.log(S_deg+eps)
    P_signal=np.mean(los_S_orig**2)
    D=los_S_orig-log_S_deg
    P_noise=np.mean(D**2)
    if P_noise==0:
        mel_snr=100
    else:
        mel_snr=10*np.log10(P_signal/P_noise)
    return mel_snr

In [11]:
for idx,audio in enumerate(audios):
    if audio["type"]=="音乐":
        score=compare_with_mel(ref=music_origin, deg=audio)
        audios[idx]["score"]=score 
    elif audio["type"]=="语音":
        score=compare_with_mel(ref=speech_origin, deg=audio)
        audios[idx]["score"]=score 
    else:
        audios[idx]["score"]=100

In [None]:
for idx,audio in enumerate(audios):
    print(f"idx:{idx}, Type:{audio["type"]}, format:{audio["format"]}, freq:{audio["freq"]}, score:{audio["score"]}, ")

idx:0, Type:语音, format:wav, freq:16000, score:18.314466718611957, 
idx:1, Type:音乐, format:mp3, freq:44100, score:18.524451023312423, 
idx:2, Type:音乐, format:wav, freq:48000, score:100, 
idx:3, Type:音乐, format:mp3, freq:44100, score:18.426748281117302, 
idx:4, Type:语音, format:wav, freq:44100, score:27.050768329974176, 
idx:5, Type:语音, format:mp3, freq:44100, score:27.03502176179284, 
idx:6, Type:语音, format:mp3, freq:16000, score:20.066914738866966, 
idx:7, Type:音乐, format:wav, freq:8000, score:7.467198641688583, 
idx:8, Type:音乐, format:wav, freq:32000, score:11.67471811955007, 
idx:9, Type:语音, format:wav, freq:16000, score:20.37355769216617, 
idx:10, Type:语音, format:wav, freq:22050, score:18.685708516542235, 
idx:11, Type:音乐, format:wav, freq:48000, score:100, 
idx:12, Type:原始音乐, format:wav, freq:48000, score:100, 
idx:13, Type:语音, format:wav, freq:8000, score:100, 
idx:14, Type:语音, format:mp3, freq:44100, score:26.964155764511744, 
idx:15, Type:语音, format:wav, freq:44100, score:28.0897

In [17]:
def analyse_size(audio_data:list):
    sizes=[]
    scores=[]
    for audio in audio_data:
        sizes.append(audio["size"])
        scores.append(audio["score"])
    fig=go.Figure()
    fig.add_trace(go.Scatter(x=sizes, y=scores, mode="markers"))
    fig.show()

In [18]:
analyse_size(audio_data=audios)

In [21]:
import torch, torchaudio

In [None]:
import torchaudio.prototype
import torchaudio.prototype.datasets



dataset_music=torchaudio.prototype.datasets.Musan(root="./musan", subset="music")

IndexError: list index out of range