# Importing Required Modules

In [2]:
import os
import numpy as np
import librosa
import joblib
import matplotlib.pyplot as plt

# Extract MFCC

In [1]:
# Method to do padding 
def pad_audio(y, max_samples=1589449):
    """Right-pad a 1-D signal with zeros up to ``max_samples`` samples.

    Parameters
    ----------
    y : array-like
        Audio samples.
    max_samples : int, optional
        Target length in samples (default 1589449).

    Returns
    -------
    The zero-padded signal when ``y`` is shorter than ``max_samples``;
    otherwise ``y`` itself, unchanged. Longer signals are NOT truncated.
    """
    missing = max_samples - len(y)
    # Nothing to add when the signal already reaches (or exceeds) the target.
    if missing <= 0:
        return y
    # Zeros are appended at the end only; the start of the signal is untouched.
    return np.pad(y, (0, missing), 'constant')

# Method to extract MFCC 

def extract_mfcc(directory_path,audio_type,n_mfcc=13,output_path=None):
    """Extract flattened MFCC features from every matching .wav file and save them.

    Every file in ``directory_path`` whose name ends with ``_<audio_type>.wav``
    is loaded at its native sampling rate, zero-padded with ``pad_audio``,
    converted to MFCCs and flattened to a 1-D vector. The vectors are stacked
    into one array and written to disk with ``joblib.dump``.

    Parameters
    ----------
    directory_path : str
        Directory containing the .wav files.
    audio_type : str
        File-name suffix selecting the recordings, e.g. "nam" or "headset".
    n_mfcc : int, optional
        Number of MFCC coefficients per frame (default 13).
    output_path : str, optional
        Destination of the pickle. When omitted, the notebook's original
        location ``...data/processed/mfcc_<audio_type>.pkl`` is used.

    Returns
    -------
    None. The features are written to ``output_path``.

    Notes
    -----
    ``pad_audio`` only pads; a recording longer than its target length keeps
    its own length, so its feature vector would be longer than the others.
    """
    if output_path is None:
        # Original hard-coded destination, kept as the default for compatibility.
        output_path=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\processed\mfcc_"+str(audio_type)+".pkl"

    file_end="_"+str(audio_type)+".wav"  # _nam.wav / _headset.wav
    mfcc_features=[]  # one flattened MFCC vector per audio file

    # os.listdir returns names in arbitrary order. Sorting makes the row order
    # reproducible, so files sharing a prefix (001_nam.wav / 001_headset.wav)
    # land on the same row index in the matrices saved for each audio type.
    filenames = sorted(name for name in os.listdir(directory_path) if name.endswith(file_end))

    for filename in filenames:
        file_path = os.path.join(directory_path, filename)
        # sr=None keeps the file's native sampling rate instead of resampling.
        signal,sr=librosa.load(file_path, sr=None)
        signal=pad_audio(signal)
        # The (n_mfcc, frames) matrix is flattened into a single 1-D vector.
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc).flatten()
        mfcc_features.append(mfccs)
        print("Extracting Feature : ",filename,"Mfcc_size:",mfccs.shape)
    mfcc_features = np.array(mfcc_features)
    print("Total Files Processed : ",len(filenames))
    print("Total MFCC Extracted ",len(mfcc_features))

    # Create the destination folder if needed so the dump cannot fail on a
    # missing directory; dirname is empty when output_path is a bare file name.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(mfcc_features, output_path)
    print(f'Features saved to {output_path}')

def make_equal_length(mfcc_features):
    """Return the length of the longest entry in ``mfcc_features``.

    Despite the name, nothing is padded or modified: the function only
    reports the maximum ``len()`` found, or 0 when the collection is empty.
    """
    return max((len(entry) for entry in mfcc_features), default=0)

In [3]:
# Directory that is scanned for the raw .wav recordings.
directory_path=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\raw"
# Selects the files whose names end in "_nam.wav".
audio_type="nam"
# Extract the NAM MFCC features; extract_mfcc also writes them to mfcc_nam.pkl.
extract_mfcc(directory_path,audio_type)


Extracting Feature :  001_nam.wav Mfcc_size: (40365,)
Extracting Feature :  002_nam.wav Mfcc_size: (40365,)
Extracting Feature :  003_nam.wav Mfcc_size: (40365,)
Extracting Feature :  004_nam.wav Mfcc_size: (40365,)
Extracting Feature :  005_nam.wav Mfcc_size: (40365,)
Extracting Feature :  006_nam.wav Mfcc_size: (40365,)
Extracting Feature :  007_nam.wav Mfcc_size: (40365,)
Extracting Feature :  008_nam.wav Mfcc_size: (40365,)
Extracting Feature :  009_nam.wav Mfcc_size: (40365,)
Extracting Feature :  010_nam.wav Mfcc_size: (40365,)
Extracting Feature :  011_nam.wav Mfcc_size: (40365,)
Extracting Feature :  012_nam.wav Mfcc_size: (40365,)
Extracting Feature :  013_nam.wav Mfcc_size: (40365,)
Extracting Feature :  014_nam.wav Mfcc_size: (40365,)
Extracting Feature :  015_nam.wav Mfcc_size: (40365,)
Extracting Feature :  016_nam.wav Mfcc_size: (40365,)
Extracting Feature :  017_nam.wav Mfcc_size: (40365,)
Extracting Feature :  018_nam.wav Mfcc_size: (40365,)
Extracting Feature :  019_na

In [4]:
# Same raw-data directory as the NAM run.
directory_path=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\raw"
# Selects the files whose names end in "_headset.wav".
audio_type="headset"
# Extract the headset MFCC features; extract_mfcc also writes them to mfcc_headset.pkl.
extract_mfcc(directory_path,audio_type)

Extracting Feature :  001_headset.wav Mfcc_size: (40365,)
Extracting Feature :  002_headset.wav Mfcc_size: (40365,)
Extracting Feature :  003_headset.wav Mfcc_size: (40365,)
Extracting Feature :  004_headset.wav Mfcc_size: (40365,)
Extracting Feature :  005_headset.wav Mfcc_size: (40365,)
Extracting Feature :  006_headset.wav Mfcc_size: (40365,)
Extracting Feature :  007_headset.wav Mfcc_size: (40365,)
Extracting Feature :  008_headset.wav Mfcc_size: (40365,)
Extracting Feature :  009_headset.wav Mfcc_size: (40365,)
Extracting Feature :  010_headset.wav Mfcc_size: (40365,)
Extracting Feature :  011_headset.wav Mfcc_size: (40365,)
Extracting Feature :  012_headset.wav Mfcc_size: (40365,)
Extracting Feature :  013_headset.wav Mfcc_size: (40365,)
Extracting Feature :  014_headset.wav Mfcc_size: (40365,)
Extracting Feature :  015_headset.wav Mfcc_size: (40365,)
Extracting Feature :  016_headset.wav Mfcc_size: (40365,)
Extracting Feature :  017_headset.wav Mfcc_size: (40365,)
Extracting Fea

In [5]:
# Path of the pickle written by the NAM extraction run.
mfcc_nam=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\processed\mfcc_nam.pkl"
# Load the saved feature matrix; as the cell's last expression it is displayed.
joblib.load(mfcc_nam)

array([[-542.82416, -557.2578 , -571.2298 , ...,    0.     ,    0.     ,
           0.     ],
       [-594.4711 , -583.30566, -574.22455, ...,    0.     ,    0.     ,
           0.     ],
       [-588.13916, -582.9486 , -576.38477, ...,    0.     ,    0.     ,
           0.     ],
       ...,
       [-463.45377, -489.07196, -503.75198, ...,    0.     ,    0.     ,
           0.     ],
       [-530.47577, -531.6202 , -529.6559 , ...,    0.     ,    0.     ,
           0.     ],
       [-573.5546 , -557.78687, -563.3709 , ...,    0.     ,    0.     ,
           0.     ]], dtype=float32)

In [6]:
# NOTE(review): the variable is still called mfcc_nam, but it now holds the
# path of the HEADSET pickle and overwrites the NAM path set in the previous cell.
mfcc_nam=r"C:\Users\SID\Documents\Speech\voice_conversion_gan\data\processed\mfcc_headset.pkl"
# Load the headset feature matrix and display only its first row.
joblib.load(mfcc_nam)[0]

array([-647.44745, -651.4236 , -666.2001 , ...,    0.     ,    0.     ,
          0.     ], dtype=float32)