## Converting .m4a files to .wav files of Female Voices

In [1]:
import os
from pydub import AudioSegment
# Convert every .m4a recording of the female speakers into a .wav file
# written to a single flat output directory.
input_folder = '/kaggle/input/voice-data/VoxCeleb_gender/females'
output_folder = '/kaggle/working/converted_files'
# exist_ok=True makes the cell re-runnable without a separate existence check.
os.makedirs(output_folder, exist_ok=True)
formats_to_convert = ['.m4a']
for root, dirs, files in os.walk(input_folder):
    for filename in files:
        if not filename.endswith(tuple(formats_to_convert)):
            continue
        filepath = os.path.join(root, filename)
        # Split off only the trailing extension: str.replace() would also
        # rewrite any earlier occurrence of '.m4a' inside the file name.
        stem, file_extension = os.path.splitext(filename)
        # NOTE: files found in different sub-folders all land in output_folder,
        # so two inputs sharing a base name would overwrite each other.
        output_filename = os.path.join(output_folder, stem + '.wav')
        try:
            # pydub expects the container format without the leading dot.
            track = AudioSegment.from_file(filepath, file_extension.replace('.', ''))
            print('CONVERTING: ' + str(filepath))
            track.export(output_filename, format='wav')
        except Exception as e:
            # Best-effort batch job: report the failure and move on to the next file.
            print("ERROR CONVERTING " + str(filepath) + ": " + str(e))


CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/412.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/2099.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/398.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/1749.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/115.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/842.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/952.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/1543.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/1374.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/305.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/2285.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/354.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/1132.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/females/1769.m4a
CONVERTING: /kaggle/input/v

## Converting .m4a files to .wav files of Male Voices

In [2]:


# Convert every .m4a recording of the male speakers into a .wav file
# written to a single flat output directory.
input_folder = '/kaggle/input/voice-data/VoxCeleb_gender/males'

output_folder = '/kaggle/working/Males'

# exist_ok=True makes the cell re-runnable without a separate existence check.
os.makedirs(output_folder, exist_ok=True)

formats_to_convert = ['.m4a']

for root, dirs, files in os.walk(input_folder):
    for filename in files:
        if not filename.endswith(tuple(formats_to_convert)):
            continue
        filepath = os.path.join(root, filename)
        # Split off only the trailing extension: str.replace() would also
        # rewrite any earlier occurrence of '.m4a' inside the file name.
        stem, file_extension = os.path.splitext(filename)
        # NOTE: files found in different sub-folders all land in output_folder,
        # so two inputs sharing a base name would overwrite each other.
        output_filename = os.path.join(output_folder, stem + '.wav')
        try:
            # pydub expects the container format without the leading dot.
            track = AudioSegment.from_file(filepath, file_extension.replace('.', ''))
            print('CONVERTING: ' + str(filepath))
            track.export(output_filename, format='wav')
        except Exception as e:
            # Best-effort batch job: report the failure and move on to the next file.
            print("ERROR CONVERTING " + str(filepath) + ": " + str(e))


CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/412.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/2099.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/398.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/1749.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/115.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/842.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/952.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/1543.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/1374.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/3370.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/3103.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/305.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/2873.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/males/2285.m4a
CONVERTING: /kaggle/input/voice-data/VoxCeleb_gender/m

## Defining the function for extracting the features from .wav files created

In [45]:
def features_extractor(file):
    """Summarise an audio file as one fixed-length MFCC vector.

    Parameters
    ----------
    file : path to an audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The 40 MFCC coefficients, each averaged over the whole recording.
    """
    signal, rate = librosa.load(file)
    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
    # Transpose, then average along the first axis, so that exactly one
    # value per coefficient remains (layout per the librosa documentation).
    return mfcc_matrix.T.mean(axis=0)

In [46]:
import librosa 
import numpy as np
import pandas as pd

## Defining the list of files of Male Voices

In [48]:
# os.listdir() returns entries in arbitrary order and includes anything that
# happens to be in the folder; keep only the converted .wav files and sort
# them so the extracted feature list is built in a reproducible order.
male_files = sorted(
    name for name in os.listdir('/kaggle/working/Males') if name.endswith('.wav')
)


## Extracting the features from the Male Voices

In [51]:
# Pair the MFCC vector of every male recording with the class label 1.
male_extracted_features = [
    [features_extractor(os.path.join("/kaggle/working/Males/", wav_name)), 1]
    for wav_name in male_files
]

## Defining the list of files of female voices

In [53]:
# os.listdir() returns entries in arbitrary order and includes anything that
# happens to be in the folder; keep only the converted .wav files and sort
# them so the extracted feature list is built in a reproducible order.
female_files = sorted(
    name for name in os.listdir('/kaggle/working/converted_files') if name.endswith('.wav')
)

## Extracting the features from the Female Voices

In [59]:
# Pair the MFCC vector of every female recording with the class label 0.
female_extracted_features = [
    [features_extractor(os.path.join("/kaggle/working/converted_files/", wav_name)), 0]
    for wav_name in female_files
]

## Creating the dataframe for male voice extracted features

In [63]:
# One row per male recording: the MFCC vector and its class label.
male_extracted_features_df = pd.DataFrame(
    data=male_extracted_features,
    columns=['feature', 'class'],
)
male_extracted_features_df.head(n=5)

Unnamed: 0,feature,class
0,"[-177.96971, 118.529945, -51.271824, 60.918034...",1
1,"[-206.07872, 132.78241, -68.26469, 44.039234, ...",1
2,"[-366.4721, 95.902275, -22.518526, 60.652664, ...",1
3,"[-289.46902, 156.66214, -7.5638566, 48.567062,...",1
4,"[-263.6032, 155.9379, -17.009192, 45.20595, -1...",1


## Creating the dataframe for female voice extracted features

In [64]:
# One row per female recording: the MFCC vector and its class label.
female_extracted_features_df = pd.DataFrame(
    data=female_extracted_features,
    columns=['feature', 'class'],
)
female_extracted_features_df.head(n=5)

Unnamed: 0,feature,class
0,"[-362.34073, 119.35963, -16.452494, 36.799488,...",0
1,"[-248.36684, 133.98819, -18.578735, 39.28384, ...",0
2,"[-308.10703, 117.68871, -29.507095, 27.962593,...",0
3,"[-308.06207, 146.96207, -39.447163, 44.44801, ...",0
4,"[-322.19952, 121.23923, -58.719273, 26.004354,...",0


## Combining the male and female extracted-feature dataframes and shuffling the rows

In [65]:
# Stack the male and female rows into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([male_extracted_features_df, female_extracted_features_df], ignore_index=True)
# Shuffle all rows. The fixed random_state makes the shuffle (and therefore
# everything derived from it, such as the train/test split) reproducible
# across kernel restarts; without it every run produced a different order.
voice_gender_df = combined_df.sample(frac=1, random_state=0).reset_index(drop=True)


## Showing the head of Combined and shuffled dataframe of voice features of males and females

In [66]:
# Preview the first five rows of the shuffled frame.
voice_gender_df.head(n=5)

Unnamed: 0,feature,class
0,"[-162.06631, 124.78082, -22.167408, 61.56928, ...",1
1,"[-412.52313, 128.56348, -8.435583, 38.695747, ...",0
2,"[-241.20815, 111.89663, -79.51047, 64.198654, ...",0
3,"[-304.68054, 135.51631, -18.998655, 38.288166,...",0
4,"[-255.4628, 150.92754, -27.820621, 54.325863, ...",0


## Separating the input features and output features of dataframe created

In [67]:
# Pull both columns out as plain Python lists first, then turn them into
# NumPy arrays: x holds the feature vectors, y the class labels.
feature_rows = voice_gender_df['feature'].tolist()
class_labels = voice_gender_df['class'].tolist()
x = np.array(feature_rows)
y = np.array(class_labels)

## Saving the dataframe for future use

In [69]:
# Persist the shuffled frame (row index included) as CSV.
# NOTE(review): the 'feature' column is populated from features_extractor()
# results, so each cell is presumably written as the text form of an array
# rather than as separate numeric columns - confirm it can be parsed back.
voice_gender_df.to_csv(path_or_buf='/kaggle/working/voice_gender.csv')

## Applying the train test split on the input and output features of voice data

In [73]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Importing required libraries

In [98]:
# Deep-learning stack used by the model definition and training cells.
import tensorflow as tf
import warnings as warn
# Silence every Python warning from this point on in the session.
warn.filterwarnings('ignore')
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

## Defining the Architecture of DL model

In [77]:

# Fully connected binary classifier over the 40-value MFCC vector:
# three ReLU hidden layers (100 -> 200 -> 100 units), each followed by 50%
# dropout, and a single sigmoid output unit.
num_labels = 1

model = Sequential([
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_labels),
    Activation('sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


## Printing the summary of the above defined deep learning architecture

In [78]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 100)               4100      
                                                                 
 activation_4 (Activation)   (None, 100)               0         
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 200)               20200     
                                                                 
 activation_5 (Activation)   (None, 200)               0         
                                                                 
 dropout_4 (Dropout)         (None, 200)               0         
                                                                 
 dense_6 (Dense)             (None, 100)              

## Setting up the model checkpoint callback to save the best model found during training

In [76]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Training hyper-parameters.
num_epochs = 100
num_batch_size = 32

# Keep only the best model seen so far on disk; 'val_loss' is the metric
# ModelCheckpoint monitors by default, spelled out here for clarity.
checkpointer = ModelCheckpoint(
    filepath='saved_models/voice_clzfier.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

## Training the model

In [79]:

# Train the network; the checkpoint callback saves the model whenever the
# validation loss improves.
# NOTE(review): the held-out test split is also passed as validation data, so
# the saved "best" model is selected on the test set - confirm this is intended.
model.fit(
    X_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)

Epoch 1/100


I0000 00:00:1708452301.092330   37905 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1: val_loss improved from inf to 0.71134, saving model to saved_models/voice_clzfier.hdf5
Epoch 2/100

  saving_api.save_model(


Epoch 2: val_loss improved from 0.71134 to 0.68429, saving model to saved_models/voice_clzfier.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.68429 to 0.66789, saving model to saved_models/voice_clzfier.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.66789 to 0.61158, saving model to saved_models/voice_clzfier.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.61158 to 0.50146, saving model to saved_models/voice_clzfier.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 0.50146 to 0.42708, saving model to saved_models/voice_clzfier.hdf5
Epoch 7/100
Epoch 7: val_loss improved from 0.42708 to 0.38640, saving model to saved_models/voice_clzfier.hdf5
Epoch 8/100
Epoch 8: val_loss improved from 0.38640 to 0.35142, saving model to saved_models/voice_clzfier.hdf5
Epoch 9/100
Epoch 9: val_loss improved from 0.35142 to 0.32144, saving model to saved_models/voice_clzfier.hdf5
Epoch 10/100
Epoch 10: val_loss improved from 0.32144 to 0.30555, saving model to saved_models/voice_clzfier.hdf5
Ep

<keras.src.callbacks.History at 0x7e37bf288e80>

## Creating the prediction setup and converting mp3 to wav file

In [17]:
import tensorflow as tf
import librosa
import numpy as np

## Defining the prediction method

In [20]:
def predict(audio_file, model=None, model_path='voice_clzfier.hdf5'):
    """Predict the class label (0 or 1) for the speaker in an audio file.

    The file is reduced to the same 40-value mean-MFCC vector that
    ``features_extractor`` produces, and that vector is fed to the model as a
    batch of one.

    Parameters
    ----------
    audio_file : path to an audio file readable by ``librosa.load``.
    model : optional, an already loaded model exposing ``predict``. Pass one
        when classifying several files so the model is not re-read from disk
        on every call. When omitted, the model is loaded from ``model_path``.
    model_path : path of the saved Keras model used when ``model`` is None.

    Returns
    -------
    int
        The single sigmoid output rounded to 0 or 1. In the training data
        built in this notebook, male recordings carry label 1 and female
        recordings label 0.

    Raises
    ------
    ValueError
        If the model returns anything other than exactly one value.
    """
    if model is None:
        model = tf.keras.models.load_model(model_path)
    audio, sample_rate = librosa.load(audio_file)
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    # The model expects a batch dimension, so wrap the vector as one row.
    features = np.expand_dims(mfccs_scaled_features, axis=0)
    # .item() extracts the lone scalar explicitly; calling int() directly on a
    # (1, 1) array is deprecated since NumPy 1.25.
    probability = np.asarray(model.predict(features)).item()
    return int(np.round(probability))

In [21]:
# Classify a sample recording; the mp3 path is handed to predict() as-is.
# NOTE(review): the training cells label male as 1 and female as 0 - assumes
# the loaded model file is the one trained above; confirm.
predict('test_audios/prabhas.mp3')



1