<a href="https://www.kaggle.com/code/sarathmohan9469/malayalam-male-and-female-voice-classification?scriptVersionId=236178939" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing the general purpose libraries

In [None]:
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt

# Importing the audio libraries

In [None]:
import librosa
import librosa.display
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the female line index (tab-separated: file id, sentence).
# Uses the absolute /kaggle/input mount, like every other dataset path in this
# notebook, so the cell does not depend on the current working directory.
# NOTE: the first record is consumed as the header here; it is restored as a
# data row in a later cell.
f_df = pd.read_csv('/kaggle/input/malayalam-multispeaker-speech-data-set/line_index_female.tsv', sep='\t')

In [None]:
# Preview the first rows of the female index as loaded.
f_df.head()

In [None]:
# The first record of the TSV was read as the column header. Keep it, give the
# frame proper column names, and append the rescued record as a normal row.
header_values=list(f_df.columns)
f_df.columns=["path","sentence"]
rescued_row=pd.DataFrame([header_values],columns=["path","sentence"])
f_df=pd.concat([f_df,rescued_row],ignore_index=True)
# The index stores bare file ids; add the extension so they match file names.
f_df["path"]=f_df["path"]+".wav"
f_df.head()

In [None]:
# Load the male line index (tab-separated: file id, sentence).
# NOTE: the first record is consumed as the header here; it is restored as a
# data row in a later cell.
m_df = pd.read_csv('/kaggle/input/malayalam-multispeaker-speech-data-set/line_index_male.tsv', sep='\t')

In [None]:
# Preview the first rows of the male index as loaded.
m_df.head()

In [None]:
# The first record of the TSV was read as the column header. Keep it, give the
# frame proper column names, and append the rescued record as a normal row.
header_values=list(m_df.columns)
m_df.columns=["path","sentence"]
rescued_row=pd.DataFrame([header_values],columns=["path","sentence"])
m_df=pd.concat([m_df,rescued_row],ignore_index=True)
# The index stores bare file ids; add the extension so they match file names.
m_df["path"]=m_df["path"]+".wav"
m_df.head()

In [None]:
# Column dtypes and non-null counts for the female index.
f_df.info()

In [None]:
# Column dtypes and non-null counts for the male index.
m_df.info()

In [None]:
# Full paths of every .wav file in the female and male audio folders.
voice_f=glob.glob("/kaggle/input/malayalam-multispeaker-speech-data-set/ml_in_female/*.wav")
voice_m=glob.glob("/kaggle/input/malayalam-multispeaker-speech-data-set/ml_in_male/*.wav")

In [None]:
# Number of female recordings found on disk.
len(voice_f)

In [None]:
# Number of male recordings found on disk.
len(voice_m)

In [None]:
# Preview the female index again before working with the audio files.
f_df.head()

In [None]:
# Load one male recording and draw its waveform.
fname = '/kaggle/input/malayalam-multispeaker-speech-data-set/ml_in_male/mlm_03915_02115782201.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveshow(data, sr=sampling_rate)

# Embed an audio player so the clip can be listened to alongside the plot.
ipd.Audio(data, rate=sampling_rate)

In [None]:
# Load one female recording and draw its waveform.
fname = '/kaggle/input/malayalam-multispeaker-speech-data-set/ml_in_female/mlf_01130_01490862493.wav'
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveshow(data, sr=sampling_rate)

# Embed an audio player so the clip can be listened to alongside the plot.
ipd.Audio(data, rate=sampling_rate)

In [None]:
# Replace each bare file name in f_df["path"] with the full path found on disk.
# One vectorised lookup replaces the per-file scan and avoids chained
# assignment (f_df["path"][idx] = ...), which pandas does not guarantee to
# write back to the frame and which is a no-op under copy-on-write.
# Rows whose file is not on disk keep their bare name; files on disk that have
# no row in the index are ignored instead of raising IndexError.
female_path_by_name={os.path.basename(f_v):f_v for f_v in voice_f}
f_df["path"]=f_df["path"].map(female_path_by_name).fillna(f_df["path"])

f_df

In [None]:
# Replace each bare file name in m_df["path"] with the full path found on disk.
# One vectorised lookup replaces the per-file scan and avoids chained
# assignment (m_df["path"][idx] = ...), which pandas does not guarantee to
# write back to the frame and which is a no-op under copy-on-write.
# Rows whose file is not on disk keep their bare name; files on disk that have
# no row in the index are ignored instead of raising IndexError.
male_path_by_name={os.path.basename(m_v):m_v for m_v in voice_m}
m_df["path"]=m_df["path"].map(male_path_by_name).fillna(m_df["path"])

m_df

In [None]:
import tensorflow as tf
import tensorflow_io as tfio
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [None]:
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a 1-D mono waveform resampled to 16 kHz.

    Args:
        filename: Path of the WAV file to read.

    Returns:
        A 1-D tensor holding the decoded samples after resampling to 16000 Hz.
    """
    raw_bytes=tf.io.read_file(filename)
    # desired_channels=1 makes the decoder return a single channel.
    audio,native_rate=tf.audio.decode_wav(raw_bytes,desired_channels=1)
    # Drop the trailing channel axis so only the sample axis remains.
    mono=tf.squeeze(audio,axis=-1)
    # The resampler is given the input rate as int64.
    native_rate=tf.cast(native_rate,dtype=tf.int64)
    return tfio.audio.resample(mono,rate_in=native_rate,rate_out=16000)

In [None]:
# Empty (all-NaN) frame with one row per f_df row and columns for the paired
# male path, female path and the shared sentence.
same_sent_df=pd.DataFrame(columns=["male_path","female_path","sentence"],index=pd.RangeIndex(len(f_df)))

In [None]:
# Pair every male sentence that also occurs in the female index with the first
# male and the first female recording of that sentence.
# Dict lookups replace the per-iteration list rebuild and frame scans (which
# made the loop quadratic), and the rows are collected first and turned into a
# frame once, instead of being written cell by cell through chained assignment.
first_female_path={}
for sent,wav_path in zip(f_df["sentence"],f_df["path"]):
    first_female_path.setdefault(sent,wav_path)

first_male_path={}
for sent,wav_path in zip(m_df["sentence"],m_df["path"]):
    first_male_path.setdefault(sent,wav_path)

paired_rows=[
    (first_male_path[sent],first_female_path[sent],sent)
    for sent in tqdm(list(m_df["sentence"]))
    if sent in first_female_path
]

same_sent_df=pd.DataFrame(paired_rows,columns=["male_path","female_path","sentence"])
# Discard any pair that has a missing path or sentence.
same_sent_df.dropna(inplace=True)

# Dataframe showing same sentences in the two voices

In [None]:
# Display the paired male/female recordings.
same_sent_df

In [None]:
# Waveform of the male recording from the first sentence pair.
fname = same_sent_df["male_path"][0]
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveshow(data, sr=sampling_rate)

# Embed an audio player so the clip can be listened to alongside the plot.
ipd.Audio(data, rate=sampling_rate)

In [None]:
# Waveform of the female recording from the first sentence pair.
fname = same_sent_df["female_path"][0]
data, sampling_rate = librosa.load(fname)
plt.figure(figsize=(15, 5))
librosa.display.waveshow(data, sr=sampling_rate)

# Embed an audio player so the clip can be listened to alongside the plot.
ipd.Audio(data, rate=sampling_rate)

In [None]:
# Load the first sentence pair as 16 kHz mono waveforms.
mwav=load_wav_16k_mono(same_sent_df["male_path"][0])
fwav=load_wav_16k_mono(same_sent_df["female_path"][0])

In [None]:
# Overlay both waveforms on one axis (female drawn first, male on top).
plt.plot(fwav)
plt.plot(mwav)

plt.show()

In [None]:
# Sample counts (at 16 kHz) of every paired male and female recording.
male_lengths=[]
female_lengths=[]
path_pairs=zip(list(same_sent_df["male_path"]),list(same_sent_df["female_path"]))
for male_path,female_path in tqdm(path_pairs):
    male_wave=load_wav_16k_mono(male_path)
    male_lengths.append(len(male_wave))
    female_wave=load_wav_16k_mono(female_path)
    female_lengths.append(len(female_wave))

In [None]:
# Mean clip length per group, in samples of the 16 kHz waveform.
print(f"Male voice average length: {np.mean(male_lengths)}")
print(f"Female voice average length: {np.mean(female_lengths)}")

In [None]:
# Longest and shortest clip per group, in samples of the 16 kHz waveform.
print(f"Male voice maximum length: {np.max(male_lengths)}")
print(f"Female voice maximum length: {np.max(female_lengths)}")
print(f"Male voice minimum length: {np.min(male_lengths)}")
print(f"Female voice minimum length: {np.min(female_lengths)}")

In [None]:
def preprocess(file_path):
    """Turn a WAV file into a fixed-size magnitude spectrogram.

    The waveform is loaded at 16 kHz mono, cut to at most 70000 samples and
    left-padded with zeros up to exactly 70000 samples before the STFT.

    Args:
        file_path: Path of the WAV file to convert.

    Returns:
        The STFT magnitude with a trailing channel axis of size 1 added.
    """
    target_len=70000
    samples=load_wav_16k_mono(file_path)[:target_len]
    # Zeros go in front of the audio, so short clips end up right-aligned.
    leading_zeros=tf.zeros([target_len]-tf.shape(samples),dtype=tf.float32)
    fixed_wave=tf.concat([leading_zeros,samples],0)
    magnitude=tf.abs(tf.signal.stft(fixed_wave,frame_length=320,frame_step=32))
    # Add a channel axis so the result can be fed to 2-D convolutions.
    return tf.expand_dims(magnitude,axis=2)

# Spectrogram of the same sentence by male and female voice

In [None]:
# Show the spectrogram of the first sentence pair, male first, then female.
for column,title in (
    ("male_path","Male voice spectrogram"),
    ("female_path","Female voice spectrogram"),
):
    plt.figure(figsize=(30,20))
    # Transposing moves the channel axis to the front; [0] selects that channel.
    plt.imshow(tf.transpose(preprocess(same_sent_df[column][0]))[0])
    plt.title(title)
    plt.show()

In [None]:
# Class labels: 1 = female, 0 = male (scalar assignment fills every row).
f_df["label"]=1
m_df["label"]=0
f_df.head()

In [None]:
# Stack female rows on top of male rows and renumber the index from 0.
complete_dataframe=pd.concat([f_df,m_df],ignore_index=True)
complete_dataframe

In [None]:
# Hold out 30% of the recordings for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(complete_dataframe["path"], complete_dataframe["label"], test_size=0.30, random_state=42)

In [None]:
# Renumber all four splits from 0 so they can be sliced by position.
X_train,y_train,X_test,y_test=(
    split.reset_index(drop=True)
    for split in (X_train,y_train,X_test,y_test)
)

# Building the model

In [None]:
def build_model():
    """Build the uncompiled CNN that classifies a spectrogram into two classes.

    The network takes a (2178, 257, 1) input, normalizes it, applies four
    stages of two 3x3 ReLU convolutions followed by max pooling (16, 8, 8 and
    4 filters), then a 128-unit dense layer and a 2-way softmax.

    Returns:
        A tf.keras Model mapping the spectrogram input to class probabilities.
    """
    layers=tf.keras.layers
    inp=layers.Input(shape=(2178,257,1))
    x=layers.Normalization()(inp)

    # One stage = conv -> conv -> pool, with the filter count shrinking.
    for filters in (16,8,8,4):
        x=layers.Conv2D(filters,(3,3),activation="relu")(x)
        x=layers.Conv2D(filters,(3,3),activation="relu")(x)
        x=layers.MaxPooling2D()(x)

    x=layers.Flatten()(x)
    x=layers.Dense(128,activation="relu")(x)
    out=layers.Dense(2,activation="softmax")(x)

    return tf.keras.models.Model(inputs=inp,outputs=out)

In [None]:
# Instantiate the network and print its layer-by-layer summary.
model=build_model()
model.summary()

In [None]:
# The sparse loss takes integer class labels (0/1) against the 2-way softmax.
model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [None]:
# Number of passes over the training samples.
EPOCHS=5

# Training the model

In [None]:
# Manual training loop with batch size 1: each recording is preprocessed on
# the fly, so the whole set of spectrograms is never held in memory at once.
train_loss_per_epoch=[]
train_acc_per_epoch=[]

val_loss_per_epoch=[]
val_acc_per_epoch=[]

# The first 2000 training samples are used for fitting; the remainder of the
# training split is held out for validation.
fit_paths,fit_labels=X_train[:2000],y_train[:2000]
val_paths,val_labels=X_train[2000:],y_train[2000:]

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}:")
    train_loss_per_iter=[]
    train_acc_per_iter=[]
    for vpath,vlabel in tqdm(zip(fit_paths,fit_labels),total=len(fit_paths)):
        x_tr=np.array([preprocess(vpath)])
        y_tr=np.array([vlabel])
        loss,acc=model.train_on_batch(x_tr,y_tr)
        train_loss_per_iter.append(loss)
        train_acc_per_iter.append(acc)
    t_loss=np.mean(train_loss_per_iter)
    t_acc=np.mean(train_acc_per_iter)
    train_loss_per_epoch.append(t_loss)
    train_acc_per_epoch.append(t_acc)
    print(f"Train loss at epoch {epoch+1} is {t_loss}")
    print(f"Train accuracy at epoch {epoch+1} is {t_acc}")

    val_loss_per_iter=[]
    val_acc_per_iter=[]
    for vpath,vlabel in tqdm(zip(val_paths,val_labels),total=len(val_paths)):
        x_val=np.array([preprocess(vpath)])
        y_val=np.array([vlabel])
        # test_on_batch only evaluates: no weight update happens on held-out
        # data, so the validation metrics and checkpoint choice stay unbiased.
        loss,acc=model.test_on_batch(x_val,y_val)
        val_loss_per_iter.append(loss)
        val_acc_per_iter.append(acc)
    v_loss=np.mean(val_loss_per_iter)
    v_acc=np.mean(val_acc_per_iter)
    val_loss_per_epoch.append(v_loss)
    val_acc_per_epoch.append(v_acc)
    print(f"Validation loss at epoch {epoch+1} is {v_loss}")
    print(f"Validation accuracy at epoch {epoch+1} is {v_acc}")

    # Checkpoint after the first epoch, then only when validation accuracy
    # strictly beats every earlier epoch; modelpath tracks the best checkpoint.
    if epoch==0 or val_acc_per_epoch[-1]>np.max(val_acc_per_epoch[:-1]):
        modelpath=f"{epoch+1}.hdf5"
        model.save(modelpath)
    
    
        
        
        
        
    

# Testing and inference

In [None]:

# Per-epoch training and validation curves: loss first, then accuracy.
for title,train_curve,val_curve in (
    ("Loss",train_loss_per_epoch,val_loss_per_epoch),
    ("Accuracy",train_acc_per_epoch,val_acc_per_epoch),
):
    plt.plot(train_curve,label="train")
    plt.plot(val_curve,label="validation")
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# File name of the last checkpoint written by the training loop.
modelpath

In [None]:
# Materialise the whole test split: one spectrogram per path, plus its label.
x_test_final=np.array([preprocess(xtest) for xtest in list(X_test)])
labels=np.array(list(y_test))

# Sanity check: spectrogram batch shape and label count.
print(np.shape(x_test_final))
print(len(labels))

# Using model with best validation accuracy

In [None]:
# Reload the checkpoint named by modelpath, replacing the in-memory model.
model=tf.keras.models.load_model(modelpath)

In [None]:
# Loss and accuracy of the reloaded model on the held-out test split.
loss,acc=model.evaluate(x_test_final,labels)
print(f"Test accuracy is: {acc*100} %")
print(f"Test loss is: {loss}")

In [None]:
# Model output for each test sample (one score per class from the softmax).
pr=model.predict(x_test_final)

In [None]:
# Predicted class of each sample = index of its highest softmax score.
predict=[np.argmax(p) for p in pr]
    
    

In [None]:
# Number of predictions produced.
len(predict)

In [None]:
# Per-class precision/recall/F1; names follow the label order 0 = Male, 1 = Female.
print(classification_report(labels, predict, target_names=["Male","Female"]))

In [None]:
# Confusion matrix of true vs. predicted labels; display labels follow 0 = Male, 1 = Female.
cm = confusion_matrix(labels, predict)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=["Male","Female"])

In [None]:
# Render the confusion matrix.
disp.plot()
plt.show()

In [None]:
# Save the currently loaded model under a fixed file name.
model.save("best_model.hdf5")