In [None]:
!unzip /content/drive/MyDrive/Copy_of_Mosaic23_PS1_TrainData.zip

In [None]:
### importing libraries
import os
import librosa
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def extract(file_path, duration=20):
    """Load an audio file and return its log-scaled (dB) mel spectrogram.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to read with ``librosa.load``.
    duration : float or None, optional
        Maximum number of seconds to load from the start of the file.
        Defaults to 20, the value that used to be hard-coded; ``None``
        loads the whole file.

    Returns
    -------
    numpy.ndarray
        The dB-scaled mel spectrogram, transposed: axis 0 is time (frames),
        axis 1 is the 128 mel bands.
    """
    # No ``sr`` is passed, so librosa resamples to its own default rate
    audio, sample_rate = librosa.load(file_path, duration=duration)

    # melspectrogram is called with its default exponent, i.e. it yields a
    # *power* spectrogram
    mel_power = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128, hop_length=512, n_fft=2048)

    # A power spectrogram must be converted with power_to_db (10*log10);
    # amplitude_to_db (20*log10) would double every dB value and distort the
    # dynamic-range clipping. ref=np.max makes the loudest bin 0 dB.
    log_spectrogram = librosa.power_to_db(mel_power, ref=np.max)

    return log_spectrogram.T

In [None]:
## Collect the path of every recording (.wav) and of its annotation file (.txt, same base name)
directory = '/content/ICBHI_final_database'
audio_file_names=[]
text_file_names=[]
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# sample order (and therefore the seeded train/test split) differ between runs
for file in sorted(os.listdir(directory)):
  # splitext only strips the final extension, so base names containing dots survive
  stem, extension = os.path.splitext(file)
  if extension == '.wav':
    audio_file_names.append(os.path.join(directory, file))
    text_file_names.append(os.path.join(directory, stem + '.txt'))

In [None]:
## Now that we have got all the file names with we should get into loading the audio file and extracting the features from the audio files
## And extract the labels from the corresponding text files
audio_features=[]
audio_labels=[]

for index in range(len(audio_file_names)):

  ## here we are receiving the features from the audio files as an array of size 128 X 862
  spec = extract(audio_file_names[index])

  ## Now we need to read each corresponding text files for the labels of crackles and wheezes
  df=pd.read_csv(text_file_names[index], sep='\t', header=None, names=['start_time', 'end_time', 'crackle', 'wheeze'])
  temp_label=[0,0]
  for i, row in df.iterrows():

    ## The start and end time are float values so we convert them into proper int values so as to take proper segments from it
    start_idx = int(float(row['start_time']) / 0.04644)
    end_idx = int(float(row['end_time']) / 0.04644)
    if row['crackle']==1:
      temp_label[0]=1
    if row['wheeze']==1:
      temp_label[1]=1
    audio_labels.append(temp_label)

    ## now that we have our labels we map the segment of the audio features array to its corresponding label
    ## we do this so that our model can understand which feature is a crackle and which is a wheeze
    audio_frame = spec[:, start_idx:end_idx]
    audio_features.append(audio_frame)


In [None]:
# Convert the list of features and labels to numpy arrays
max_len = 390  # 20s / (512/22050) ≈ 390
audio_features = np.zeros((len(audio_features), 128, max_len))
for i in range(len(audio_features)):
    audio_features[i] = librosa.util.fix_length(audio_features[i], size=max_len, axis=1)
audio_features = np.expand_dims(audio_features, axis=-1)
audio_labels = np.array(audio_labels)

In [None]:
# Axes: (segment, mel band, frame, channel)
audio_features.shape

(6110, 128, 390, 1)

In [None]:
# One row per segment; the two columns are the crackle and wheeze flags
audio_labels.shape

(6110, 2)

In [None]:
import tensorflow as tf

# Convert the numpy array to a TensorFlow tensor
# NOTE(review): features_tensor is not referenced by any later cell shown here
# (the split and training use the numpy array directly) and it likely holds a
# second copy of the feature data in memory - confirm it is still needed.
features_tensor = tf.convert_to_tensor(audio_features)

In [None]:
# Hold out 20% of the segments for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    audio_features,
    audio_labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
from tensorflow.keras import layers, models

## Defining the model: three Conv2D + MaxPooling2D stages, then a dense head
## with two sigmoid outputs (one per label column)
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 390, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(2, activation='sigmoid'),
])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 388, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 63, 194, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 192, 64)       18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 30, 96, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 94, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 14, 47, 128)      0

In [None]:
# Binary cross-entropy pairs with the two sigmoid outputs of the final layer
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, holding 5% of the training split out for validation
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.05,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predicted probabilities for the held-out split; one column per model output
y_pred = model.predict(X_test)
# flatten() only produces a new 1-D array for display - y_pred itself is unchanged -
# and the displayed values interleave the two output columns row by row
y_pred.flatten()



array([0.55787253, 0.39544937, 0.55787253, ..., 0.39544937, 0.55787253,
       0.39544937], dtype=float32)

In [None]:
# Returns [loss, accuracy] on the test split, matching the loss and metric given to compile()
model.evaluate(X_test,y_test)



[0.6780111789703369, 0.8420621752738953]

In [None]:
# Persist the trained network (architecture + weights) to a single HDF5 file
model.save('model.h5')

In [None]:
# Summarise the feature array instead of printing its full repr: the summary is
# short and immediately shows whether the features carry any signal
print('shape:', audio_features.shape)
print('min / max:', audio_features.min(), audio_features.max())
# max(..., 1) avoids a division by zero if no segments were collected
print('fraction of non-zero entries:', np.count_nonzero(audio_features) / max(audio_features.size, 1))

[[[[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  ...

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]]


 [[[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  ...

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]]


 [[[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  ...

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.]
   [0.]
   [0.]
   ...
   [0.]
   [0.]
   [0.]]

  [[0.

In [None]:
# Ground-truth [crackle, wheeze] labels of the test split
y_test

array([[1, 0],
       [0, 0],
       [0, 1],
       ...,
       [1, 0],
       [1, 1],
       [1, 1]])