In [2]:
import os
import pandas as pd
import librosa
import numpy as np
import math
import scipy.io.wavfile, scipy.signal
from scipy.spatial import distance
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Pre-Processing Data
- Load Ground Truth from annotation.csv at 44.1kHz
- Load audio files for each song
    1. Average L+R to convert to mono audio
    2. Divide into 5 second windows with 1 second hop\
    3. Label each hop based on whether its midpoint is within the start and end times annotated 

In [8]:
# BLOCK AUDIO
def block_audio(x,blockSize,hopSize,fs):    
    # allocate memory    
    numBlocks = math.ceil(x.size / hopSize)    
    xb = np.zeros([numBlocks, blockSize])    
    # compute time stamps    
    t = (np.arange(0, numBlocks) * hopSize) / fs   
    t_mid = t + (0.5*blockSize/fs)
    x = np.concatenate((x, np.zeros(blockSize)),axis=0)    
    for n in range(0, numBlocks):        
        i_start = n * hopSize        
        i_stop = np.min([x.size - 1, i_start + blockSize - 1])        
        xb[n][np.arange(0,blockSize)] = x[np.arange(i_start, i_stop + 1)]    
    return (xb,t,t_mid)


In [9]:
video_id = '_duhhVa-dk8'

In [10]:
x,sr = librosa.load('./resources/dataset/Audio/processed/'+video_id+'.wav',sr=44100,mono=True)#scipy.io.wavfile.read('./resources/dataset/Audio/processed/'+file_name)

In [11]:
blockSize = int(sr * 1)
hopSize = int(sr * 0.5)

xb,t,t_mid = block_audio(x,blockSize,hopSize,sr)

In [12]:
lut=pd.read_csv('./resources/dataset/Annotations/final/annotation.csv')

In [13]:
# Testing logic for labelling data based on ground truth

blocks = t_mid.size
i=0

for ts in t_mid:
    for idx,row in lut[lut['video_id'] == video_id].reset_index().iterrows():
        annotated_start = row['timestamp_start']
        annotated_end = row['timestamp_end']
        if annotated_start <= ts <= annotated_end:
            #print(f"ts - {ts}, start - {annotated_start} and end - {annotated_end}")
            i+=1
            break
    if ~(annotated_start <= ts <= annotated_end):
        i+=1         


Below paragraph takes about 282 s

In [14]:
lut=pd.read_csv('./resources/dataset/Annotations/final/annotation.csv')
i=0
for video_id in lut['video_id'].unique():
    x,sr = librosa.load('./resources/dataset/Audio/processed/'+video_id+'.wav',sr=44100,mono=True)
    blockSize = int(sr * 1)
    hopSize = int(sr * 0.5)

    xb,t,t_mid = block_audio(x,blockSize,hopSize,sr)
    labels=[]
    for ts in t_mid:
        for idx,row in lut[lut['video_id'] == video_id].reset_index().iterrows():
            annotated_start = row['timestamp_start']
            annotated_end = row['timestamp_end']
            if annotated_start <= ts <= annotated_end:
                labels.append(row['scream_type'])
                break
        if ~(annotated_start <= ts <= annotated_end):
            labels.append('no_vocals')
    # Create new dataframs
    if i == 0:
        df=pd.DataFrame()

        df['t'] = t
        df['t_mid'] = t_mid
        blocks=[]
        for blk in xb:
            blocks.append(blk)
        df.insert(0,'video_id',video_id)
        df.insert(3,'label',labels)
        df['xb'] = blocks
        i+=1
    else:
        tmp=pd.DataFrame()
        tmp['t'] = t
        tmp['t_mid'] = t_mid
        blocks=[]
        for blk in xb:
            blocks.append(blk)
        tmp['video_id'] = video_id
        tmp['label'] = labels
        tmp['xb'] = blocks
        df=df.append(tmp)
out = df.to_numpy()
np.save('./resources/working_data/data.npy', out)
    #df.to_csv('./resources/working_data/'+video_id+'.csv',header=True, index=False,encoding='utf-8-sig',sep='\t')


# Extract Features
## 13 delta_mfccs, ZCR, Spectral Crest, Spectral Centroid
- Normalize the features across the entire dataset
- Extract mean, std dev of the feature value per block 
- Calculate change in feature from one block to another


Below paragraph takes about 16 s

In [60]:
d=np.load('./resources/working_data/data.npy',allow_pickle=True)
df = pd.DataFrame(d,columns=['video_id','ts','mid_ts','label','audio'])

lut = pd.read_csv('./resources/dataset/lookup_new.csv')

from sklearn.model_selection import GroupShuffleSplit
train_inds, test_inds = next(GroupShuffleSplit(test_size=.33, n_splits=2, random_state = 0).split(lut, groups=lut['band_name']))

train = lut.iloc[train_inds]
test = lut.iloc[test_inds]

train_ids = train['video_id'].to_numpy()
test_ids = test['video_id'].to_numpy()

In [59]:
def agg_mfccs(x):
    mfccs = librosa.feature.mfcc(x,n_mfcc = 13)
    mean = [np.mean(feature) for feature in mfccs]
    std = [np.std(feature) for feature in mfccs]
    mfcc_delta = librosa.feature.delta(mfccs)
    delta_mean=[np.mean(feature) for feature in mfcc_delta]
    delta_std=[np.std(feature) for feature in mfcc_delta]
    return mean,std,delta_mean,delta_std

In [65]:
def extract_features(x):
    #MFCCs
    mfcc_mean,mfcc_std,delta_mfcc_mean,delta_mfcc_std = agg_mfccs(x)
    #ZCR
    zcr=librosa.feature.zero_crossing_rate(x)


    
    return mfcc_mean,mfcc_std,delta_mfcc_mean,delta_mfcc_std,zcr

Below paragraph takes about 420 s

In [61]:
df['zcr'] = ''
df['average_zcr'] = ''
df['zcr_stddev'] = ''

#df['mfccs'] = ''
df['mfcc_mean'] = ''
df['mfcc_std'] = ''

df['delta_mfcc_mean'] = ''
df['delta_mfcc_std'] = ''

for i in range(len(df)):
    # Calculate ZCR (3 features)
    zcr=librosa.feature.zero_crossing_rate(df['audio'][i])
    df['zcr'][i] = zcr
    df['average_zcr'][i] = np.mean(zcr)
    df['zcr_stddev'][i] = np.std(zcr)

    # Extract 13 MFCCs - get mean and std deviation for each (26 features) + Delta MFCCs (26 features) = total 52 Features
    mean,std,delta_mean,delta_std = agg_mfccs(df['audio'][i])
    #df['mfccs'][i] = mfccs[0]
    df['mfcc_mean'][i] = mean
    df['mfcc_std'][i] = std

    df['delta_mfcc_mean'][i] = delta_mean
    df['delta_mfcc_std'][i] = delta_std

# Find change in MFCC from one block to another (group by video_id)


In [62]:
df[['mfcc1_mean','mfcc2_mean','mfcc3_mean','mfcc4_mean','mfcc5_mean','mfcc6_mean','mfcc7_mean','mfcc8_mean','mfcc9_mean','mfcc10_mean','mfcc11_mean','mfcc12_mean','mfcc13_mean']]=pd.DataFrame(df.mfcc_mean.tolist(), index= df.index)
df[['mfcc1_std','mfcc2_std','mfcc3_std','mfcc4_std','mfcc5_std','mfcc6_std','mfcc7_std','mfcc8_std','mfcc9_std','mfcc10_std','mfcc11_std','mfcc12_std','mfcc13_std']]=pd.DataFrame(df.mfcc_std.tolist(), index= df.index)

df[['delta_mfcc1_mean','delta_mfcc2_mean','delta_mfcc3_mean','delta_mfcc4_mean','delta_mfcc5_mean','delta_mfcc6_mean','delta_mfcc7_mean','delta_mfcc8_mean','delta_mfcc9_mean','delta_mfcc10_mean','delta_mfcc11_mean','delta_mfcc12_mean','delta_mfcc13_mean']]=pd.DataFrame(df.delta_mfcc_mean.tolist(), index= df.index)
df[['delta_mfcc1_std','delta_mfcc2_std','delta_mfcc3_std','delta_mfcc4_std','delta_mfcc5_std','delta_mfcc6_std','delta_mfcc7_std','delta_mfcc8_std','delta_mfcc9_std','delta_mfcc10_std','delta_mfcc11_std','delta_mfcc12_std','delta_mfcc13_std']]=pd.DataFrame(df.delta_mfcc_std.tolist(), index= df.index)

In [63]:
df.columns

Index(['video_id', 'ts', 'mid_ts', 'label', 'audio', 'zcr', 'average_zcr',
       'zcr_stddev', 'mfcc_mean', 'mfcc_std', 'delta_mfcc_mean',
       'delta_mfcc_std', 'mfcc1_mean', 'mfcc2_mean', 'mfcc3_mean',
       'mfcc4_mean', 'mfcc5_mean', 'mfcc6_mean', 'mfcc7_mean', 'mfcc8_mean',
       'mfcc9_mean', 'mfcc10_mean', 'mfcc11_mean', 'mfcc12_mean',
       'mfcc13_mean', 'mfcc1_std', 'mfcc2_std', 'mfcc3_std', 'mfcc4_std',
       'mfcc5_std', 'mfcc6_std', 'mfcc7_std', 'mfcc8_std', 'mfcc9_std',
       'mfcc10_std', 'mfcc11_std', 'mfcc12_std', 'mfcc13_std',
       'delta_mfcc1_mean', 'delta_mfcc2_mean', 'delta_mfcc3_mean',
       'delta_mfcc4_mean', 'delta_mfcc5_mean', 'delta_mfcc6_mean',
       'delta_mfcc7_mean', 'delta_mfcc8_mean', 'delta_mfcc9_mean',
       'delta_mfcc10_mean', 'delta_mfcc11_mean', 'delta_mfcc12_mean',
       'delta_mfcc13_mean', 'delta_mfcc1_std', 'delta_mfcc2_std',
       'delta_mfcc3_std', 'delta_mfcc4_std', 'delta_mfcc5_std',
       'delta_mfcc6_std', 'delta_mfcc7_s

In [64]:
np.save('./resources/working_data/data_with_features.npy', df.to_numpy())

In [68]:
df.shape

(33820, 64)

In [69]:
train = df[df.video_id.isin(train_ids)]
test = df[df.video_id.isin(test_ids)]

In [70]:
train.groupby('label')['audio'].count()

label
clean         1988
highfry        659
layered        392
lowfry         161
midfry        5602
no_vocals    10836
Name: audio, dtype: int64

In [71]:
test.groupby('label')['audio'].count()

label
clean         474
highfry       664
layered       485
lowfry        394
midfry       3932
no_vocals    8233
Name: audio, dtype: int64

In [84]:
X_train = train[['average_zcr','zcr_stddev','mfcc1_mean', 'mfcc2_mean', 'mfcc3_mean','mfcc4_mean', 'mfcc5_mean', 'mfcc6_mean', 'mfcc7_mean', 'mfcc8_mean','mfcc9_mean', 'mfcc10_mean', 'mfcc11_mean', 'mfcc12_mean','mfcc13_mean', 'mfcc1_std', 'mfcc2_std', 'mfcc3_std', 'mfcc4_std','mfcc5_std', 'mfcc6_std', 'mfcc7_std', 'mfcc8_std', 'mfcc9_std','mfcc10_std', 'mfcc11_std', 'mfcc12_std', 'mfcc13_std','delta_mfcc1_mean', 'delta_mfcc2_mean', 'delta_mfcc3_mean','delta_mfcc4_mean', 'delta_mfcc5_mean', 'delta_mfcc6_mean','delta_mfcc7_mean', 'delta_mfcc8_mean', 'delta_mfcc9_mean','delta_mfcc10_mean', 'delta_mfcc11_mean', 'delta_mfcc12_mean','delta_mfcc13_mean', 'delta_mfcc1_std', 'delta_mfcc2_std','delta_mfcc3_std', 'delta_mfcc4_std', 'delta_mfcc5_std','delta_mfcc6_std', 'delta_mfcc7_std', 'delta_mfcc8_std','delta_mfcc9_std', 'delta_mfcc10_std', 'delta_mfcc11_std','delta_mfcc12_std', 'delta_mfcc13_std']].to_numpy()
y_train=train[['label']].to_numpy()

X_test = test[['average_zcr','zcr_stddev', 'mfcc1_mean', 'mfcc2_mean', 'mfcc3_mean','mfcc4_mean', 'mfcc5_mean', 'mfcc6_mean', 'mfcc7_mean', 'mfcc8_mean','mfcc9_mean', 'mfcc10_mean', 'mfcc11_mean', 'mfcc12_mean','mfcc13_mean', 'mfcc1_std', 'mfcc2_std', 'mfcc3_std', 'mfcc4_std','mfcc5_std', 'mfcc6_std', 'mfcc7_std', 'mfcc8_std', 'mfcc9_std','mfcc10_std', 'mfcc11_std', 'mfcc12_std', 'mfcc13_std','delta_mfcc1_mean', 'delta_mfcc2_mean', 'delta_mfcc3_mean','delta_mfcc4_mean', 'delta_mfcc5_mean', 'delta_mfcc6_mean','delta_mfcc7_mean', 'delta_mfcc8_mean', 'delta_mfcc9_mean','delta_mfcc10_mean', 'delta_mfcc11_mean', 'delta_mfcc12_mean','delta_mfcc13_mean', 'delta_mfcc1_std', 'delta_mfcc2_std','delta_mfcc3_std', 'delta_mfcc4_std', 'delta_mfcc5_std','delta_mfcc6_std', 'delta_mfcc7_std', 'delta_mfcc8_std','delta_mfcc9_std', 'delta_mfcc10_std', 'delta_mfcc11_std','delta_mfcc12_std', 'delta_mfcc13_std']].to_numpy()
y_test = test[['label']].to_numpy()

In [87]:
(test.shape[0] + train.shape[0])**0.5

183.90214789392755

# Classify!

Train the classifier with the X_train and Y_train, and then make predictions based on X_test\
Compare the predictions with Y_test and calculate accuracy score and show the confusion matrix

## Create Train Test Split

In [95]:
k = 500#13
KNN_model = KNeighborsClassifier(n_neighbors=k)
KNN_model.fit(X_train, y_train)
KNN_prediction = KNN_model.predict(X_test)
print(accuracy_score(KNN_prediction, y_test))

print(classification_report(KNN_prediction, y_test))

  return self._fit(X, y)


0.6536454660837682
              precision    recall  f1-score   support

       clean       0.00      0.00      0.00        64
     highfry       0.00      0.29      0.01         7
     layered       0.00      0.00      0.00         0
      lowfry       0.00      0.00      0.00         0
      midfry       0.71      0.49      0.58      5676
   no_vocals       0.79      0.77      0.78      8435

    accuracy                           0.65     14182
   macro avg       0.25      0.26      0.23     14182
weighted avg       0.75      0.65      0.69     14182



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
cm = confusion_matrix(y_test, KNN_prediction) #'midfry' :0, 'clean' :1, 'highfry' :2,'lowfry' :3, 'layered' :4}
print(cm)
score=precision_score(y_test, KNN_prediction, average='macro')
print(score)

[[   0    0    0    0  150  324]
 [   0    2    0    0  393  269]
 [   0    0    0    0  342  143]
 [   0    0    0    0  281  113]
 [  60    4    0    0 2775 1093]
 [   4    1    0    0 1735 6493]]
0.25739729005916406


  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
k_plt=[]
score_plt=[]
for k in range(100,1000):
    
    k_plt.append(k)
    KNN_model = KNeighborsClassifier(n_neighbors=k)
    KNN_model.fit(X_train, y_train)
    KNN_prediction = KNN_model.predict(X_test)
    score=precision_score(y_test, KNN_prediction, average='macro')
    score_plt.append(score)

plt.plot(k_plt,score_plt)
    

  return self._fit(X, y)
  return self._fit(X, y)


In [None]:
from matplotlib import pyplot as plt 
k_plt=[]
score_plt=[]
k=10
if k <=1000:
    k_plt.append(k)
    KNN_model = KNeighborsClassifier(n_neighbors=k)
    KNN_model.fit(X_train, y_train)
    KNN_prediction = KNN_model.predict(X_test)
    score=precision_score(y_test, KNN_prediction, average='macro')
    score_plt.append(score)
    k=k*1.05
plt.plot(k_plt,score_plt)