In [None]:
# importing the libraries
import numpy as np
import librosa as lb 
import os
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# location of the audio recordings, followed by a listing of what it contains
path = "../Project/recordings/"
recording_files = os.listdir(path)
print(recording_files)

Checking out the metadata file

In [None]:
# read the speaker metadata table
df = pd.read_csv('../Project/speakers_all.csv')

# report the table dimensions, a divider line, and the first rows
print(df.shape, '-----------', df.head(), sep='\n')

Dropping the columns at zero-based positions 8-10, as they are not required

In [None]:
# remove the columns at positions 8, 9 and 10, then replace every missing
# value with the literal string "NaN"
unused_columns = df.columns[8:11]
df = df.drop(columns=unused_columns).fillna("NaN")
df.head()

In [None]:
# start with an empty DataFrame; the feature-extraction cell populates `newDf`
# with one row per recording (the label plus the summary features)
newDf = pd.DataFrame()


In [None]:
# Directory that holds the "<filename>.mp3" recordings named in the metadata.
recordings_dir = "../Project/recordings/"


def summary_stats(prefix, suffix, values):
    """Return the median, mean and variance of `values`.

    The keys are '<prefix>Med<suffix>', '<prefix>Mean<suffix>' and
    '<prefix>Var<suffix>', in that order.
    """
    return {
        f"{prefix}Med{suffix}": np.median(values),
        f"{prefix}Mean{suffix}": np.mean(values),
        f"{prefix}Var{suffix}": np.var(values),
    }


def extract_features(audio_path, label):
    """Load one recording and return a flat dict of its label and summary features.

    Parameters
    ----------
    audio_path : str
        Path of the audio file handed to `librosa.load`.
    label : object
        Value stored under the 'label' key (the speaker's native language).

    Returns
    -------
    dict
        'label' plus median/mean/variance entries for each of the 13 MFCC rows
        and for row 0 of the spectral centroid, spectral rolloff and chroma.
    """
    audio, sr = librosa.load(audio_path)

    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)

    record = {'label': label}

    # Key each MFCC row by its position (0..12). Using the row's contents as the
    # key would give every recording its own, unrelated set of column names.
    for position, coefficients in enumerate(mfccs):
        record.update(summary_stats('mfcc', position, coefficients))

    record.update(summary_stats('spectral_centroids', 0, spectral_centroids[0]))
    record.update(summary_stats('spectral_rolloff', 0, spectral_rolloff[0]))
    # NOTE(review): only row 0 of the chroma matrix is summarised - confirm that
    # ignoring the remaining chroma rows is intended.
    record.update(summary_stats('chroma', 0, chroma[0]))
    return record


records = []          # one feature dict per recording that exists on disk
missing_indices = []  # metadata rows whose audio file is absent

for index, row in df.iterrows():
    audio_path = os.path.join(os.path.abspath(recordings_dir), row['filename'] + ".mp3")

    # if the file does not exist in the recordings folder, skip it
    if not os.path.isfile(audio_path):
        print(f"File {row['filename']}.mp3 doesn't exist")
        missing_indices.append(index)
        continue

    # a fresh dict per recording, so rows never share or inherit keys
    features = extract_features(audio_path, row['native_language'])
    records.append(features)

    print(index)

# remove the rows without audio from the metadata in a single step
df = df.drop(index=missing_indices)

# build the feature table once instead of appending row by row
# (DataFrame.append no longer exists in pandas 2.x)
newDf = pd.DataFrame(records)

In [None]:
# Build the model inputs from the per-recording table `newDf`: the 'label'
# column becomes the target vector and every remaining column is a feature.
labels = newDf['label'].to_numpy()
features = newDf.drop(columns=['label']).to_numpy(dtype=np.float64)

In [None]:
# perform one-hot encoding on the labels: strings -> integer class ids -> one-hot rows
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# the encoder expects a 2-D column, hence the reshape
labels_onehot = onehot_encoder.fit_transform(labels_encoded.reshape(-1, 1))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def pad_to_width(rows, width):
    """Zero-pad each feature row on the right to `width` and stack the result.

    Rows may be lists or NumPy arrays. Each one is flattened and copied into the
    left part of an all-zero row, so the result is always a 2-D float64 array of
    shape (len(rows), width).
    """
    padded = np.zeros((len(rows), width), dtype=np.float64)
    for position, row in enumerate(rows):
        values = np.asarray(row, dtype=np.float64).ravel()
        padded[position, :values.size] = values
    return padded


# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels_onehot, test_size=0.2, random_state=42)

# Take the width from both splits so that a longer test row cannot be cut off.
# `row + [0] * k` is not used for padding: on NumPy rows `+` adds element-wise
# instead of concatenating.
max_len = max(np.size(x) for split in (X_train, X_test) for x in split)
X_train = pad_to_width(X_train, max_len)
X_test = pad_to_width(X_test, max_len)

In [None]:
# show the container type of the first sample in each split
for split in (X_train, X_test):
    print(type(split[0]))

In [None]:

# random forest with a fixed seed, fitted on the training split
rf_params = {"n_estimators": 200, "random_state": 42}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train.tolist(), y_train.tolist())

In [None]:
# print the test-label shape; the test-feature shape is the cell's displayed value
print(np.shape(y_test))
X_test.shape

In [None]:
# shape of the training feature matrix, shown as the cell output
X_train.shape

In [None]:
# c = 0
# for i, x in enumerate(X_test):
#     if isinstance(x, list):
#         c+=1
#         print(f"Element {i} is a list: {x}")
#         if c>10:
#             break

In [None]:
# import itertools

# # flatten nested list
# X_test_flat = list(itertools.chain.from_iterable(X_test))

# # convert flattened list to numpy array
# X_test_flat_arr = np.array(X_test_flat)

# # reshape the array to the original shape of X_test
# X_test_arr = X_test_flat_arr.reshape(X_test.shape)

In [None]:
# X_test = np.expand_dims(X_test, axis=1)

In [None]:
# X_test_flat = []
# for sample in X_test:
#     if isinstance(sample, list):
#         X_test_flat.append(np.array(sample).flatten())
#     else:
#         X_test_flat.append(sample.flatten())

In [None]:
# X_test_flat_arr = np.array(X_test_flat).reshape(len(X_test_flat), -1)

In [None]:
# X_test_flat_arr = np.array(X_test_flat).astype('float32')

In [None]:
# predict labels for the held-out samples with the random forest
y_pred = clf.predict(X_test)

# score the predictions and report the result as a percentage
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

In [None]:
# predict on the test features, cast to float64 first
print("Making predictions...")
X_test_f64 = X_test.astype(np.float64)
y_pred = clf.predict(X_test_f64)

# score the predictions against the test labels
print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# predicting the native language from an audio file
# selecting random recordings from the recordings folder


### Model 2
Gradient-boosted decision trees (scikit-learn `GradientBoostingClassifier`)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# collapse each one-hot training label row to the index of its largest entry
y_train_ids = np.argmax(y_train, axis=1).flatten()

# fit a gradient boosting classifier on the training data
clf1 = GradientBoostingClassifier(n_estimators=100, subsample=0.8, random_state=42)
clf1.fit(X_train, y_train_ids)

In [None]:
# make predictions on the testing data; clf1 was fitted on integer class ids,
# so it predicts integer class ids
y_pred = clf1.predict(X_test)

# y_test is one-hot encoded. Convert it to class ids in the same way the
# training labels were converted, so that accuracy_score compares like with
# like instead of receiving two different target formats.
y_test_ids = np.argmax(y_test, axis=1)

# calculate accuracy of the model on the testing data
accuracy = accuracy_score(y_test_ids, y_pred)
print(f"Accuracy: {accuracy}")