In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display as dis
from scipy.io import wavfile
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import os

### Input data

In [None]:
# Locate the 3-second feature table under <working directory>/Input/Data and load it.
pdir = os.getcwd()
relative_parts = ("Input", "Data", "features_3_sec.csv")
pathTo3SecFeatures = os.path.join(pdir, *relative_parts)
input_data = pd.read_csv(pathTo3SecFeatures)

In [None]:
# List of features:
# Column labels at positions 2 through 58 (the slice end, 59, is exclusive).
# Presumably these are the feature columns — TODO confirm against the CSV layout.
input_data.columns[2:59]

### Exploratory data analysis

- Descriptive statistics

In [None]:
# Summary statistics of the loaded table (describe() covers numeric columns by default).
input_data.describe()

- Correlation computation

In [None]:
# Correlation between the 'mfcc1_mean' and 'mfcc2_mean' columns
# (Series.corr computes the Pearson coefficient by default).
mfcc1_mean, mfcc2_mean = input_data['mfcc1_mean'], input_data['mfcc2_mean']
mfcc1_mean.corr(mfcc2_mean)

- Null values inspection

In [None]:
# Number of missing (null) entries in each column.
input_data.isnull().sum()

### Data preprocessing

In [None]:
# Remove duplicate rows (drop_duplicates keeps the first occurrence by default)
# and rebind input_data to the de-duplicated frame.
input_data = input_data.drop_duplicates()

In [None]:
# Visualizing outliers using boxplot
# Example: chroma_stft_mean — a horizontal box plot of this single column;
# points drawn beyond the whiskers are the outlier candidates.
sns.boxplot(x=input_data['chroma_stft_mean'])

In [None]:
# Detect outliers and replace them with the lower/upper whisker.
# Method used: interquartile range (IQR). For every non-object column, values below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR are clamped to the respective whisker.
# Missing values are left untouched.
whisker_width = 1.5
for col_name in input_data.select_dtypes(exclude=['object']).columns:
    Q1 = input_data[col_name].quantile(0.25)
    Q3 = input_data[col_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - (whisker_width * IQR)
    upper_whisker = Q3 + (whisker_width * IQR)
    # Series.clip performs the clamp in one vectorised step; the previous per-value
    # Python loop only maintained a counter that was never read.
    input_data[col_name] = input_data[col_name].clip(lower=lower_whisker, upper=upper_whisker)

In [None]:
# Persist the cleaned table under <pdir>/Preprocessed Data.
# The directory is created on demand so a fresh checkout does not fail here.
# The DataFrame index is deliberately still written (pandas default): it becomes an
# extra leading column on reload, and the positional iloc slices used after the
# reload (3:60 and 60) presumably rely on that offset — confirm before changing.
preprocessed_dir = os.path.join(pdir, "Preprocessed Data")
os.makedirs(preprocessed_dir, exist_ok=True)
input_data.to_csv(os.path.join(preprocessed_dir, r'preprocessed_data_3_secs.csv'))

In [None]:
# Reload the table that was just written to "Preprocessed Data"; input_data is rebound
# to the version read back from disk.
# NOTE(review): the file was written together with the DataFrame index, so the reloaded
# frame presumably has one extra leading column compared with the raw CSV — confirm via .shape.
pathTo3SecFeaturesPreprocessed = os.path.join(pdir,"Preprocessed Data","preprocessed_data_3_secs.csv")
input_data = pd.read_csv(pathTo3SecFeaturesPreprocessed)

### Creating labels, Scaling data and Splitting training and testing data

In [None]:
# (rows, columns) of the reloaded table — a quick check before columns are picked by position.
input_data.shape

In [None]:
# Feature columns are taken by position (3..59); the column at position 60 is the target
# and must be named 'label', since it is accessed as labels.label.
select_cols = input_data.iloc[:, 3:60]

# Creating labels
# .copy() makes `labels` an independent frame, so adding a column below does not
# write into a slice of input_data (avoids pandas' SettingWithCopyWarning).
labels = input_data.iloc[:, [60]].copy()

# Encode the string labels as integers 0..n_classes-1; le.classes_ keeps the
# integer -> name mapping (sorted order) for later reporting.
le = preprocessing.LabelEncoder()
labels['categorical_label'] = le.fit_transform(labels.label)

In [None]:
# Rescale every selected feature column with MinMaxScaler (default range [0, 1])
# and wrap the resulting array in a DataFrame that keeps the original column names.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit(select_cols).transform(select_cols)
X_scaled = pd.DataFrame(data=np_scaled, columns=select_cols.columns)

In [None]:
# Split into training (67%) and testing (33%).
# random_state pins the shuffle so every metric below is reproducible under
# Restart & Run All instead of changing on each execution.
# NOTE(review): the split uses the unscaled select_cols, not X_scaled — confirm this is
# intended for the baseline models (the PCA section standardises separately).
X_train, X_test, y_train, y_test = train_test_split(
    select_cols, labels['categorical_label'], test_size=0.33, random_state=0
)

In [None]:
# Display the feature frame (the columns taken from positions 3..59 of input_data).
select_cols

### Establishing baseline performance

In [None]:
# Random forest classifier — baseline with shallow trees (max_depth=2) and a fixed seed.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ is ordered exactly like the encoded integers (sorted), so each report row
# gets the right name. labels.label.unique() is in order of first appearance and only
# matched when the CSV happened to be sorted by label.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

In [None]:
# K-nearest-neighbours baseline with scikit-learn's default settings.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
predict_y = knn_clf.predict(X_test)
y_pred_train = knn_clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

In [None]:
# Gaussian naive Bayes baseline with default settings.
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
predict_y = nb_clf.predict(X_test)
y_pred_train = nb_clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

### Performing PCA

In [None]:
# PCA: standardise the features, then project them onto the principal components
# that together explain 95% of the variance.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# The targets are not transformed; alias them so the post-PCA cells use *_pca names throughout.
y_train_pca, y_test_pca = y_train, y_test

# Learn mean/variance on the training split only, then apply the same scaling to both splits.
scaler = StandardScaler()
train_standardised = scaler.fit_transform(X_train)
test_standardised = scaler.transform(X_test)

# A float n_components in (0, 1) asks PCA for the smallest number of components
# reaching that share of explained variance.
pca = PCA(.95)
pca.fit(train_standardised)

X_train_pca = pca.transform(train_standardised)
X_test_pca = pca.transform(test_standardised)

#### Classification after PCA

In [None]:
# Random Forest after PCA — deeper trees (max_depth=10) trained on the PCA components.
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train_pca, y_train_pca)
predict_y = clf.predict(X_test_pca)

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test_pca, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test_pca, predict_y)
plt.show()

In [None]:
# K-nearest neighbours after PCA.
# Note: this rebinds knn_clf to a model fitted on PCA components, not on the raw features.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_pca, y_train_pca)
predict_y = knn_clf.predict(X_test_pca)

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test_pca, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test_pca, predict_y)
plt.show()

In [None]:
# Gaussian naive Bayes after PCA.
nb_clf = GaussianNB()
nb_clf.fit(X_train_pca, y_train_pca)
predict_y = nb_clf.predict(X_test_pca)
y_pred_train = nb_clf.predict(X_train_pca)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train_pca,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test_pca,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test_pca, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test_pca, predict_y)
plt.show()

### XGBClassifier

In [None]:
# XGBClassifier with library defaults, trained on the (non-PCA) feature split.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
predict_y = xgb_clf.predict(X_test)
y_pred_train = xgb_clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### Recursive feature elimination (RFE) on XGBClassifier

In [None]:
from sklearn.feature_selection import RFECV,mutual_info_regression

# Recursive feature elimination with 5-fold cross-validation: one feature is removed per
# step and each candidate subset is scored by accuracy using an XGBClassifier.
estimator = XGBClassifier(eval_metric='merror')
rfecv = RFECV(estimator=estimator, step=1, cv=5, scoring='accuracy', verbose=1)
rfecv.fit(X_train, y_train)

# support_ is a boolean mask of the retained features; its complement gives the
# positions of the features RFECV discarded.
features_drop_array = list(np.flatnonzero(~rfecv.support_))
X_train.columns[features_drop_array]

##### Features dropped by RFECV:

    ['zero_crossing_rate_var', 'mfcc11_var', 'mfcc13_var', 'mfcc14_var',
       'mfcc15_var', 'mfcc16_var', 'mfcc17_var', 'mfcc18_var', 'mfcc19_mean',
       'mfcc20_mean']

In [None]:
# Remove the features RFECV rejected from both splits.
# The positions are resolved to column names once, and the frames are rebound rather than
# mutated in place: both splits are guaranteed to lose the same named columns, and pandas
# does not warn about modifying a frame derived from another one.
dropped_feature_names = X_train.columns[features_drop_array]
X_train = X_train.drop(columns=dropped_feature_names)
X_test = X_test.drop(columns=dropped_feature_names)

##### Running XGBClassifier after dropping features

In [None]:
# XGBClassifier retrained on the reduced feature set with 1000 boosting rounds.
# eval_metric is given to the constructor: passing it to fit() is deprecated and rejected
# by recent xgboost releases (the RFE cell already uses the constructor form).
xgb_clf = XGBClassifier(n_estimators=1000, eval_metric='merror')
xgb_clf.fit(X_train, y_train)
predict_y = xgb_clf.predict(X_test)
y_pred_train = xgb_clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### Tuning the hyperparameters of XGBClassifier

In [None]:
# Hyperparameter tuning of XGBClassifier with hyperopt (TPE search).
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Search space: both values are sampled as whole-number floats and cast to int in the objective.
# n_estimators starts at 1 because a model with zero boosting rounds is meaningless.
space={
    'n_estimators': hp.quniform('n_estimators', 1,3000,1),
    'reg_lambda' : hp.quniform('reg_lambda', 0,500,1),
    }

# Hold out part of the TRAINING data for early stopping and model selection.
# Tuning against X_test would leak the test set into the chosen hyperparameters and
# make the final test accuracy optimistic.
X_tune, X_val, y_tune, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train
)

def objective(space):
    """Train one XGBClassifier candidate and return its hyperopt loss.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with keys 'n_estimators' and 'reg_lambda'.

    Returns
    -------
    dict
        {'loss': negative validation accuracy, 'status': STATUS_OK}; hyperopt minimises
        the loss, so maximising accuracy means minimising its negative.
    """
    # eval_metric / early_stopping_rounds belong to the constructor in current xgboost;
    # passing them to fit() is deprecated and rejected by recent releases.
    clf=XGBClassifier(
                    n_estimators =int(space['n_estimators']),
                    reg_lambda = int(space['reg_lambda']),
                    eval_metric="auc",
                    early_stopping_rounds=10,
                    )

    # Early stopping monitors the last entry of eval_set, i.e. the validation split.
    evaluation = [( X_tune, y_tune), ( X_val, y_val)]

    clf.fit(X_tune, y_tune, eval_set=evaluation, verbose=False)

    pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, pred)
    return {'loss': -accuracy, 'status': STATUS_OK }


trials = Trials()
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print(f"best params: {best_hyperparams}")

##### Running XGBClassifier using best params : 'n_estimators': 1659.0, 'reg_lambda': 92.0

In [None]:
# XGBClassifier refitted with the hyperparameters reported by the search
# (n_estimators=1659, reg_lambda=92.0).
# eval_metric is given to the constructor: passing it to fit() is deprecated and rejected
# by recent xgboost releases.
xgb_clf = XGBClassifier(n_estimators=1659, reg_lambda=92.0, eval_metric='merror')
xgb_clf.fit(X_train, y_train)
predict_y = xgb_clf.predict(X_test)
y_pred_train = xgb_clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

#### Permutation Importance Feature Selection on knn_clf

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Start again from the preprocessed table. The previously used name `file_path_3` is not
# defined anywhere in this notebook; the preprocessed CSV path defined earlier is the file
# whose layout matches the positional slices below.
input_data = pd.read_csv(pathTo3SecFeaturesPreprocessed)
select_cols = input_data.iloc[:, 3:60]

# Creating labels (.copy() so the new column is not written into a slice of input_data)
labels = input_data.iloc[:, [60]].copy()
le = preprocessing.LabelEncoder()
labels['categorical_label'] = le.fit_transform(labels.label)

# Scaled data: MinMax-scale every feature column
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(select_cols)
X_scaled = pd.DataFrame(np_scaled, columns = select_cols.columns)

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, labels['categorical_label'], test_size=0.33)

seed = 12

# PermutationImportance (default cv='prefit') scores the estimator as given, so it must be
# fitted on the same feature space it is evaluated on. The knn_clf left over from earlier
# cells was trained on PCA components, not on these scaled features — refit it here.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# n_iter (number of shuffles per feature) is a constructor argument; extra keyword
# arguments to fit() are estimator fit-params and are not used in prefit mode.
perm = PermutationImportance(knn_clf, random_state=seed, n_iter=10).fit(X_train, y_train)
print("Feature Importances using Permutation Importance")
eli5.show_weights(perm, feature_names = X_train.columns.tolist())

In [None]:
# Horizontal bar chart of the permutation importances, most important feature at the top.
perm_indices = np.argsort(perm.feature_importances_)[::-1]   # descending importance
all_feature_names = X_train.columns.tolist()
perm_features = [all_feature_names[idx] for idx in perm_indices]

n_features = X_train.shape[1]
bar_positions = range(n_features)

fig, ax = plt.subplots(figsize=(14, 14))
ax.set_title("Knn feature importance via permutation importance")
ax.barh(bar_positions, perm.feature_importances_[perm_indices])
ax.set_yticks(bar_positions)
ax.set_yticklabels(perm_features)
# Reversed limits flip the y-axis so position 0 (largest importance) is drawn first.
ax.set_ylim([n_features, -1])
plt.show()

### Trying a few more classifiers

In [None]:
# Rebuild features, labels and a fresh train/test split from the preprocessed table.
# The previously used name `file_path_3` is not defined anywhere in this notebook; the
# preprocessed CSV path defined earlier is the file whose layout matches the slices below.
input_data = pd.read_csv(pathTo3SecFeaturesPreprocessed)
select_cols = input_data.iloc[:, 3:60]

# Creating labels (.copy() so the new column is not written into a slice of input_data)
labels = input_data.iloc[:, [60]].copy()
le = preprocessing.LabelEncoder()
labels['categorical_label'] = le.fit_transform(labels.label)

# Scaled data: MinMax-scale every feature column (non-negative output, which the
# MultinomialNB cell further down requires).
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(select_cols)
X_scaled = pd.DataFrame(np_scaled, columns = select_cols.columns)

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, labels['categorical_label'], test_size=0.33)

##### SVM

In [None]:
# Support vector classifier with a linear kernel.
from sklearn.svm import SVC
clf = SVC(kernel='linear')

clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### MultinomialNB

In [None]:
# Multinomial naive Bayes with default settings.
# It requires non-negative features, so it must be trained on the MinMax-scaled split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### LogisticRegression with L1 and L2

In [None]:
from sklearn.linear_model import LogisticRegression

# Compare L1 and L2 regularisation at three inverse-regularisation strengths C.
# For each C both models are fitted first (L1, then L2), then their train/test
# accuracies are printed.
for c in [0.01, 0.10, 1.00]:
    fitted_by_penalty = {
        penalty: LogisticRegression(C=c, penalty=penalty, tol=0.01, solver="saga").fit(X_train, y_train)
        for penalty in ("l1", "l2")
    }
    print('C = ', c)
    for penalty, model in fitted_by_penalty.items():
        tag = penalty.upper()
        train_accuracy = accuracy_score(y_train, model.predict(X_train))
        test_accuracy = accuracy_score(y_test, model.predict(X_test))
        print(f'Training accuracy {tag}: {train_accuracy}')
        print(f'Testing accuracy {tag}: {test_accuracy}')

### RandomForestClassifier (with hypertuning)

##### RandomForestClassifier (max_depth=2)

In [None]:
# Random forest classifier — shallow trees (max_depth=2) with a fixed seed.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### RandomForestClassifier (without any parameters)

In [None]:
# Random forest classifier with scikit-learn's default settings (unseeded, so results
# vary slightly between runs).
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()

##### Tuning the hyperparameters of RandomForestClassifier

In [None]:
# Hyperparameter tuning of RandomForestClassifier with hyperopt (TPE search).
# Relies on hp, fmin, tpe, Trials and STATUS_OK imported in the XGBClassifier tuning cell.
from sklearn.model_selection import cross_val_score

# define parameter space
# Note: for hp.choice parameters, fmin reports the INDEX of the chosen option, not its value.
space = {
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
    "max_depth": hp.quniform("max_depth", 1, 15,1),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

# define objective function
def hyperparameter_tuning(params):
    """Score one RandomForestClassifier candidate by cross-validated accuracy.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys 'n_estimators', 'max_depth' and 'criterion'.

    Returns
    -------
    dict
        {'loss': negative mean CV accuracy, 'status': STATUS_OK}; hyperopt minimises the loss.
    """
    # hp.quniform yields floats (e.g. 14.0) but max_depth must be an integer — recent
    # scikit-learn versions reject a float here. Build a new dict so the sampled
    # params object is not mutated.
    params = {**params, "max_depth": int(params["max_depth"])}
    clf = RandomForestClassifier(**params,n_jobs=-1)
    acc = cross_val_score(clf, X_train, y_train,scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

# Fine tune the model
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space = space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best: {}".format(best))

##### RandomForestClassifier with the best parameters: {'criterion': 1, 'max_depth': 14.0, 'n_estimators': 2} (`hp.choice` reports option indices: criterion 1 → 'entropy', n_estimators 2 → 300)

In [None]:
# Random forest classifier refitted with the tuned settings
# (300 trees, entropy criterion, max_depth 14).
clf = RandomForestClassifier(n_estimators= 300, criterion= 'entropy', max_depth= 14, )
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# Train vs. test accuracy side by side to expose over-/under-fitting.
print(f'Training accuracy: {accuracy_score(y_train,y_pred_train)}')
print(f'Testing accuracy: {accuracy_score(y_test,predict_y)}')

# le.classes_ follows the encoded integer order (sorted); labels.label.unique() is in
# order of first appearance and could attach the wrong name to a report row.
print(classification_report(y_test, predict_y, target_names=le.classes_))
ConfusionMatrixDisplay.from_predictions(y_test, predict_y)
plt.show()