# Novelty Detection

## Part 1
Deep purple & Johnny Cash samples

In [197]:
# Part 1

from pandas import DataFrame
import pandas as pd

dp_jc_df = pd.read_csv('./dp_jc_samples.csv')
jc_dp_df = pd.read_csv('./jc_dp_samples.csv')

# Shuffle rows as for each csv file, novelty rows are appended to the end of dataset
# dp_jc_ds = dp_jc_df.sample(frac=1)
dp_jc_ds = dp_jc_df
# jc_dp_ds = jc_dp_df.sample(frac=1)
jc_dp_ds = jc_dp_df

# Drop columns 'Class' and 'Path'
dp_jc_ds_cleaned = dp_jc_ds.drop(['Path', 'Class'], axis=1)
jc_dp_ds_cleaned = jc_dp_ds.drop(['Path', 'Class'], axis=1)

dp_jc_train_samples = dp_jc_ds_cleaned.values.tolist()
jc_dp_train_samples = jc_dp_ds_cleaned.values.tolist()

In [None]:
from collections import Counter
from sklearn import svm
import numpy as np

estimator = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

# Deep Purple with Johnny Cash novelty samples
estimator.fit(dp_jc_train_samples)
dp_jc_pred = estimator.predict(dp_jc_train_samples)

print("Deep Purple - Johnny Cash Outlier Analysis:")
dp_jc_outlier_count = Counter(dp_jc_pred)
print("Outlier Count: ", dp_jc_outlier_count[-1])
print("Inlier Count: ", dp_jc_outlier_count[1])

# Add outliers data as last column to data sample
dp_jc_ds["Outliers"] = dp_jc_pred

# Get only Johnny Cash rows (should be 10 rows)
jc_samples = dp_jc_ds.loc[dp_jc_ds["Class"] == 'Johnny_cash']

for sample in jc_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

jc_outliers_count = len(jc_samples.loc[jc_samples["Outliers"] == -1])

print("\nOut of 10 Johnny Cash samples, {} are outliers".format(jc_outliers_count))
print("Detection rate: {}%".format((jc_outliers_count/10)*100))


Deep Purple - Johnny Cash Outlier Analysis:
Outlier Count:  66
Inlier Count:  44
Johnny_cash/01 - Cold Lonesome Morning.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Flesh and Blood.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - From Sea To Shining Sea.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Gone Girl.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Hiawathas Vision.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Let There Be Country.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Rollin Free.mp3.wav ---> Outlier (-1)
Johnny_cash/01 - Southwind.mp3.wav ---> Outlier (-1)
Johnny_cash/02 - Ballad of Little Fauss and Big Halsy.mp3.wav ---> Inlier (1)
Johnny_cash/02 - I Walk the Line (Full Version).mp3.wav ---> Inlier (1)

Out of 10 Johnny Cash samples, 8 are outliers
Detection rate: 80.0%


Unnamed: 0,Path,Class,Spectral Centroid Overall Average 1,Spectral Centroid Overall Standard Deviation 1,Spectral Rolloff Point Overall Average 1,Spectral Rolloff Point Overall Standard Deviation 1,Spectral Flux Overall Average 1,Spectral Flux Overall Standard Deviation 1,Compactness Overall Average 1,Compactness Overall Standard Deviation 1,...,Method of Moments Overall Average 2,Method of Moments Overall Average 3,Method of Moments Overall Average 4,Method of Moments Overall Average 5,Method of Moments Overall Standard Deviation 1,Method of Moments Overall Standard Deviation 2,Method of Moments Overall Standard Deviation 3,Method of Moments Overall Standard Deviation 4,Method of Moments Overall Standard Deviation 5,Outliers
100,Johnny_cash/01 - Cold Lonesome Morning.mp3.wav,Johnny_cash,30.2,15.4,0.213,0.139,0.00147,0.00196,1658,170.0,...,0.291,74.0,18770,4775000,0.136,0.316,80.0,20140,5102000,-1
101,Johnny_cash/01 - Flesh and Blood.mp3.wav,Johnny_cash,20.8,13.3,0.146,0.126,0.00167,0.00174,1687,184.0,...,0.143,36.5,9274,2362000,0.179,0.173,44.0,11140,2832000,-1
102,Johnny_cash/01 - From Sea To Shining Sea.mp3.wav,Johnny_cash,20.6,13.2,0.125,0.12,0.000398,0.00046,1637,165.0,...,0.0935,23.8,6053,1542000,0.0926,0.15,38.1,9644,2449000,-1
103,Johnny_cash/01 - Gone Girl.mp3.wav,Johnny_cash,27.1,13.1,0.171,0.109,0.00272,0.00263,1684,161.0,...,0.222,56.4,14290,3631000,0.224,0.393,96.7,22990,5685000,-1
104,Johnny_cash/01 - Hiawathas Vision.mp3.wav,Johnny_cash,18.0,15.2,0.105,0.125,0.00223,0.00369,1598,185.0,...,0.0946,24.1,6125,1560000,0.186,0.156,39.5,10000,2541000,-1
105,Johnny_cash/01 - Let There Be Country.mp3.wav,Johnny_cash,19.1,8.25,0.119,0.0761,0.000497,0.00068,1654,175.0,...,0.122,31.1,7916,2017000,0.11,0.114,29.1,7392,1881000,-1
106,Johnny_cash/01 - Rollin Free.mp3.wav,Johnny_cash,19.7,11.6,0.135,0.123,0.00158,0.00189,1689,228.0,...,0.108,27.6,7026,1790000,0.172,0.134,34.0,8623,2193000,-1
107,Johnny_cash/01 - Southwind.mp3.wav,Johnny_cash,22.3,16.4,0.161,0.143,0.000597,0.000809,1628,189.0,...,0.195,49.8,12660,3223000,0.0965,0.169,43.0,10890,2770000,-1
108,Johnny_cash/02 - Ballad of Little Fauss and Bi...,Johnny_cash,19.3,12.5,0.137,0.122,0.00167,0.00315,1599,192.0,...,0.135,34.3,8737,2226000,0.216,0.103,26.1,6640,1690000,1
109,Johnny_cash/02 - I Walk the Line (Full Version...,Johnny_cash,14.4,10.9,0.0866,0.0908,0.00126,0.00201,1657,227.0,...,0.1,25.5,6477,1649000,0.189,0.147,37.5,9503,2417000,1


In [None]:
# Johnny Cash with Deep Purple novelty samples
estimator.fit(jc_dp_train_samples)
jc_dp_pred = estimator.predict(jc_dp_train_samples)

# Add outliers data as last column to data sample
jc_dp_ds["Outliers"] = jc_dp_pred

print("\nJohnny Cash - Deep Purple Outlier Analysis:")
jc_dp_outlier_count = Counter(jc_dp_pred)
print("Outlier Count: ", jc_dp_outlier_count[-1])
print("Inlier Count: ", jc_dp_outlier_count[1])

# Get only Deep Purple rows (should be 10 rows)
dp_samples = jc_dp_ds.loc[jc_dp_ds["Class"] == 'deep_purple']

for sample in dp_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

dp_outliers_count = len(dp_samples.loc[dp_samples["Outliers"] == -1])

print("\nOut of 10 Deep Purple samples, {} are outliers".format(dp_outliers_count))
print("Detection rate: {}%".format((dp_outliers_count/10)*100))


Johnny Cash - Deep Purple Outlier Analysis:
Outlier Count:  66
Inlier Count:  44
deep_purple/01 - Highway Star.mp3.wav ---> Outlier (-1)
deep_purple/02 - Maybe I m A Leo.mp3.wav ---> Outlier (-1)
deep_purple/03 - Pictures Of Hom.mp3.wav ---> Outlier (-1)
deep_purple/04 - Never Before.mp3.wav ---> Outlier (-1)
deep_purple/05 - Smoke On The Wa.mp3.wav ---> Outlier (-1)
deep_purple/06 - Lazy.mp3.wav ---> Outlier (-1)
deep_purple/07 - Space Truckin.mp3.wav ---> Outlier (-1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 01 - COMIN HOME.mp3.wav ---> Outlier (-1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 02 - LADY LUCK.mp3.wav ---> Inlier (1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 03 - GETTIN TIGHTER.mp3.wav ---> Inlier (1)

Out of 10 Deep Purple samples, 8 are outliers
Detection rate: 80.0%


Unnamed: 0,Path,Class,Spectral Centroid Overall Average 1,Spectral Centroid Overall Standard Deviation 1,Spectral Rolloff Point Overall Average 1,Spectral Rolloff Point Overall Standard Deviation 1,Spectral Flux Overall Average 1,Spectral Flux Overall Standard Deviation 1,Compactness Overall Average 1,Compactness Overall Standard Deviation 1,...,Method of Moments Overall Average 2,Method of Moments Overall Average 3,Method of Moments Overall Average 4,Method of Moments Overall Average 5,Method of Moments Overall Standard Deviation 1,Method of Moments Overall Standard Deviation 2,Method of Moments Overall Standard Deviation 3,Method of Moments Overall Standard Deviation 4,Method of Moments Overall Standard Deviation 5,Outliers
100,deep_purple/01 - Highway Star.mp3.wav,deep_purple,32.8,16.4,0.24,0.179,0.00126,0.00128,1580,156.0,...,0.531,135.0,34140,8675000,0.129,0.449,114.0,28530.0,7220000.0,-1
101,deep_purple/02 - Maybe I m A Leo.mp3.wav,deep_purple,26.1,23.3,0.192,0.206,0.000425,0.000573,1582,172.0,...,0.498,126.0,31960,8115000,0.109,0.535,135.0,33730.0,8518000.0,-1
102,deep_purple/03 - Pictures Of Hom.mp3.wav,deep_purple,29.3,15.9,0.203,0.183,0.000658,0.000714,1580,164.0,...,0.474,121.0,30530,7762000,0.103,0.381,96.3,24160.0,6114000.0,-1
103,deep_purple/04 - Never Before.mp3.wav,deep_purple,29.6,18.8,0.219,0.207,0.000697,0.000911,1598,166.0,...,0.502,128.0,32310,8209000,0.117,0.466,118.0,29530.0,7469000.0,-1
104,deep_purple/05 - Smoke On The Wa.mp3.wav,deep_purple,18.8,14.3,0.122,0.145,0.000661,0.00098,1541,171.0,...,0.414,105.0,26660,6779000,0.123,0.356,90.2,22710.0,5755000.0,-1
105,deep_purple/06 - Lazy.mp3.wav,deep_purple,29.6,27.5,0.186,0.179,0.000131,0.00021,1633,203.0,...,0.218,55.3,13990,3552000,0.132,0.44,111.0,27550.0,6941000.0,-1
106,deep_purple/07 - Space Truckin.mp3.wav,deep_purple,27.0,18.8,0.2,0.184,0.000692,0.000879,1560,161.0,...,0.439,112.0,28260,7185000,0.103,0.375,94.9,23880.0,6049000.0,-1
107,deep_purple/DEEP PURPLE - COME TASTE THE BAND ...,deep_purple,26.0,9.0,0.176,0.0699,0.00274,0.00214,1570,138.0,...,0.153,39.1,9940,2532000,0.153,0.12,30.6,7757.0,1974000.0,-1
108,deep_purple/DEEP PURPLE - COME TASTE THE BAND ...,deep_purple,26.1,9.69,0.185,0.0868,0.00245,0.00223,1574,144.0,...,0.245,62.3,15830,4031000,0.175,0.207,52.5,13290.0,3376000.0,1
109,deep_purple/DEEP PURPLE - COME TASTE THE BAND ...,deep_purple,30.4,15.6,0.231,0.154,0.00245,0.00251,1589,149.0,...,0.318,80.8,20530,5225000,0.2,0.249,63.2,15970.0,4053000.0,1


In [194]:
gammas = [0.05, 0.2, 0.3, 0.45, 0.63, 0.78]

predictions: dict = {"Gamma": [], "Musician 1 among musician 2": [], "Musician 2 among musician 1": []}
for gamma in gammas:
    # Add gammas to gamma list
    predictions["Gamma"].append(gamma)

    # Instantiate estimator
    estimator = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=gamma)
    
    # Predict JC outliers in DP data set
    estimator.fit(dp_jc_train_samples)
    dp_jc_pred = estimator.predict(dp_jc_train_samples)

    # Add outliers as column in data sample
    dp_jc_ds["Outliers"] = dp_jc_pred

    # Get only Johnny Cash rows (should be 10 rows)
    jc_samples = dp_jc_ds.loc[dp_jc_ds["Class"] == 'Johnny_cash']

    # Get outlier count
    jc_outliers_count = len(dp_samples.loc[jc_samples["Outliers"] == -1])

    predictions["Musician 2 among musician 1"].append(jc_outliers_count)

    # Predict DP outliers in JC data set
    estimator.fit(jc_dp_train_samples)
    jc_dp_pred = estimator.predict(jc_dp_train_samples)

    # Add outliers as column in data sample
    jc_dp_ds["Outliers"] = jc_dp_pred

    # Get only Deep Purple rows (should be 10 rows)
    dp_samples = jc_dp_ds.loc[jc_dp_ds["Class"] == 'deep_purple']

    dp_outliers_count = len(dp_samples.loc[dp_samples["Outliers"] == -1])

    predictions["Musician 1 among musician 2"].append(dp_outliers_count)

output_df = DataFrame(data=predictions)
output_df.style.set_caption("Predictions with Variable Gammas")

Unnamed: 0,Gamma,Musician 1 among musician 2,Musician 2 among musician 1
0,0.05,8,8
1,0.2,8,8
2,0.3,8,8
3,0.45,8,8
4,0.63,8,8
5,0.78,8,8


In [198]:
# Repeat process from above (using same data) using LOF instead

from sklearn.neighbors import LocalOutlierFactor
from collections import Counter

lof = LocalOutlierFactor(novelty=True, n_neighbors=20)

# Deep Purple with Johnny Cash novelty samples
lof.fit(dp_jc_train_samples)
dp_jc_pred = lof.predict(dp_jc_train_samples)

print("Deep Purple - Johnny Cash Outlier Analysis:")
dp_jc_outlier_count = Counter(dp_jc_pred)
print("Outlier Count: ", dp_jc_outlier_count[-1])
print("Inlier Count: ", dp_jc_outlier_count[1])

# Add outliers data as last column to data sample
dp_jc_ds["Outliers"] = dp_jc_pred

# Get only Johnny Cash rows (should be 10 rows)
jc_samples = dp_jc_ds.loc[dp_jc_ds["Class"] == 'Johnny_cash']

for sample in jc_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

jc_outliers_count = len(jc_samples.loc[jc_samples["Outliers"] == -1])

print("\nOut of 10 Johnny Cash samples, {} are outliers".format(jc_outliers_count))
print("Detection rate: {}%".format((jc_outliers_count/10)*100))

Deep Purple - Johnny Cash Outlier Analysis:
Outlier Count:  12
Inlier Count:  98
Johnny_cash/01 - Cold Lonesome Morning.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Flesh and Blood.mp3.wav ---> Inlier (1)
Johnny_cash/01 - From Sea To Shining Sea.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Gone Girl.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Hiawathas Vision.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Let There Be Country.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Rollin Free.mp3.wav ---> Inlier (1)
Johnny_cash/01 - Southwind.mp3.wav ---> Inlier (1)
Johnny_cash/02 - Ballad of Little Fauss and Big Halsy.mp3.wav ---> Inlier (1)
Johnny_cash/02 - I Walk the Line (Full Version).mp3.wav ---> Inlier (1)

Out of 10 Johnny Cash samples, 0 are outliers
Detection rate: 0.0%


In [199]:
# Johnny Cash with Deep Purple novelty samples
lof.fit(jc_dp_train_samples)
jc_dp_pred = lof.predict(jc_dp_train_samples)

# Add outliers data as last column to data sample
jc_dp_ds["Outliers"] = jc_dp_pred

print("\nJohnny Cash - Deep Purple Outlier Analysis:")
jc_dp_outlier_count = Counter(jc_dp_pred)
print("Outlier Count: ", jc_dp_outlier_count[-1])
print("Inlier Count: ", jc_dp_outlier_count[1])

# Get only Deep Purple rows (should be 10 rows)
dp_samples = jc_dp_ds.loc[jc_dp_ds["Class"] == 'deep_purple']

for sample in dp_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

dp_outliers_count = len(dp_samples.loc[dp_samples["Outliers"] == -1])

print("\nOut of 10 Deep Purple samples, {} are outliers".format(dp_outliers_count))
print("Detection rate: {}%".format((dp_outliers_count/10)*100))


Johnny Cash - Deep Purple Outlier Analysis:
Outlier Count:  9
Inlier Count:  101
deep_purple/01 - Highway Star.mp3.wav ---> Outlier (-1)
deep_purple/02 - Maybe I m A Leo.mp3.wav ---> Outlier (-1)
deep_purple/03 - Pictures Of Hom.mp3.wav ---> Outlier (-1)
deep_purple/04 - Never Before.mp3.wav ---> Outlier (-1)
deep_purple/05 - Smoke On The Wa.mp3.wav ---> Inlier (1)
deep_purple/06 - Lazy.mp3.wav ---> Inlier (1)
deep_purple/07 - Space Truckin.mp3.wav ---> Inlier (1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 01 - COMIN HOME.mp3.wav ---> Inlier (1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 02 - LADY LUCK.mp3.wav ---> Inlier (1)
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 03 - GETTIN TIGHTER.mp3.wav ---> Inlier (1)

Out of 10 Deep Purple samples, 4 are outliers
Detection rate: 40.0%


In [200]:
num_of_neighbors = [25, 38, 46, 51, 64, 79]

predictions: dict = {"Number of Neighbors": [], "Musician 1 among musician 2": [], "Musician 2 among musician 1": []}

for num in num_of_neighbors:
    # Add gammas to gamma list
    predictions["Number of Neighbors"].append(num)

    # Instantiate estimator
    lof = LocalOutlierFactor(novelty=True, n_neighbors=num)
    
    # Predict JC outliers in DP data set
    lof.fit(dp_jc_train_samples)
    dp_jc_pred = lof.predict(dp_jc_train_samples)

    # Add outliers as column in data sample
    dp_jc_ds["Outliers"] = dp_jc_pred

    # Get only Johnny Cash rows (should be 10 rows)
    jc_samples = dp_jc_ds.loc[dp_jc_ds["Class"] == 'Johnny_cash']

    # Get outlier count
    jc_outliers_count = len(dp_samples.loc[jc_samples["Outliers"] == -1])

    predictions["Musician 2 among musician 1"].append(jc_outliers_count)

    # Predict DP outliers in JC data set
    estimator.fit(jc_dp_train_samples)
    jc_dp_pred = estimator.predict(jc_dp_train_samples)

    # Add outliers as column in data sample
    jc_dp_ds["Outliers"] = jc_dp_pred

    # Get only Deep Purple rows (should be 10 rows)
    dp_samples = jc_dp_ds.loc[jc_dp_ds["Class"] == 'deep_purple']

    dp_outliers_count = len(dp_samples.loc[dp_samples["Outliers"] == -1])

    predictions["Musician 1 among musician 2"].append(dp_outliers_count)

output_df = DataFrame(data=predictions)
output_df.style.set_caption("Predictions with Variable Gammas")

Unnamed: 0,Number of Neighbors,Musician 1 among musician 2,Musician 2 among musician 1
0,25,8,0
1,38,8,0
2,46,8,0
3,51,8,0
4,64,8,0
5,79,8,0


## Part 2
OneClassSVM provided better performance (LOF incorrectly implies that all novelty samples are inliers)

In [202]:
import pandas as pd

from collections import Counter
from sklearn import svm

dp_df = pd.read_csv('./deep_purple_data.csv')

# Drop columns 'Class' and 'Path'
dp_df_cleaned = dp_df.drop(['Path', 'Class'], axis=1)

dp_df_train_samples = dp_df_cleaned.values.tolist()

estimator = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
estimator.fit(dp_df_train_samples)
dp_pred = estimator.predict(dp_df_train_samples)

print("Deep Purple Outlier Analysis:")
dp_outlier_count = Counter(dp_pred)
print("Outlier Count: ", dp_outlier_count[-1])
print("Inlier Count: ", dp_outlier_count[1])

# Add outliers data as last column to data sample
dp_df["Outliers"] = dp_pred

for sample in dp_df.values.tolist():
    if (sample[-1] == -1):
        print(sample[0])

dp_outliers_count = len(dp_df.loc[dp_df["Outliers"] == -1])

print("\nOut of {} Deep Purple samples, {} are outliers".format(dp_pred.shape[0], dp_outliers_count))


Deep Purple Outlier Analysis:
Outlier Count:  107
Inlier Count:  50
deep_purple/DEEP PURPLE - COME TASTE THE BAND - 09 - YOU KEEP ON MOVING.mp3.wav
deep_purple/Deep Purple - Abandon - 01 - Any Fule Kno That.mp3.wav
deep_purple/Deep Purple - Abandon - 02 - Almost Human.mp3.wav
deep_purple/Deep Purple - Abandon - 03 - Dont Make Me Happy.mp3.wav
deep_purple/Deep Purple - Abandon - 04 - Seventh Heaven.mp3.wav
deep_purple/Deep Purple - Abandon - 05 - Watching The Sky.mp3.wav
deep_purple/Deep Purple - Abandon - 06 - Fingers To The Bone.mp3.wav
deep_purple/Deep Purple - Abandon - 07 - Jack Ruby.mp3.wav
deep_purple/Deep Purple - Abandon - 08 - She Was.mp3.wav
deep_purple/Deep Purple - Abandon - 09 - Whatsername.mp3.wav
deep_purple/Deep Purple - Abandon - 10 - 69.mp3.wav
deep_purple/Deep Purple - Abandon - 11 - Evil Louie.mp3.wav
deep_purple/Deep Purple - Abandon - 12 - Bludsucker.mp3.wav
deep_purple/Deep Purple - Bananas - 05 - Silver Tongue.mp3.wav
deep_purple/Deep Purple - Bananas - 06 - Wal

In [203]:
import pandas as pd

from collections import Counter
from sklearn import svm

jc_df = pd.read_csv('./johnny_cash_data.csv')

# Drop columns 'Class' and 'Path'
jc_df_cleaned = jc_df.drop(['Path', 'Class'], axis=1)

jc_df_train_samples = jc_df_cleaned.values.tolist()

estimator = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
estimator.fit(jc_df_train_samples)
jc_pred = estimator.predict(jc_df_train_samples)

print("Deep Purple Outlier Analysis:")
jc_outlier_count = Counter(jc_pred)
print("Outlier Count: ", jc_outlier_count[-1])
print("Inlier Count: ", jc_outlier_count[1])

# Add outliers data as last column to data sample
jc_df["Outliers"] = jc_pred

for sample in jc_df.values.tolist():
    if (sample[-1] == -1):
        print(sample[0])

jc_outlier_count = len(jc_df.loc[jc_df["Outliers"] == -1])

print("\nOut of {} Johnny Cash samples, {} are outliers".format(jc_pred.shape[0], jc_outlier_count))


Deep Purple Outlier Analysis:
Outlier Count:  77
Inlier Count:  30
Johnny_cash/02 - I Will Rock And Roll With You.mp3.wav
Johnny_cash/02 - One Piece At A Time.mp3.wav
Johnny_cash/02 - Paul Revere.mp3.wav
Johnny_cash/02 - The Devil To Pay.mp3.wav
Johnny_cash/02 - The Road to Kaintuck.mp3.wav
Johnny_cash/02 - Without Love.mp3.wav
Johnny_cash/03 - Ballad of Little Fauss and Big Halsy (Instrumental).mp3.wav
Johnny_cash/03 - Call Daddy From The Mines.mp3.wav
Johnny_cash/03 - Cause I Love You.mp3.wav
Johnny_cash/03 - Hungry.mp3.wav
Johnny_cash/03 - In A Young Girls Mind.mp3.wav
Johnny_cash/03 - The Diplomat.mp3.wav
Johnny_cash/03 - W-o-m-a-n.mp3.wav
Johnny_cash/04 - Cowboy Who Started The Fight.mp3.wav
Johnny_cash/04 - Mountain Lady.mp3.wav
Johnny_cash/04 - Narration.mp3.wav
Johnny_cash/04 - No Expectations.mp3.wav
Johnny_cash/04 - See Ruby Fall.mp3.wav
Johnny_cash/04 - The Frozen Four Hundred Pound Fair To Middlin Cotton Picker.mp3.wav
Johnny_cash/04 - The Road to Kaintuck.mp3.wav
Johnny_ca

## Part 3
Eminem & Black Sabath

In [204]:
from pandas import DataFrame
import pandas as pd

e_bs_df = pd.read_csv('./e_bs.csv')
bs_e_df = pd.read_csv('./bs_e.csv')

# Shuffle rows as for each csv file, novelty rows are appended to the end of dataset
# e_bs_ds = e_bs_df.sample(frac=1)
e_bs_ds = e_bs_df
# bs_e_ds = bs_e_df.sample(frac=1)
bs_e_ds = bs_e_df

# Drop columns 'Class' and 'Path'
e_bs_ds_cleaned = e_bs_ds.drop(['Path', 'Class'], axis=1)
bs_e_ds_cleaned = bs_e_ds.drop(['Path', 'Class'], axis=1)

e_bs_train_samples = e_bs_ds_cleaned.values.tolist()
bs_e_train_samples = bs_e_ds_cleaned.values.tolist()

In [205]:
from collections import Counter
from sklearn import svm

estimator = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

# Eminem with Black Sabbath novelty samples
estimator.fit(e_bs_train_samples)
e_bs_pred = estimator.predict(e_bs_train_samples)

print("Eminem - Black Sabath Outlier Analysis:")
e_bs_outlier_count = Counter(e_bs_pred)
print("Outlier Count: ", e_bs_outlier_count[-1])
print("Inlier Count: ", e_bs_outlier_count[1])

# Add outliers data as last column to data sample
e_bs_ds["Outliers"] = e_bs_pred

# Get only Black Sabath rows (should be 10 rows)
bs_samples = e_bs_ds.loc[e_bs_ds["Class"] == 'black_sabath']

for sample in bs_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

bs_outliers_count = len(bs_samples.loc[bs_samples["Outliers"] == -1])

print("\nOut of 10 Black Sabath samples, {} are outliers".format(bs_outliers_count))
print("Detection rate: {}%".format((bs_outliers_count/10)*100))

Eminem - Black Sabath Outlier Analysis:
Outlier Count:  73
Inlier Count:  37
black_sabath/02 - After Forever.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Behind The Wall Of Sleep.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Dont Start (Too Late).flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Fairies Wear Boots.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Hole In The Sky.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Johnny Blade.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Orchid-Lord Of This World.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Paranoid.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - Sweet Leaf.flac.mp3.wav ---> Outlier (-1)
black_sabath/02 - The Wizard.flac.mp3.wav ---> Outlier (-1)

Out of 10 Black Sabath samples, 10 are outliers
Detection rate: 100.0%


In [206]:
# Black Sabath with Eminem novelty samples
estimator.fit(bs_e_train_samples)
bs_e_pred = estimator.predict(bs_e_train_samples)

# Add outliers data as last column to data sample
bs_e_ds["Outliers"] = bs_e_pred

print("\nBlack Sabath - Eminem Outlier Analysis:")
bs_e_outlier_count = Counter(bs_e_pred)
print("Outlier Count: ", bs_e_outlier_count[-1])
print("Inlier Count: ", bs_e_outlier_count[1])

# Get only Deep Purple rows (should be 10 rows)
e_samples = bs_e_ds.loc[bs_e_ds["Class"] == 'Eminem']

for sample in e_samples.values.tolist():
    print("{} ---> {}".format(sample[0], "Outlier (-1)" if sample[-1] == -1 else "Inlier (1)"))

e_outliers_count = len(e_samples.loc[e_samples["Outliers"] == -1])

print("\nOut of 10 Deep Purple samples, {} are outliers".format(e_outliers_count))
print("Detection rate: {}%".format((e_outliers_count/10)*100))


Black Sabath - Eminem Outlier Analysis:
Outlier Count:  66
Inlier Count:  44
Eminem/9-07 I Am (Woogie Blend).mp3.wav ---> Outlier (-1)
Eminem/9-08 Bullys Pt.2.mp3.wav ---> Outlier (-1)
Eminem/9-09 Encore.mp3.wav ---> Outlier (-1)
Eminem/9-10 Mic Check (Crazy).mp3.wav ---> Outlier (-1)
Eminem/9-11 Freestyle.mp3.wav ---> Outlier (-1)
Eminem/9-12 Soilder (Remix).mp3.wav ---> Outlier (-1)
Eminem/9-13 Hellbound.mp3.wav ---> Outlier (-1)
Eminem/9-14 Oh Shit.mp3.wav ---> Outlier (-1)
Eminem/9-15 Pills.mp3.wav ---> Inlier (1)
Eminem/9-16 Came 2 Party.mp3.wav ---> Inlier (1)

Out of 10 Deep Purple samples, 8 are outliers
Detection rate: 80.0%
