In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import xgboost as xgb

In [4]:
df = pd.read_csv('data.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,name,tempo,beats,chroma_stft,rmse,spec_cent,spec_bw,rolloff,zcr,...,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,label
0,0,Amilcare Ponchielli - Dance of the Hours.mp3,130.813953,394,0.29234,0.042382,1637.235974,2228.71292,2925.815507,0.038808,...,4.918759,4.76719,3.798379,-2.531283,-0.907902,-2.287565,0.083561,-2.888623,-3.455121,Classical
1,1,Rossini - William Tell Overture - Final.mp3,160.714286,443,0.388742,0.057814,1357.984086,1875.937831,2628.860882,0.028938,...,22.16519,5.195708,9.989042,-3.362946,3.713702,-3.914152,-1.442259,-3.176913,-2.75651,Classical


In [5]:
df.label.unique()

array(['Classical', 'Electronic', 'Metal', 'Hip_Hop', 'Country', 'Jazz'],
      dtype=object)

In [6]:
df.columns

Index(['Unnamed: 0', 'name', 'tempo', 'beats', 'chroma_stft', 'rmse',
       'spec_cent', 'spec_bw', 'rolloff', 'zcr', 'mfcc_1', 'mfcc_2', 'mfcc_3',
       'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10',
       'mfcc_11', 'label'],
      dtype='object')

In [7]:
#Assign int to genres
genre_dict = {
    'Country' : 0,
    'Classical' : 1,
    'Metal' : 2,
    'Jazz' : 3,
    'Hip_Hop' : 4,
    'Electronic': 5,
}

X = df.drop(['Unnamed: 0', 'name', 'label'], axis=1).astype(float)
y = df.label.map(genre_dict)
#split data

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=10)

In [8]:
scaler = StandardScaler()

In [9]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.fit_transform(X)

In [10]:
#Logisitic Regression
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
log_score = np.mean(cross_val_score(logreg,X_scaled,y,cv=3))
print("Mean Cross Validation Score: ", log_score)

Mean Cross Validation Score:  0.7219640723063226


In [11]:
logreg.fit(X_train_scaled,y_train)
score = logreg.score(X_test_scaled, y_test)
print("Logistic Regression: ",score)

Logistic Regression:  0.654320987654321


In [12]:
#Decision Trees
dtc = DecisionTreeClassifier()
dtc_cv_score = np.mean(cross_val_score(dtc,X,y,cv=3))
dtc.fit(X_train,y_train)
dtc_acc_score = dtc.score(X_test,y_test)
print("Accuracy Score: ",dtc_acc_score)
print("Mean Cross Val Score: ",dtc_cv_score)

Accuracy Score:  0.5370370370370371
Mean Cross Val Score:  0.5459987159985731


In [None]:
# param_grid = {
#     'max_depth' : []
#     'min_samples_splt' : []
#     'min_samples_leaf' : []
# }

# grid_search = GridSearchCV(dtc, param_grid, cv=3, return_train_score=True)
# grid_search.fit(scaled_X_train,y_train)

In [None]:
# gs_train_score = np.mean(grid_search.cv_results_['mean_train_score'])
# gs_test_score = grid_search.score(X_test,y_test)
# print("Mean training score: ", gs_train_score)
# print("Mean test score: ", gs_test_score)
# print("Best params: ")
# gs_grid_search.best_params_

In [13]:
#Random Forests
#Bagged
bag = BaggingClassifier(n_estimators=100)
bag.fit(X_train, y_train)
print("Bagged score: ", bag.score(X_test,y_test))
#Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
print("Random Forest Score: ", rf.score(X_test,y_test))

Bagged score:  0.6481481481481481
Random Forest Score:  0.691358024691358


In [14]:
#k-nearest neighbors
n = 1
p = 100
knn = KNeighborsClassifier(n_neighbors=n, p=p)
knn.fit(X_train_scaled,y_train)
print("K_Nearest Neightbors: ", knn.score(X_test_scaled,y_test))

K_Nearest Neightbors:  0.4506172839506173


In [None]:
#XGBoost
# #Scale?
# scaler = StandardScaler()
# scaler.fit(X_train)
# scaled_X_train = scaler.transform(X_train)

#Classifier
xgb_clf = xgb.XGBClassifier(n_jobs=-1,)
xgb_clf.fit(X_train_scaled,y_train)
train_preds = xgb_clf.predict(X_train_scaled)
test_preds = xgb_clf.predict(X_test_scaled)
train_acc = accuracy_score(y_train,train_preds)
test_acc = accuracy_score(y_test,test_preds)
print("XGBoost Scores")
print("Train Score: ", train_acc)
print("Test Score: ", test_acc)