In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingRegressor
import xgboost as xgb

In [2]:
df = pd.read_csv('data.csv', index_col='Unnamed: 0')
df['label'].value_counts()

Jazz          122
Metal         109
Electronic    108
Hip_Hop       108
Country       104
Classical      97
Name: label, dtype: int64

In [4]:
#Assign int to genres
genre_dict = {
    'Country' : 0,
    'Classical' : 1,
    'Metal' : 2,
    'Jazz' : 3,
    'Hip_Hop' : 4,
    'Electronic' : 5
}

#split data
X = df.drop(['name', 'label'], axis=1).astype(float)
y = df['label'].map(genre_dict)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# Standard Scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
#Logisitic Regression

logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
log_score = np.mean(cross_val_score(logreg, X_scaled, y, cv=3))
print("Mean Cross Validation Score: ", log_score)

Mean Cross Validation Score:  0.7219640723063226


In [21]:
logreg.fit(X_train_scaled, y_train)
logist_score = logreg.score(X_test_scaled, y_test)
print("Logistic Regression: ", logist_score)

Logistic Regression:  0.654320987654321


In [25]:
#Decision Trees
dtc = DecisionTreeClassifier()
dtc_cv_score = np.mean(cross_val_score(dtc, X, y, cv=3))
dtc.fit(X_train, y_train)
dtc_acc_score = dtc.score(X_test,y_test)
print("Accuracy Score: ", dtc_acc_score)
print("Mean Cross Val Score: ", dtc_cv_score)

Accuracy Score:  0.48148148148148145
Mean Cross Val Score:  0.5398680855252921


In [None]:
param_grid = {
    'max_depth' : []
    'min_samples_splt' : []
    'min_samples_leaf' : []
}

grid_search = GridSearchCV(dtc, param_grid, cv=3, return_train_score=True)
grid_search.fit(scaled_X_train,y_train)

In [None]:
gs_train_score = np.mean(grid_search.cv_results_['mean_train_score'])
gs_test_score = grid_search.score(X_test,y_test)
print("Mean training score: ", gs_train_score)
print("Mean test score: ", gs_test_score)
print("Best params: ")
gs_grid_search.best_params_

In [28]:
#Random Forests
#Bagged
bag = BaggingRegressor(n_estimators=100)
bag.fit(X_train, y_train)
print("Bagged score: ", bag.score(X_test, y_test))
#Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
print("Random Forest Score: ", rf.score(X_test, y_test))

Bagged score:  0.4523648273739921
Random Forest Score:  0.7037037037037037


In [30]:
#k-nearest neighbors
n = 1
p = 100
knn = KNeighborsClassifier(n_neighbors=n, p=p)
knn.fit(X_train_scaled, y_train)
print("K_Nearest Neightbors: ", knn.score(X_test_scaled, y_test))

K_Nearest Neightbors:  0.4506172839506173


In [None]:
#XGBoost

#Classifier
xgb_clf = xgb.XGBClassifier(n_jobs=-1)
xgb_clf.fit(X_train_scaled, y_train)
train_preds = xgb_clf.predict(X_train_scaled)
test_preds = xgb_clf.predict(X_test_scaled)
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
print("XGBoost Scores")
print("Train Score: ", train_acc)
print("Test Score: ", test_acc)