In [14]:
# import any libraries we will use for this classification problem
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
# Clean the data of any weird or unneccessary categories
cleaned_data = pd.read_csv('Data/CleanSpotifyData.csv')

In [3]:
# Get all genres to compare

rap_classical = cleaned_data.loc[cleaned_data['genre'].isin(['Rap', 'Classical'])]
rap_classical.replace(to_replace=['Rap', 'Classical'], value=[0,1], inplace=True)


electronic_jazz = cleaned_data.loc[cleaned_data['genre'].isin(['Electronic', 'Jazz'])]
electronic_jazz.replace(to_replace=['Electronic', 'Jazz'], value=[0,1], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [4]:
# Rap Classical train and test data
X_rc = rap_classical.drop('genre', axis=1)
y_rc = rap_classical['genre']

# Get trian and test sets 80% train 20% test
X_train_rc, X_test_rc, y_train_rc, y_test_rc= train_test_split(X_rc, y_rc, test_size = 0.20)

In [5]:
# Electronic Jazz train test data
X_ej = electronic_jazz.drop('genre', axis=1)
y_ej = electronic_jazz['genre']

# Get trian and test sets 80% train 20% test
X_train_ej, X_test_ej, y_train_ej, y_test_ej = train_test_split(X_ej, y_ej, test_size = 0.20)


### SVC on diffirent sets of genres

#### Rap vs Classical

In [6]:
# Train classifier
svclassifier_rc = SVC(kernel='linear')
svclassifier_rc.fit(X_train_rc, y_train_rc)

# Make predictions on test set
y_pred_rc = svclassifier_rc.predict(X_test_rc)


print(confusion_matrix(y_test_rc,y_pred_rc))
print(classification_report(y_test_rc,y_pred_rc))

[[1840   25]
 [  74 1759]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1865
           1       0.99      0.96      0.97      1833

    accuracy                           0.97      3698
   macro avg       0.97      0.97      0.97      3698
weighted avg       0.97      0.97      0.97      3698



#### Electronic vs Jazz

In [8]:
# Train classifier
svclassifier_ej = SVC(kernel='linear')
svclassifier_ej.fit(X_train_ej, y_train_ej)

# Make predictions on test set
y_pred_ej = svclassifier_ej.predict(X_test_ej)


print(confusion_matrix(y_test_ej,y_pred_ej))
print(classification_report(y_test_ej,y_pred_ej))

[[1297  556]
 [ 607 1304]]
              precision    recall  f1-score   support

           0       0.68      0.70      0.69      1853
           1       0.70      0.68      0.69      1911

    accuracy                           0.69      3764
   macro avg       0.69      0.69      0.69      3764
weighted avg       0.69      0.69      0.69      3764



### KNN on diffirent sets of genres

#### Rap vs Classical

In [20]:
knn_rc = KNeighborsClassifier(n_neighbors = 3)

knn_rc.fit(X_train_rc,y_train_rc)

knn_rc.predict(X_test_rc)

knn_rc.score(X_test_rc, y_test_rc)

# CV Score
cv_scores_rc = cross_val_score(knn_rc, X_rc, y_rc, cv=5)
print(cv_scores_rc)
print("cv_scores_rc mean: {}".format(np.mean(cv_scores_rc)))

# Grid Search
knn2_rc = KNeighborsClassifier()

param_grid_rc = {"n_neighbors": np.arange(1, 25)}

knn_gscv_rc = GridSearchCV(knn2_rc, param_grid_rc, cv=5)

knn_gscv_rc.fit(X_rc, y_rc)

print(knn_gscv_rc.best_params_)

print(knn_gscv_rc.best_score_)

[0.76561233 0.74526771 0.72680552 0.74005951 0.71760887]
cv_scores_rc mean: 0.7390707875375931
{'n_neighbors': 1}
0.7710947641713544


#### Electronic vs Jazz

In [21]:
knn_ej = KNeighborsClassifier(n_neighbors = 3)

knn_ej.fit(X_train_ej,y_train_ej)

knn_ej.predict(X_test_ej)

knn_ej.score(X_test_ej, y_test_ej)

# CV Score
cv_scores_ej = cross_val_score(knn_ej, X_ej, y_ej, cv=5)
print(cv_scores_ej)
print("cv_scores_ej mean: {}".format(np.mean(cv_scores_ej)))

# Grid Search
knn2_ej = KNeighborsClassifier()

param_grid_ej = {"n_neighbors": np.arange(1, 25)}

knn_gscv_ej = GridSearchCV(knn2_ej, param_grid_ej, cv=5)

knn_gscv_ej.fit(X_ej, y_ej)

print(knn_gscv_ej.best_params_)

print(knn_gscv_ej.best_score_)

[0.58804781 0.56827843 0.55487643 0.55461068 0.54424661]
cv_scores_ej mean: 0.5620119918126636
{'n_neighbors': 3}
0.5620150919332554
