In [16]:
import pandas as pd

# Read in track metadata with genre labels
tracks = pd.read_csv("./fma-rock-vs-hiphop.csv")
print(tracks.columns)

# Read in track metrics with the features
echonest_metrics = pd.read_json("./echonest-metrics.json",precise_float=True)
print(echonest_metrics.head())

# Merge the relevant columns of tracks and echonest_metrics
echo_tracks = echonest_metrics.merge(tracks[["track_id","genre_top"]],left_on='track_id', right_on='track_id')

# Inspect the resultant dataframe
print(echo_tracks.shape)
print(echo_tracks.info())

Index(['track_id', 'bit_rate', 'comments', 'composer', 'date_created',
       'date_recorded', 'duration', 'favorites', 'genre_top', 'genres',
       'genres_all', 'information', 'interest', 'language_code', 'license',
       'listens', 'lyricist', 'number', 'publisher', 'tags', 'title'],
      dtype='object')
   track_id  acousticness  danceability    energy  instrumentalness  liveness  \
0         2      0.416675      0.675894  0.634476          0.010628  0.177647   
1         3      0.374408      0.528643  0.817461          0.001851  0.105880   
2         5      0.043567      0.745566  0.701470          0.000697  0.373143   
3        10      0.951670      0.658179  0.924525          0.965427  0.115474   
4       134      0.452217      0.513238  0.560410          0.019443  0.096567   

   speechiness    tempo   valence  
0     0.159310  165.922  0.576661  
1     0.461818  126.957  0.269240  
2     0.124595  100.260  0.621661  
3     0.032985  111.562  0.963590  
4     0.525519  114.2

In [5]:
# Define our features 
features = echo_tracks.drop(["genre_top","track_id"],axis=1)
print(features.head())
print()
# Define our labels
labels = echo_tracks["genre_top"]
print(labels.head())

# Import the StandardScaler
from sklearn.preprocessing import StandardScaler

# Scale the features and set the values to a new variable
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(features)
print(scaled_train_features)

   acousticness  danceability    energy  instrumentalness  liveness  \
0      0.416675      0.675894  0.634476          0.010628  0.177647   
1      0.374408      0.528643  0.817461          0.001851  0.105880   
2      0.043567      0.745566  0.701470          0.000697  0.373143   
3      0.452217      0.513238  0.560410          0.019443  0.096567   
4      0.988306      0.255661  0.979774          0.973006  0.121342   

   speechiness    tempo   valence  
0     0.159310  165.922  0.576661  
1     0.461818  126.957  0.269240  
2     0.124595  100.260  0.621661  
3     0.525519  114.290  0.894072  
4     0.051740   90.241  0.034018  

0    Hip-Hop
1    Hip-Hop
2    Hip-Hop
3    Hip-Hop
4       Rock
Name: genre_top, dtype: object
[[-0.19121034  1.30442004  0.03831594 ...  0.37303429  1.15397908
   0.46228696]
 [-0.30603598  0.50188641  0.78817624 ...  2.44615517  0.00791367
  -0.69081137]
 [-1.20481276  1.68413943  0.31285194 ...  0.13513049 -0.77731688
   0.63107745]
 ...
 [-1.2947043

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Split our data
train_features, test_features, train_labels, test_labels = train_test_split(scaled_train_features,labels,random_state=10)

# Train our decision tree
tree = DecisionTreeClassifier(random_state=10)
tree.fit(train_features, train_labels)


# Predict the labels for the test data
pred_labels_tree = tree.predict(test_features)

In [7]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report 

# Train our logistic regression and predict labels for the test set
logreg = LogisticRegression(random_state=10)
logreg.fit(train_features, train_labels)
pred_labels_logit = logreg.predict(test_features)

# Create the classification report for both models
from sklearn.metrics import classification_report
class_rep_tree = classification_report(test_labels,pred_labels_tree)
class_rep_log = classification_report(test_labels,pred_labels_logit)

print("Decision Tree: \n", class_rep_tree)
print("Logistic Regression: \n", class_rep_log)

Decision Tree: 
               precision    recall  f1-score   support

     Hip-Hop       0.66      0.68      0.67       235
        Rock       0.92      0.92      0.92       966

    accuracy                           0.87      1201
   macro avg       0.79      0.80      0.79      1201
weighted avg       0.87      0.87      0.87      1201

Logistic Regression: 
               precision    recall  f1-score   support

     Hip-Hop       0.80      0.61      0.69       235
        Rock       0.91      0.96      0.94       966

    accuracy                           0.89      1201
   macro avg       0.86      0.79      0.82      1201
weighted avg       0.89      0.89      0.89      1201



In [8]:
# Subset only the hip-hop tracks, and then only the rock tracks
print(echo_tracks.head())
hop_only = echo_tracks.loc[echo_tracks["genre_top"] == "Hip-Hop"]
rock_only = echo_tracks.loc[echo_tracks["genre_top"] == "Rock"]
print(hop_only.shape)
print(rock_only.shape)

# sample the rocks songs to be the same number as there are hip-hop songs
rock_only = rock_only.sample(n=hop_only.shape[0],random_state=10)
print(hop_only.shape)
print(rock_only.shape)

# concatenate the dataframes rock_only and hop_only
rock_hop_bal = pd.concat([rock_only,hop_only])

# The features, labels, and pca projection are created for the balanced dataframe
features = rock_hop_bal.drop(['genre_top', 'track_id'], axis=1) 
labels = rock_hop_bal['genre_top']
#pcap = pca.fit_transform(scaler.fit_transform(features))

# Redefine the train and test set with the pca_projection from the balanced data
train_features, test_features, train_labels, test_labels = train_test_split(features,labels
                                                                            ,random_state=10)

   track_id  acousticness  danceability    energy  instrumentalness  liveness  \
0         2      0.416675      0.675894  0.634476          0.010628  0.177647   
1         3      0.374408      0.528643  0.817461          0.001851  0.105880   
2         5      0.043567      0.745566  0.701470          0.000697  0.373143   
3       134      0.452217      0.513238  0.560410          0.019443  0.096567   
4       153      0.988306      0.255661  0.979774          0.973006  0.121342   

   speechiness    tempo   valence genre_top  
0     0.159310  165.922  0.576661   Hip-Hop  
1     0.461818  126.957  0.269240   Hip-Hop  
2     0.124595  100.260  0.621661   Hip-Hop  
3     0.525519  114.290  0.894072   Hip-Hop  
4     0.051740   90.241  0.034018      Rock  
(910, 10)
(3892, 10)
(910, 10)
(910, 10)


In [10]:
# Train our decision tree on the balanced data
tree = DecisionTreeClassifier(random_state=10)
tree.fit(train_features, train_labels)
pred_labels_tree = tree.predict(test_features)
# Train our logistic regression on the balanced data

logreg = LogisticRegression(random_state=10)
logreg.fit(train_features, train_labels)
pred_labels_logit = logreg.predict(test_features)
# Compare the models
print("Decision Tree: \n", classification_report(test_labels,pred_labels_tree))
print("Logistic Regression: \n", classification_report(test_labels,pred_labels_logit))

Decision Tree: 
               precision    recall  f1-score   support

     Hip-Hop       0.81      0.85      0.83       230
        Rock       0.84      0.79      0.81       225

    accuracy                           0.82       455
   macro avg       0.82      0.82      0.82       455
weighted avg       0.82      0.82      0.82       455

Logistic Regression: 
               precision    recall  f1-score   support

     Hip-Hop       0.84      0.82      0.83       230
        Rock       0.82      0.84      0.83       225

    accuracy                           0.83       455
   macro avg       0.83      0.83      0.83       455
weighted avg       0.83      0.83      0.83       455



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
from sklearn.svm import SVC
svc=SVC(kernel="linear",gamma="auto")
svc.fit(train_features, train_labels)
pred=svc.predict(test_features)
print("SVC: \n", classification_report(test_labels,pred))

SVC: 
               precision    recall  f1-score   support

     Hip-Hop       0.90      0.83      0.86       230
        Rock       0.84      0.90      0.87       225

    accuracy                           0.86       455
   macro avg       0.87      0.86      0.86       455
weighted avg       0.87      0.86      0.86       455

