In [51]:
# importing libraries


import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)



# IMPORTING DATA

In [52]:


# Locations of the two raw inputs: per-track genre metadata (CSV) and the
# Echonest audio metrics (JSON).
tracks_csv_path = '../input/classify-song-genres-from-audio-data/fma-rock-vs-hiphop.csv'
echonest_json_path = '../input/classify-song-genres-from-audio-data/echonest-metrics.json'

hiphop_track = pd.read_csv(tracks_csv_path)
echonest_m = pd.read_json(echonest_json_path, precise_float=True)

In [53]:
# Preview the first rows of the track metadata loaded from the CSV file.
hiphop_track.head()

In [54]:
# Preview the first rows of the Echonest metrics loaded from the JSON file.
echonest_m.head()

# MERGING THE RELEVANT COLUMNS OF HIPHOP_TRACK WITH ECHONEST_M, AS BOTH HAVE TRACK_ID

In [55]:
# Only the join key and the genre label are taken from the metadata frame;
# every other column of the merged result comes from the Echonest metrics.
genre_lookup = hiphop_track[['track_id', 'genre_top']]
merger_track = pd.merge(echonest_m, genre_lookup, on='track_id')

# INSPECTING RESULTANT DATAFRAME

In [56]:
# Count missing values per column of the merged frame.
merger_track.isnull().sum()

# THERE ARE NO NULL VALUES IN OUR DATA

In [57]:
# Print the column names, non-null counts and dtypes of the merged frame.
merger_track.info()

In [58]:
# Display the first 100 rows of the merged frame for a visual sanity check.
merger_track.head(100)

In [59]:
# Summary statistics of the merged frame's columns.
merger_track.describe()

# NOW CREATING THE CORRELATION MATRIX

In [60]:
# Pairwise correlation between the numeric columns of the merged frame.
# The string-valued 'genre_top' column is excluded explicitly: pandas >= 2.0
# no longer drops non-numeric columns in DataFrame.corr() and raises instead,
# while older versions dropped them silently, so the result is unchanged there.
corr_metrics = merger_track.select_dtypes(include='number').corr()

# Render the matrix with a colour gradient so strong correlations stand out.
corr_metrics.style.background_gradient()

# STANDARDIZATION 

In [61]:
# Target: the genre label of every track.
labels = merger_track['genre_top']

# Predictors: every remaining column except the label and the track identifier.
feature = merger_track.drop(columns=['genre_top', 'track_id'])

# Standardise the predictors; the fitted scaler is kept for later cells.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature)

# PCA ON SCALED DATA

In [15]:
# Fit a PCA with no limit on the number of components and keep the
# explained-variance ratio of each component.
pca = PCA().fit(scaled_features)
our_exp_variance = pca.explained_variance_ratio_

# Scree plot: one bar per fitted component.
component_positions = range(pca.n_components_)
fig, ax = plt.subplots()
ax.bar(component_positions, our_exp_variance)
ax.set_xlabel('PCA')

In [62]:
# Running total of the explained-variance ratios: entry i is the share of
# variance explained by the first i + 1 components.
cum_variance = np.cumsum(our_exp_variance)

# Plot the cumulative curve with a dash-dotted reference line at 0.9.
fig, ax = plt.subplots()
ax.plot(cum_variance)
ax.axhline(y=0.9, linestyle=(0, (3, 1, 1, 1)))

# np.where gives the 0-based position of the first entry above 0.9. Because
# entry i covers i + 1 components, the number of components needed to exceed
# the threshold is that position plus one (using the bare index would keep
# one component too few and stay at or below 0.9).
first_index_above = int(np.where(cum_variance > 0.9)[0][0])
n_components = first_index_above + 1

# Refit PCA restricted to the chosen number of components and project the
# scaled features onto them.
pca = PCA(n_components=n_components, random_state=10)
pca.fit(scaled_features)
pca_projection = pca.transform(scaled_features)

# NOW IT'S TIME TO TRAIN A DECISION TREE TO CLASSIFY GENRE

In [63]:
# Split the PCA projection and the labels into train and test parts with a
# fixed seed so the split is repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    pca_projection, labels, random_state=10
)

# Fit a decision tree on the training part and predict the held-out part.
d_tree = DecisionTreeClassifier(random_state=10).fit(train_features, train_labels)
predict_labels_tree = d_tree.predict(test_features)

# comparing the decision tree with a logistic regression to see which model is best for our data

In [64]:
# Fit a logistic regression on the same training split and predict the test split.
logreg = LogisticRegression(random_state=10).fit(train_features, train_labels)
predict_labels_logit = logreg.predict(test_features)

# Classification reports for both models on the held-out labels.
class_report_tree = classification_report(test_labels, predict_labels_tree)
class_report_log = classification_report(test_labels, predict_labels_logit)

# Print the two reports one after the other, each under its own heading.
for heading, report in (
    ("Decision Tree: \n", class_report_tree),
    ("Logistic Regression: \n", class_report_log),
):
    print(heading, report)

# balance our data (equal numbers of Hip-Hop and Rock tracks) for better model performance

In [65]:
# Separate the merged tracks by genre label.
is_hip_hop = merger_track['genre_top'] == 'Hip-Hop'
is_rock = merger_track['genre_top'] == 'Rock'
hip_hop = merger_track[is_hip_hop]
rock = merger_track[is_rock]
print(hip_hop.head())

# Draw as many rock tracks as there are hip-hop tracks (fixed seed), then
# stack the two groups into one balanced frame.
rock = rock.sample(n=hip_hop.shape[0], random_state=10)
rock_hiphop_balance = pd.concat([hip_hop, rock])

# Rebuild labels and predictors from the balanced frame.
labels = rock_hiphop_balance['genre_top']
feature = rock_hiphop_balance.drop(columns=['genre_top', 'track_id'])

# Re-fit the existing scaler and PCA objects on the balanced predictors.
# NOTE(review): both are fitted on all balanced rows before the split below,
# so the test rows influence the preprocessing - confirm this is acceptable.
balanced_scaled = scaler.fit_transform(feature)
pca_projection = pca.fit_transform(balanced_scaled)

(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    pca_projection, labels, random_state=10
)

# now fitting our data to logistic regression and decision tree for best prediction

In [66]:
# Decision tree trained and evaluated on the current train/test split.
d_tree = DecisionTreeClassifier(random_state=10).fit(train_features, train_labels)
predict_labels_tree = d_tree.predict(test_features)

# Logistic regression trained and evaluated on the same split.
logregresion = LogisticRegression(random_state=10).fit(train_features, train_labels)
predict_labels_logistic = logregresion.predict(test_features)

# comparing models

In [67]:
# Build both classification reports first, then print them under their headings.
balanced_tree_report = classification_report(test_labels, predict_labels_tree)
balanced_logistic_report = classification_report(test_labels, predict_labels_logistic)

print("Decision Tree: \n", balanced_tree_report)
print("Logistic Regression: \n", balanced_logistic_report)

# since the way our data is split into train and test sets can impact model performance, we use a cross-validation (CV) method called K-fold

In [70]:
# 10-fold stratified cross-validation with shuffling and a fixed seed.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)

d_tree = DecisionTreeClassifier(random_state=10)
logregresion = LogisticRegression(random_state=10)

# Wrap each classifier in a scaling -> PCA -> model pipeline and cross-validate
# on the unscaled balanced predictors. This way the scaler and the PCA are
# re-fitted on the training part of every fold; scoring a projection that was
# fitted on the whole dataset would let each validation fold influence its own
# preprocessing and bias the scores.
tree_pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=n_components, random_state=10),
    d_tree,
)
logistic_pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=n_components, random_state=10),
    logregresion,
)

# One score per fold for each pipeline.
tree_score = cross_val_score(tree_pipeline, feature, labels, cv=kf)
logistic_score = cross_val_score(logistic_pipeline, feature, labels, cv=kf)

In [71]:
# Average the per-fold scores of each model and print them side by side.
mean_tree_score = tree_score.mean()
mean_logistic_score = logistic_score.mean()
print("Decision Tree:", mean_tree_score, "Logistic Regression:", mean_logistic_score)

# Finally, we get aggregate results from each fold for a final model performance score