In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold

In [21]:
df = pd.read_csv('Spotify-Dupes.csv',index_col=0)
df.drop(['track_id', 'album_name'], axis = 1, inplace = True)

In [22]:
df["artists"] = df["artists"].str.split(';')

In [23]:
df['explicit'] = df['explicit'].astype(bool)

In [24]:
track_genre_dict = {genre: i for i, genre in enumerate(df['track_genre'].unique())}
df['track_genre'] = df['track_genre'].apply(lambda x: track_genre_dict[x])

In [25]:
df2 = df.groupby(["track_name"]).agg({"artists":lambda x: x.iloc[0],
                                      "popularity":lambda x: x.iloc[0],
                                      "duration_ms":lambda x: x.iloc[0],
                                      "explicit":lambda x: x.iloc[0],
                                      "danceability":lambda x: x.iloc[0],
                                      "energy":lambda x: x.iloc[0],
                                      "key":lambda x: x.iloc[0],
                                      "loudness":lambda x: x.iloc[0],
                                      "mode":lambda x: x.iloc[0],
                                      "speechiness":lambda x: x.iloc[0],
                                      "acousticness":lambda x: x.iloc[0],
                                      "instrumentalness":lambda x: x.iloc[0],
                                      "liveness":lambda x: x.iloc[0],
                                      "track_genre": list})


In [26]:
all_artists = df2["artists"].apply(pd.Series).stack().unique()
artist_dict = {artist: i for i, artist in enumerate(all_artists)}
df2['artists'] = df2['artists'].apply(lambda x: [artist_dict[a] for a in x])

In [27]:
df2 = df2.dropna()

In [28]:
mlb = MultiLabelBinarizer()
df3 = df2.sample(8000)
artists_bin = mlb.fit_transform(df3['artists'])
track_genre_bin = mlb.fit_transform(df3['track_genre'])

result = []
for x in df3.columns:
    if (x != 'track_genre' and x!='artists'):
        result.append(x)

In [29]:
X = df3[result].values
y = track_genre_bin

print(X.shape)
print(y.shape)

(8000, 12)
(8000, 114)


In [30]:
X = np.concatenate((X, artists_bin), axis=1)

In [31]:
dt = MultiOutputClassifier(DecisionTreeClassifier(criterion="entropy"))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6000, 7137)
(2000, 7137)
(6000, 114)
(2000, 114)


In [33]:
dt.fit(X_train,y_train)

MultiOutputClassifier(estimator=DecisionTreeClassifier(criterion='entropy'))

In [34]:
# Make predictions on the testing data
y_pred = dt.predict(X_test)

In [35]:
print(y_pred[:5])
print(y_test[:5])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [36]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)

Accuracy: 0.18


In [40]:
mprecision = precision_score(y_test, y_pred, average='micro')
print('The Micro Precision is: ',mprecision*100,'%')

The Micro Precision is:  40.27149321266968 %


In [41]:
# Use 5-fold split
kf = KFold(5,shuffle=True)

fold = 1
# The data is split five ways, for each fold, the 
# decision tree is trained, tested and evaluated for accuracy
for train_index, validate_index in kf.split(X,y):
    dt.fit(X[train_index],y[train_index])
    y_test = y[validate_index]
    y_pred = dt.predict(X[validate_index])
    print(f"Fold #{fold}, Training Size: {len(X[train_index])}, Validation Size: {len(X[validate_index])}")
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print('Micro Precision is: ',mprecision*100,'%')
    fold += 1

Fold #1, Training Size: 6400, Validation Size: 1600
Accuracy: 0.18
Micro Precision is:  40.27149321266968 %
Fold #2, Training Size: 6400, Validation Size: 1600
Accuracy: 0.20
Micro Precision is:  40.27149321266968 %
Fold #3, Training Size: 6400, Validation Size: 1600
Accuracy: 0.18
Micro Precision is:  40.27149321266968 %
Fold #4, Training Size: 6400, Validation Size: 1600
Accuracy: 0.19
Micro Precision is:  40.27149321266968 %
Fold #5, Training Size: 6400, Validation Size: 1600
Accuracy: 0.17
Micro Precision is:  40.27149321266968 %
