# Enriched Approach

In [11]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## Cleaning the data

In [2]:
data_set = pd.read_csv("Full_data.csv")

original_size = len(data_set)

# Dropping unnecessary columns
drop_columns = ["Unnamed: 0", "lyrics", "data", "analysis_url", "id", "track_href", "type", "uri"]
data_set = data_set.drop(drop_columns, axis=1)

# Dropping invalid songs
null_elem = data_set.isnull().any(axis=1)
data_set = data_set[~null_elem]
data_set = data_set.reset_index(drop=True)
reduced_size = len(data_set)

# Encoding author, style and title
to_encode = ["author", "style", "title"]
new_col_names = ["code_author", "code_style", "code_title"]
new_col_index = [1, 3, 5]
encoders = []
for i in range(len(to_encode)):
    values = data_set[to_encode[i]]
    encoder = LabelEncoder()
    encoders.append(encoder)
    new_col = encoder.fit_transform(values) 
    data_set.insert(new_col_index[i], new_col_names[i], new_col)
    
# Dropping author, style and title
drop_columns = ["author", "style", "title"]
data_set = data_set.drop(drop_columns, axis=1)

reduction = (original_size-reduced_size)/original_size
print("Data set reduction = ", reduction)
data_set

Data set reduction =  0.286112349531877


Unnamed: 0,code_author,code_style,code_title,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,2348,9,17672,0.277000,0.517,293593.0,0.590,0.000000,4.0,0.1130,-6.670,1.0,0.0393,129.800,4.0,0.129
1,986,9,19130,0.163000,0.599,263400.0,0.448,0.000000,8.0,0.1060,-6.312,1.0,0.0232,95.050,3.0,0.168
2,1941,6,28053,0.388000,0.680,264013.0,0.640,0.000000,5.0,0.4080,-7.803,1.0,0.1340,81.974,4.0,0.775
3,507,6,16700,0.678000,0.779,429273.0,0.605,0.005550,4.0,0.1850,-9.522,0.0,0.0386,110.008,4.0,0.571
4,507,6,17608,0.500000,0.404,194667.0,0.833,0.000000,2.0,0.9670,-6.245,1.0,0.1360,171.969,4.0,0.706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32020,1186,15,10224,0.056400,0.302,169373.0,0.920,0.000000,2.0,0.4540,-3.566,1.0,0.0925,179.956,4.0,0.407
32021,541,15,23865,0.000330,0.355,139400.0,0.955,0.000000,1.0,0.1320,-3.045,0.0,0.0800,179.043,4.0,0.654
32022,2820,15,6453,0.021700,0.596,282133.0,0.903,0.000000,4.0,0.6280,-2.908,0.0,0.1450,133.173,4.0,0.309
32023,3123,15,11225,0.000202,0.372,218004.0,0.961,0.000002,8.0,0.0945,-2.407,1.0,0.2210,165.054,4.0,0.337


In [7]:
Y = data_set["code_style"]
X = data_set.drop("code_style", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
clf = MLPClassifier(solver="sgd", alpha=0.001, activation="logistic", hidden_layer_sizes=(16, 50))

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(predictions)
print(classification_report(y_test, predictions))

22417 9608
[0 0 0 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.03      1.00      0.05       260
           1       0.00      0.00      0.00       236
           2       0.00      0.00      0.00       170
           3       0.00      0.00      0.00       202
           4       0.00      0.00      0.00       240
           5       0.00      0.00      0.00       189
           6       0.00      0.00      0.00       221
           7       0.00      0.00      0.00       223
           8       0.00      0.00      0.00       188
           9       0.00      0.00      0.00       193
          10       0.00      0.00      0.00       220
          11       0.00      0.00      0.00       206
          12       0.00      0.00      0.00       268
          13       0.00      0.00      0.00       152
          14       0.00      0.00      0.00       238
          15       0.00      0.00      0.00       248
          16       0.00      0.00      0.00       16

  'precision', 'predicted', average, warn_for)


## Using grid approach

In [8]:
mlp = MLPClassifier(max_iter=100)

In [9]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [None]:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=7)
clf.fit(X_train, y_train)