In [1]:
import numpy as np
import pandas as pd

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split




In [2]:
pd = pd.read_csv("./data/balanced_df.csv")

In [3]:
pd.head()

Unnamed: 0,lyrics,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,"Need you forever, forever, forever Need you fo...",pop,0.714,0.705,10,-6.479,1,0.0995,0.292,0.000386,0.126,0.471,114.994,219521
1,"We don't talk anymore, we don't talk anymore W...",pop,0.728,0.563,1,-8.053,0,0.134,0.621,0.0,0.179,0.352,100.017,217707
2,It almost feels like it was just a dream All t...,pop,0.669,0.593,7,-6.628,1,0.0406,0.14,0.0,0.119,0.281,87.965,145413
3,It's my direction It's my proposal It's so har...,pop,0.741,0.742,1,-7.557,1,0.0306,0.497,0.885,0.0863,0.845,102.8,219507
4,Voy a ponerme mi traje plomo Al que en su tela...,pop,0.807,0.68,2,-7.226,1,0.0353,0.0358,0.000194,0.0834,0.84,113.977,248960


In [4]:
print(f"Shape of the dataframe: {pd.shape}")
print(f"Number of unique classes: {pd['playlist_genre'].nunique()}")

Shape of the dataframe: (8000, 14)
Number of unique classes: 5


In [5]:
genre_id = {genre: i for i, genre in enumerate(pd["playlist_genre"].unique())}
pd["playlist_genre_id"] = pd["playlist_genre"].map(genre_id)

print(f"Genre id mapping: {genre_id}")

Genre id mapping: {'pop': 0, 'rap': 1, 'rock': 2, 'r&b': 3, 'edm': 4}


In [6]:
encoded_genre = pd["playlist_genre"].map(genre_id)
num_classes = len(genre_id)
encoded_genre = to_categorical(encoded_genre, num_classes)

print(f"Shape of the encoded genre: {encoded_genre.shape}")

Shape of the encoded genre: (8000, 5)


In [7]:
# Split the data into train and test 70-30
X_train, X_test, y_train, y_test = train_test_split(
    pd,
    encoded_genre,
    test_size=0.3,
    train_size=0.7,
    shuffle=True,
    random_state=42,
    stratify=encoded_genre,
)

In [8]:
# Report the (rows, columns) of each split.
train_shape = X_train.shape
test_shape = X_test.shape
print(f"Shape of the training data: {train_shape}")
print(f"Shape of the test data: {test_shape}")

Shape of the training data: (5600, 15)
Shape of the test data: (2400, 15)


In [9]:
# Preview the first rows of the training split.
X_train.head()

Unnamed: 0,lyrics,playlist_genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,playlist_genre_id
7070,"A gente terminou, eu te bloqueei Mas você aind...",edm,0.654,0.951,9,-2.935,0,0.0558,0.288,0.0,0.121,0.913,158.904,156913,4
5356,She's a very kinky girl The kind you don't tak...,r&b,0.515,0.851,10,-4.213,1,0.0333,0.476,0.00346,0.128,0.967,148.232,244320,3
4209,Someone knockin' at the door Somebody ringin' ...,rock,0.761,0.482,10,-11.849,1,0.0572,0.491,0.31,0.0864,0.626,88.054,311880,2
7257,"Let's light it up, let's light it up Until our...",edm,0.645,0.891,6,-2.505,0,0.0387,0.0932,4e-06,0.379,0.568,124.915,203520,4
6739,"Sé que te gusto a ti todavía Tres, dos, uno St...",edm,0.887,0.852,8,-5.224,1,0.0581,0.00783,0.000472,0.0697,0.621,122.989,208659,4


In [10]:
# Count how many training rows fall under each genre label.
train_genres = X_train.loc[:, "playlist_genre"]
genre_count = train_genres.value_counts()

print(f"Genre count: {genre_count}")

Genre count: playlist_genre
edm     1120
r&b     1120
rock    1120
pop     1120
rap     1120
Name: count, dtype: int64


In [11]:
# Write both splits to zip-compressed CSV files, without the index.
for split_frame, out_path in (
    (X_train, "./data/train_clean.csv.zip"),
    (X_test, "./data/test_clean.csv.zip"),
):
    split_frame.to_csv(out_path, index=False, compression="zip")