In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

In [2]:
# Load the artist-level table and preview the first rows, transposed so each column is shown as a row.
# NOTE(review): the saved output of this cell reports a FileNotFoundError for
# '../.data/NB_03_artists.csv', which is not the path read here — confirm where
# artists.csv actually lives and re-run the notebook from a fresh kernel.
df_artists = pd.read_csv('../notebooks/artists.csv', low_memory=False)
df_artists.head().T

FileNotFoundError: [Errno 2] No such file or directory: '../.data/NB_03_artists.csv'

In [None]:
# Read the track-level table, then show the first five rows with the columns laid out as rows.
df_tracks = pd.read_csv('../notebooks/tracks.csv', low_memory=False)
df_tracks.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,35iwgR4jXetI318WEWsa1Q,021ht4sdgPcrDgSk7JTbKY,07A5yehtSnoedViJAZkNnc,08FmqUhxtyLTn6pAh6bk45,08y9GfoqCWfOGsKdwojr5e
name,Carve,Capítulo 2.16 - Banquero Anarquista,Vivo para Quererte - Remasterizado,El Prisionero - Remasterizado,Lady of the Evening
popularity,6,0,0,0,0
duration_ms,126903,98200,181640,176907,163080
explicit,0,0,0,0,0
artists,['Uli'],['Fernando Pessoa'],['Ignacio Corsini'],['Ignacio Corsini'],['Dick Haymes']
id_artists,['45tIt06XoI0Iio4LBEVpls'],['14jtPCOoNZwquk5wd9DxrY'],['5LiOoJbxVSAMkBS2fUm3X2'],['5LiOoJbxVSAMkBS2fUm3X2'],['3BiJGZsyX9sJchTqcSA7Su']
release_date,1922-02-22,1922-06-01,1922-03-21,1922-03-21,1922
danceability,0.645,0.695,0.434,0.321,0.402
energy,0.445,0.263,0.177,0.0946,0.158


## Data Overview Artists

| column | additional information |
|--------|------------------------|
| id | id of artist |
| followers | number of followers | 
| genres | genres associated with artist |
| name | name of artist |
| popularity | popularity of artist in range 0 to 100 |

## Data Overview Tracks

| column | additional information |
|--------|------------------------|
| id | id of track |
| name | name of track | 
| popularity | popularity of track in range 0 to 100 |
| duration_ms | duration of songs in ms |
| explicit | whether it contains explicit content or not |
| artists | artists who created the track | 
| id_artists | id of artists who created the track |
| release_date | date of release |
| danceability | how danceable a song is in range 0 to 1 |
| energy | how energized a song is in range 0 to 1 |
| key | The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1 |
| loudness | The overall loudness of a track in decibels (dB) |
| mode |  Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0 |
| speechiness | Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks |
| acousticness | A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic |
| instrumentalness | Predicts whether a track contains no vocals. "Ooh" and "aah" sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content |
| liveness | Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live |
| valence | A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry) |
| tempo | The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration | 
| time_signature | An estimated time signature. The time signature (meter) is a notational convention to specify how many beats are in each bar (or measure). The time signature ranges from 3 to 7, indicating time signatures of 3/4 to 7/4. |

In [None]:
# Dtypes and non-null counts per column of the artists table (used to spot missing values).
df_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162095 entries, 0 to 1162094
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   id          1162095 non-null  object 
 1   followers   1162084 non-null  float64
 2   genres      1162095 non-null  object 
 3   name        1162092 non-null  object 
 4   popularity  1162095 non-null  int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 44.3+ MB


&rarr; Some missing values in columns _followers_ and _name_!

In [None]:
# Dtypes and non-null counts per column of the tracks table (used to spot missing values).
df_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                586672 non-null  object 
 1   name              586601 non-null  object 
 2   popularity        586672 non-null  int64  
 3   duration_ms       586672 non-null  int64  
 4   explicit          586672 non-null  int64  
 5   artists           586672 non-null  object 
 6   id_artists        586672 non-null  object 
 7   release_date      586672 non-null  object 
 8   danceability      586672 non-null  float64
 9   energy            586672 non-null  float64
 10  key               586672 non-null  int64  
 11  loudness          586672 non-null  float64
 12  mode              586672 non-null  int64  
 13  speechiness       586672 non-null  float64
 14  acousticness      586672 non-null  float64
 15  instrumentalness  586672 non-null  float64
 16  liveness          58

&rarr; Some missing values in column _name_!

In [None]:
# Number of distinct values in each column of the artists table.
df_artists.nunique()

id            1162095
followers       51998
genres          49155
name          1134429
popularity         99
dtype: int64

In [None]:
# Number of distinct values in each column of the tracks table.
df_tracks.nunique()

id                  586672
name                446474
popularity             101
duration_ms         123122
explicit                 2
artists             114030
id_artists          115062
release_date         19700
danceability          1285
energy                2571
key                     12
loudness             29196
mode                     2
speechiness           1655
acousticness          5217
instrumentalness      5402
liveness              1782
valence               1805
tempo               122706
time_signature           5
dtype: int64

In [None]:
# (rows, columns) of the tracks table before any cleaning.
df_tracks.shape

(586672, 20)

In [None]:
# Missing-value count for every column of the tracks table.
df_tracks.isna().sum()

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

The tracks CSV has all the data we need, so from now on we will only work with the tracks data.
We will split the data into a training and a test set to avoid data leakage.

In [None]:
# Drop the rows whose track name is missing.
# `subset` restricts the check to the `name` column, so a NaN that shows up in any
# other column (e.g. after a data refresh) cannot silently remove additional rows.
df_tracks = df_tracks.dropna(subset=['name'])

In [None]:
# Re-check the per-column missing-value counts after the rows without a name were dropped.
df_tracks.isna().sum()

id                  0
name                0
popularity          0
duration_ms         0
explicit            0
artists             0
id_artists          0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64

In [None]:
# Row-wise boolean mask: True where a row is an exact repeat of an earlier row.
df_tracks.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
586667    False
586668    False
586669    False
586670    False
586671    False
Length: 586601, dtype: bool

In [None]:
# Preview the first five rows of the cleaned tracks table.
df_tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


We have duplicated track names. Are these tracks with the same name from different artists, or do we have duplicates of the same track?

In [None]:
# Exact-duplicate check: keep=False flags every member of a duplicate group,
# so the first occurrence is listed together with its repeats.
identical_rows = df_tracks.loc[df_tracks.duplicated(keep=False)]

# Report the result as plain text
print("Completely identical rows:")
print(identical_rows)

Completely identical rows:
Empty DataFrame
Columns: [id, name, popularity, duration_ms, explicit, artists, id_artists, release_date, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature]
Index: []


In [None]:
# Tally the duplicate mask; a lone False count means no row is an exact repeat of another.
display(df_tracks.duplicated().value_counts())

False    586601
Name: count, dtype: int64

In [None]:
# First five rows, transposed so that every column is visible as a row.
df_tracks.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,35iwgR4jXetI318WEWsa1Q,021ht4sdgPcrDgSk7JTbKY,07A5yehtSnoedViJAZkNnc,08FmqUhxtyLTn6pAh6bk45,08y9GfoqCWfOGsKdwojr5e
name,Carve,Capítulo 2.16 - Banquero Anarquista,Vivo para Quererte - Remasterizado,El Prisionero - Remasterizado,Lady of the Evening
popularity,6,0,0,0,0
duration_ms,126903,98200,181640,176907,163080
explicit,0,0,0,0,0
artists,['Uli'],['Fernando Pessoa'],['Ignacio Corsini'],['Ignacio Corsini'],['Dick Haymes']
id_artists,['45tIt06XoI0Iio4LBEVpls'],['14jtPCOoNZwquk5wd9DxrY'],['5LiOoJbxVSAMkBS2fUm3X2'],['5LiOoJbxVSAMkBS2fUm3X2'],['3BiJGZsyX9sJchTqcSA7Su']
release_date,1922-02-22,1922-06-01,1922-03-21,1922-03-21,1922
danceability,0.645,0.695,0.434,0.321,0.402
energy,0.445,0.263,0.177,0.0946,0.158


In [None]:
# Train-test split

# X holds every column except the track name; the track name itself is y
features = [column for column in df_tracks.columns if column != 'name']

X = df_tracks[features]
y = df_tracks['name']

print(X.shape)
print(y.shape)

# Hold out 25% of the rows, shuffled with a fixed seed so the split is reproducible.
# stratify=y is deliberately not passed: it raises a ValueError on this target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

# Sanity check: shapes of the four resulting pieces
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

(586601, 19)
(586601,)
X_train: (439950, 19)
y_train: (439950,)
X_test: (146651, 19)
y_test: (146651,)


When we use 

 ```
stratify=y 
```

we get a ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2. 

This is because of the nature of stratification. The stratify parameter makes the split allocate a test_size share of the data of each class to the test set. In this case, at least one class has only a single member (here every distinct track name is a class), so there are not enough samples of that class to keep the splitting ratio equal to test_size.

I confirm the above explanation. I have encountered this situation when dealing with a class that has a very low count. You can either take a random (non-stratified) sample or try different test_size values until the split is large enough to hold all of your labels.

In [None]:
# Preview the training features that the following EDA cells work on
# (no new dataframe is created here; X_train is displayed as-is)
X_train.head()

Unnamed: 0,id,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
181579,5rvOGr6TfBtdVZw6gF4zDE,17,305667,0,['Holger Czukay'],['58nPlJ5CNYu0nGLOuE1Uuk'],1979,0.767,0.443,9,-13.221,0,0.0609,0.265,0.00263,0.0593,0.928,108.82,4
284654,36JLbOILRf2ELnpByw44qk,52,239467,0,['Babado Novo'],['2jGuS7w8SfDzRNbxW1Lo2c'],2005-01-01,0.705,0.656,11,-8.296,0,0.0745,0.468,0.0,0.0659,0.807,90.057,4
568339,7vo9Jo7RNswCh5o2wfBQ9Z,39,206869,1,['Paluch'],['462yq5vpZnO172v3nK9ibv'],2011-10-01,0.698,0.694,8,-7.766,1,0.291,0.412,0.0,0.208,0.414,89.978,4
297070,3IOQZRcEkplCXg6LofKqE9,67,355304,0,['Black Sabbath'],['5M52tdBnJaKSvOpJGz8mfZ'],1970-09-18,0.336,0.792,9,-10.875,1,0.084,0.00351,0.00175,0.0437,0.427,155.587,4
188218,6ExL9NmP0keIA99TJBcUBz,1,257800,0,['Workout Music '],['5njMahLKD1tLJ5bejvqseV'],2017-12-29,0.773,0.701,9,-4.374,1,0.0816,0.00794,0.0,0.0413,0.971,134.951,4


In [None]:
# Dtypes and non-null counts per column of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 439950 entries, 181579 to 121958
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                439950 non-null  object 
 1   popularity        439950 non-null  int64  
 2   duration_ms       439950 non-null  int64  
 3   explicit          439950 non-null  int64  
 4   artists           439950 non-null  object 
 5   id_artists        439950 non-null  object 
 6   release_date      439950 non-null  object 
 7   danceability      439950 non-null  float64
 8   energy            439950 non-null  float64
 9   key               439950 non-null  int64  
 10  loudness          439950 non-null  float64
 11  mode              439950 non-null  int64  
 12  speechiness       439950 non-null  float64
 13  acousticness      439950 non-null  float64
 14  instrumentalness  439950 non-null  float64
 15  liveness          439950 non-null  float64
 16  valence           43

In [None]:
# Summary statistics of the numeric training features, one feature per row.
X_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
popularity,439950.0,27.576413,18.393918,0.0,13.0,27.0,41.0,100.0
duration_ms,439950.0,229849.510858,126048.282979,3344.0,174973.0,214867.0,263640.0,5621218.0
explicit,439950.0,0.044214,0.205571,0.0,0.0,0.0,0.0,1.0
danceability,439950.0,0.563732,0.16626,0.0,0.453,0.578,0.686,0.991
energy,439950.0,0.54177,0.252079,0.0,0.343,0.549,0.748,1.0
key,439950.0,5.220232,3.518163,0.0,2.0,5.0,8.0,11.0
loudness,439950.0,-10.211345,5.096548,-60.0,-12.898,-9.251,-6.48125,5.109
mode,439950.0,0.658732,0.474136,0.0,0.0,1.0,1.0,1.0
speechiness,439950.0,0.105105,0.180236,0.0,0.034,0.0443,0.0765,0.971
acousticness,439950.0,0.450319,0.348959,0.0,0.097,0.423,0.785,0.996


In [None]:
# Count of rows in the training features that exactly repeat an earlier row.
X_train.duplicated().sum()

0

What would be interesting to look at?

* Distribution of number of artists
    * How many artists do we have in this dataset?
    * How many tracks per artist are in this dataset?
* Distribution of genre
* Distribution of valence

In [None]:
# Join artists and tracks to one df without duplicate columns