## Importamos las librerías

In [1]:
import pandas as pd

## Leemos los datos

In [2]:
df = pd.read_json("../Data/endsong.json") # load the JSON file into a DataFrame

# Show the column names
df.columns

Index(['ts', 'username', 'platform', 'ms_played', 'conn_country',
       'ip_addr_decrypted', 'user_agent_decrypted',
       'master_metadata_track_name', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'spotify_track_uri', 'episode_name',
       'episode_show_name', 'spotify_episode_uri', 'reason_start',
       'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp',
       'incognito_mode'],
      dtype='object')

In [3]:
# Drop the columns that are not needed (modifies df in place)
df.drop(
    columns=[
        'username',
        'conn_country',
        'ip_addr_decrypted',
        'user_agent_decrypted',
        'spotify_track_uri',
        'episode_name',
        'episode_show_name',
        'spotify_episode_uri',
        'incognito_mode',
    ],
    inplace=True,
)
df.head()

Unnamed: 0,ts,platform,ms_played,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp
0,2022-05-22T16:54:58Z,Windows 10 (10.0.22000; x64; AppX),166706,So Good (feat. Ty Dolla $ign),Zara Larsson,So Good,trackdone,trackdone,True,,False,1653238330199
1,2022-08-16T12:48:28Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",239894,Love Me Again,John Newman,Tribute,fwdbtn,trackdone,False,,False,1660653867377
2,2022-08-16T03:08:14Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",2900,Víctima y Verdugo,Porta,Algo Ha Cambiado,fwdbtn,fwdbtn,True,,False,1660619290523
3,2023-03-10T02:52:41Z,android,0,En Boca De Tantos,Porta,En Boca De Tantos,fwdbtn,fwdbtn,True,1.0,True,1678416760
4,2022-07-16T17:10:05Z,Windows 10 (10.0.22622; x64; AppX),144686,Final del Juego (Avengers Endgame Rap),Keyblade,Final del Juego (Avengers Endgame Rap),trackdone,remote,True,,False,1657986674400


In [4]:
# Rename the metadata columns to shorter names (modifies df in place)
df.rename(
    columns={
        'master_metadata_track_name': 'track_name',
        'master_metadata_album_artist_name': 'album_artist_name',
        'master_metadata_album_album_name': 'album_name',
    },
    inplace=True,
)
df.sample(10)

Unnamed: 0,ts,platform,ms_played,track_name,album_artist_name,album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp
2442,2022-08-16T03:14:08Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",180066,Logro Desbloqueado,Porta,Algo Ha Cambiado,trackdone,trackdone,True,,False,1660619466937
370,2022-07-14T12:25:44Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",74375,Favor Con Favor,Santa RM,Mala Ortografía,trackdone,logout,True,,False,1657757776969
2330,2022-09-09T20:21:10Z,Partner amazon_salmon Amazon;Echo_Show_5;27d4d...,83069,Words Ain't Enough,Tessa Violet,Bad Ideas,trackdone,remote,False,,False,0
6165,2022-09-01T00:10:31Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",23,Los Más Buscados - Remix,Santa RM,Listo para Lo Que Venga,fwdbtn,fwdbtn,True,,False,1661991029335
7796,2022-08-19T04:09:56Z,Partner amazon_salmon Amazon;Echo_Show_5;27d4d...,169724,Interlude III,Tessa Violet,Bad Ideas,trackdone,trackdone,True,,False,0
8068,2022-08-01T03:34:16Z,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",253781,Daño Colateral,Santa RM,Listo para Lo Que Venga,trackdone,trackdone,True,,False,1659324599391
9216,2023-03-07T09:44:39Z,android,119880,Dragon Ball Rap,Porta,En Boca De Tantos,trackdone,trackdone,True,0.0,True,1678182552
12465,2023-03-31T20:36:47Z,windows,162360,YES MOM (I'm The One To Beat),Tessa Violet,YES MOM (I'm The One To Beat),trackdone,trackdone,True,0.0,False,1680294844
11391,2022-08-21T19:45:39Z,Partner amazon_salmon Amazon;Echo_Show_5;27d4d...,186850,decide to be happy,MisterWives,SUPERBLOOM,clickrow,trackdone,False,,False,0
6993,2022-10-23T23:39:06Z,android,6204,Same Drugs,MisterWives,Same Drugs,appload,remote,True,0.0,False,1666568298


In [5]:
# Count the missing values in each column
df.isna().sum()

ts                      0
platform                0
ms_played               0
track_name              3
album_artist_name       3
album_name              3
reason_start            0
reason_end              0
shuffle                 0
skipped              9295
offline                 0
offline_timestamp       0
dtype: int64

In [6]:
# Check which values the "skipped" variable takes (value_counts leaves NaN out by default)
df["skipped"].value_counts()

0.0    3759
1.0    1683
Name: skipped, dtype: int64

Dado que la variable indica si saltamos una canción (toma el valor 1 si fue así y 0 en caso contrario), podemos deducir que un `NaN` significa que el salto de la canción no aplica.

Un ejemplo puede ser que la reproducción haya finalizado. Por ello podemos asignar el valor 0: si el salto no aplica, podemos entender que la canción no se saltó.

In [7]:
# Replace the null values in "skipped" with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df["skipped"] selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled, so the nulls would silently remain.
df["skipped"] = df["skipped"].fillna(0)

In [8]:
# Count the missing values per column again after filling "skipped"
df.isna().sum()

ts                   0
platform             0
ms_played            0
track_name           3
album_artist_name    3
album_name           3
reason_start         0
reason_end           0
shuffle              0
skipped              0
offline              0
offline_timestamp    0
dtype: int64

In [9]:
# Check whether rows with a null in one column also have nulls in the others:
# build a boolean mask of rows whose album_artist_name is null and display them
valores_nulos = df['album_artist_name'].isnull()
df[valores_nulos]

Unnamed: 0,ts,platform,ms_played,track_name,album_artist_name,album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp
129,2022-07-01T22:07:18Z,web_player windows 10;chrome 102.0.5005.115;de...,455500,,,,remote,unexpected-exit-while-paused,False,0.0,False,0
6754,2022-07-01T18:30:38Z,web_player windows 10;chrome 101.0.4951.67;des...,90979,,,,remote,unexpected-exit,False,0.0,False,0
12990,2022-07-01T22:07:18Z,web_player windows 10;chrome 102.0.5005.115;de...,558338,,,,remote,unexpected-exit,False,0.0,False,0


In [10]:
# Drop the rows that contain null values
# (how="any" removes a row as soon as any of its columns is null; df is modified in place)
df.dropna(inplace=True, how="any")

In [11]:
# Inspect the column dtypes to see whether any conversion is needed
df.dtypes

ts                    object
platform              object
ms_played              int64
track_name            object
album_artist_name     object
album_name            object
reason_start          object
reason_end            object
shuffle                 bool
skipped              float64
offline                 bool
offline_timestamp      int64
dtype: object

In [13]:
# Look at the contents of the ts column
df["ts"].head()

0    2022-05-22T16:54:58Z
1    2022-08-16T12:48:28Z
2    2022-08-16T03:08:14Z
3    2023-03-10T02:52:41Z
4    2022-07-16T17:10:05Z
Name: ts, dtype: object

In [14]:
# Use the ts values to add the year, month and day columns.
# The timestamps are parsed once and the result is reused, instead of building
# a separate DatetimeIndex (and re-parsing every string) for each component.
ts_index = pd.DatetimeIndex(df["ts"])
df['year'] = ts_index.year
df['month'] = ts_index.month
df['day'] = ts_index.day

In [15]:
# Drop the ts column (modifies df in place)
df.drop(columns=['ts'], inplace=True)

In [16]:
# Show a random sample of 10 rows
df.sample(10)

Unnamed: 0,platform,ms_played,track_name,album_artist_name,album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,year,month,day
304,Partner amazon_salmon Amazon;Echo_Show_5;27d4d...,179173,Bad Ideas,Tessa Violet,Bad Ideas,trackdone,trackdone,False,0.0,False,0,2022,8,14
7084,Windows 10 (10.0.22622; x64; AppX),142893,Jingle Bell Rock -Spotify Singles Holiday,MisterWives,Jingle Bell Rock -Spotify Singles Holiday,fwdbtn,trackdone,True,0.0,False,1659907601866,2022,8,7
1857,"Android OS 12 API 31 (Xiaomi, Redmi Note 8)",157560,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Hollywood's Bleeding,fwdbtn,trackdone,True,0.0,True,1648137661727,2022,3,24
8175,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",180066,Logro Desbloqueado,Porta,Algo Ha Cambiado,trackdone,trackdone,True,0.0,False,1659742696382,2022,8,5
14004,Windows 10 (10.0.22000; x64; AppX),19955,En Este Track,Santa RM,Bajo Cero,trackdone,fwdbtn,True,0.0,False,1650059141697,2022,4,15
12274,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",2747,Logro Desbloqueado,Porta,Algo Ha Cambiado,fwdbtn,fwdbtn,True,0.0,False,1660870701973,2022,8,19
9408,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",253200,Riptide,MisterWives,Spotify Sessions,trackdone,trackdone,True,0.0,True,1649155983265,2022,4,5
5889,not_applicable,197337,In Your Eyes - Radio Edit,INNA,In Your Eyes,trackdone,trackdone,False,0.0,False,0,2022,11,1
727,android,203573,La Locura Está en Mí - Streaming Remaster,Santaflow,Ave Fénix,trackdone,trackdone,True,0.0,True,1678168983,2023,3,7
8088,"Android OS 12 API 31 (Xiaomi, Redmi Note 8)",232186,Our Own House,MisterWives,Our Own House,trackdone,trackdone,True,0.0,False,1647967059327,2022,3,22


In [17]:
# Cambiamos ms_played a s_played convirtiendo de milisegundos a segundos

def convert_ms_to_s(ms_played):
    """Convert a duration from milliseconds to seconds.

    Args:
        ms_played: Duration in milliseconds.

    Returns:
        The duration in seconds, i.e. ``ms_played`` divided by 1000.
    """
    seconds = ms_played / 1000
    return seconds

# Convert the whole column with a single vectorised call: the division inside
# convert_ms_to_s works element-wise on a Series, so a per-row apply is not needed.
df["s_played"] = convert_ms_to_s(df["ms_played"])

# Drop the ms_played column (modifies df in place)
df.drop("ms_played", axis=1, inplace=True)

In [18]:
# Review the dataset with a random sample of 10 rows
df.sample(10)

Unnamed: 0,platform,track_name,album_artist_name,album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,year,month,day,s_played
11450,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",No Sé en Qué Te Fallé,Santa RM,Mala Ortografía,trackdone,fwdbtn,True,0.0,False,1658532523878,2022,7,22,28.717
13762,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",Realidad Virtual,Porta,Equilibrio,fwdbtn,fwdbtn,True,0.0,False,1650366060099,2022,4,19,0.023
5195,android,Un Par De Balas,C-Kan,Clasificacion C Vol.1,fwdbtn,fwdbtn,True,1.0,True,1677896799,2023,3,4,1.843
3479,windows,Bang Bang,K'NAAN,Troubadour,trackdone,trackdone,False,0.0,False,1674414726,2023,1,22,186.533
5247,android,La Locura Está en Mí - Streaming Remaster,Santaflow,Ave Fénix,trackdone,trackdone,True,0.0,True,1671720277,2022,12,22,203.573
2316,android,Chasing This,MisterWives,Connect The Dots,fwdbtn,fwdbtn,True,1.0,False,1667939355,2022,11,8,0.957
12689,windows,I Really Want to Stay at Your House,Rosa Walton,"Cyberpunk 2077: Radio, Vol. 2 (Original Soundt...",trackdone,trackdone,False,0.0,False,0,2022,12,19,246.422
10237,Partner amazon_salmon Amazon;Echo_Show_5;27d4d...,Kitchen Song,Tessa Violet,Kitchen Song,trackdone,trackdone,False,0.0,False,0,2022,8,14,155.08
11297,"Android OS 12 API 31 (Xiaomi, Redmi Note 8)",FRIENDS,Marshmello,Speak Your Mind,trackdone,trackdone,True,0.0,True,1648468721865,2022,3,28,202.62
5910,"Android OS 11 API 30 (Xiaomi, Redmi Note 8)",Come to Me Slowly,Margo Guryan,27 Demos,trackdone,trackdone,False,0.0,False,1661048586565,2022,8,21,142.813


In [19]:
# Inspect the distinct values of the "platform" column
df["platform"].unique()

array(['Windows 10 (10.0.22000; x64; AppX)',
       'Android OS 11 API 30 (Xiaomi, Redmi Note 8)', 'android',
       'Windows 10 (10.0.22622; x64; AppX)',
       'Android OS 12 API 31 (Xiaomi, Redmi Note 8)',
       'Windows 10 (10.0.19042; x64; AppX)',
       'Partner amazon_salmon Amazon;Echo_Show_5;27d4dfe427b34d57995b463e5d63198d;;tpapi',
       'windows', 'web_player windows 10;chrome 101.0.4951.67;desktop',
       'Windows 10 (10.0.22621; x64; AppX)',
       'web_player windows 10;chrome 102.0.5005.115;desktop',
       'web_player windows 10;chrome 100.0.4896.127;desktop',
       'Android OS 12 API 32 (Xiaomi, Redmi Note 8)', 'not_applicable',
       'Linux [x86-64 0]',
       'web_player windows 10;chrome 103.0.5060.114;desktop',
       'web_player windows 10;chrome 103.0.5060.134;desktop',
       'web_player windows 10;chrome 99.0.4844.84;desktop',
       'Linux Ubuntu Core 18 (snap package) [x86-64 0]',
       'web_player windows 10;chrome 104.0.5112.102;desktop',
       'Wind

In [20]:
# Group the raw platform strings into a few broad categories:
# Android and Windows versions are collapsed, and web browsers are grouped together.
def reasignar(valor):
    """Map a raw platform string to a coarse platform category.

    Args:
        valor: Raw platform description; matched case-insensitively.

    Returns:
        One of 'Web-browser', 'Android', 'Windows', 'Linux' or 'Otros'.
    """
    plataforma = valor.lower()  # normalise once instead of once per branch

    # Web-player entries embed the host OS (e.g. 'web_player windows 10;chrome ...'),
    # so they must be recognised before the OS checks; otherwise they are
    # reported as 'Windows' and never reach the browser branch.
    if 'web_player' in plataforma:
        return 'Web-browser'
    if 'android' in plataforma:
        return 'Android'
    if 'windows' in plataforma:
        return 'Windows'
    if 'edge' in plataforma or 'chrome' in plataforma:
        return 'Web-browser'
    if 'linux' in plataforma:
        return 'Linux'
    return 'Otros'

# Replace every raw platform string with its grouped category
df['platform'] = df['platform'].map(reasignar)

In [21]:
# Look at the final shape of the dataset with a random sample of 10 rows
df.sample(10)

Unnamed: 0,platform,track_name,album_artist_name,album_name,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,year,month,day,s_played
10390,Android,Bored,Tessa Violet,Bored,clickrow,endplay,False,0.0,False,1637719690723,2021,11,24,6.43
9861,Android,Zombies on Your Lawn,Laura Shigihara,Plants Vs. Zombies (Original Video Game Soundt...,clickrow,endplay,False,0.0,False,1646577238724,2022,3,6,8.05
5011,Android,FRIENDS,Marshmello,Speak Your Mind,trackdone,trackdone,True,0.0,True,1677879876,2023,3,3,202.62
3252,Android,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,Spider-Man: Into the Spider-Verse,fwdbtn,fwdbtn,True,1.0,True,1678421174,2023,3,10,2.03
9922,Android,Sé Tu Mismo,Porta,Sé Tu Mismo,fwdbtn,fwdbtn,True,0.0,False,1659398091404,2022,8,1,2.335
6038,Otros,My Brother,MisterWives,Connect The Dots,fwdbtn,fwdbtn,True,0.0,False,0,2022,10,8,9.896
6617,Android,Quiero Que Sepas (feat. MC Magic),C-Kan,Clasificacion C Vol.1,trackdone,trackdone,True,0.0,True,1678172255,2023,3,7,218.075
13585,Android,"The Spectacular Spider-Man Main Theme (From ""T...",Geek Music,"The Spectacular Spider-Man Main Theme (From ""T...",trackdone,trackdone,True,0.0,False,1638446818216,2021,12,2,150.879
5829,Android,Estados,Porta,No Hay Truco (Maqueta 2007),trackdone,trackdone,True,0.0,True,1678164282,2023,3,7,300.826
12264,Android,Crush,Tessa Violet,Crush,trackdone,trackdone,True,0.0,False,1678412346,2023,3,10,216.266


In [22]:
# Export the dataset to a CSV file (index=False leaves the row index out of the file)
df.to_csv('../Data/Spotify_data.csv', index=False)