In [None]:
import json
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import seaborn as sns
from scipy import stats

# Last.fm request configuration.
# The LASTFM_API_KEY / LASTFM_USER_AGENT environment variables take precedence;
# the literals are kept only as a fallback so existing runs behave as before.
# NOTE(review): the fallback key is committed in plain text -- rotate it and
# remove the literal once the environment variable is in place.
API_KEY = os.environ.get('LASTFM_API_KEY', '1a9ea75b5f92dfda381d817fc00e5458')
USER_AGENT = os.environ.get('LASTFM_USER_AGENT', '808e0a8bbaf9d30bf181edb60c742824')
LIMIT = 20  # Number of results requested per API call

# HTTP headers sent with every request.
headers = {
    'user-agent': USER_AGENT
}

##sacar top artists
def get_top_artists():
    """Fetch the global top-artists chart from the Last.fm API.

    Returns:
        pandas.DataFrame: the ``artists.artist`` list of the JSON response,
        flattened with ``pd.json_normalize``. An empty DataFrame is returned
        when the HTTP status is not 200.
    """
    params = {
        'method': 'chart.getTopArtists',
        'api_key': API_KEY,
        'format': 'json',
        'limit': LIMIT
    }
    response = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.json_normalize(data['artists']['artist'])
    else:
        print(f"Error en la solicitud de top artistas globales: {response.status_code}")
        # Without this fallback `df` was unbound here and `return df` raised
        # UnboundLocalError, hiding the real HTTP error.
        df = pd.DataFrame()
    time.sleep(1)  # pause before the next API call
    return df

##sacar top artistas por pais
def get_top_artists_by_country(country):
    """Fetch the top artists for one country from the Last.fm API.

    Args:
        country: country name, sent as the ``country`` request parameter.

    Returns:
        pandas.DataFrame: the ``topartists.artist`` list of the JSON response,
        flattened with ``pd.json_normalize``. An empty DataFrame is returned
        when the HTTP status is not 200.
    """
    params = {
        'method': 'geo.getTopArtists',
        'country': country,
        'api_key': API_KEY,
        'format': 'json',
        'limit': LIMIT
    }
    response = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.json_normalize(data['topartists']['artist'])
    else:
        print(f"Error en la solicitud de top artistas por país: {response.status_code}")
        # Without this fallback `df` was unbound here and `return df` raised
        # UnboundLocalError, hiding the real HTTP error.
        df = pd.DataFrame()
    time.sleep(1)  # pause before the next API call
    return df

##sacar top canciones por pais
def get_top_tracks_by_country(country):
    """Fetch the top tracks for one country from the Last.fm API.

    Args:
        country: country name, sent as the ``country`` request parameter.

    Returns:
        pandas.DataFrame: the ``tracks.track`` list of the JSON response,
        flattened with ``pd.json_normalize``. An empty DataFrame is returned
        when the HTTP status is not 200.
    """
    params = {
        'method': 'geo.getTopTracks',
        'country': country,
        'api_key': API_KEY,
        'format': 'json',
        'limit': LIMIT
    }
    response = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.json_normalize(data['tracks']['track'])
    else:
        print(f"Error en la solicitud de top canciones por país: {response.status_code}")
        # Without this fallback `df` was unbound here and `return df` raised
        # UnboundLocalError, hiding the real HTTP error.
        df = pd.DataFrame()
    time.sleep(1)  # pause before the next API call
    return df


## get the top tracks of a given artist
def get_top_tracks_by_artist(artist):
    """Fetch the top tracks of one artist from the Last.fm API.

    Args:
        artist: artist name, sent as the ``artist`` request parameter.

    Returns:
        pandas.DataFrame: the ``toptracks.track`` list of the JSON response,
        flattened with ``pd.json_normalize``. An empty DataFrame is returned
        when the HTTP status is not 200.
    """
    params = {
        'method': 'artist.getTopTracks',
        'artist': artist,
        'api_key': API_KEY,
        'format': 'json',
        'limit': LIMIT
    }
    response = requests.get('https://ws.audioscrobbler.com/2.0/', headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.json_normalize(data['toptracks']['track'])
    else:
        # The request is artist.getTopTracks, so the message now says tracks, not albums.
        print(f"Error en la solicitud de top canciones del artista: {response.status_code}")
        # Without this fallback `df` was unbound here and `return df` raised
        # UnboundLocalError, hiding the real HTTP error.
        df = pd.DataFrame()
    time.sleep(1)  # pause before the next API call
    return df
    
##limpia y ordena dataframe
def limpiar_y_ordenar(df, columnas_a_eliminar=None):
    """Drop unwanted columns and sort rows by listener count, highest first.

    Args:
        df: DataFrame to clean. It is not modified; a new frame is returned.
        columnas_a_eliminar: optional list of column names to drop.

    Returns:
        pandas.DataFrame: the frame without the dropped columns, sorted by
        ``listeners`` in descending numeric order when that column exists,
        with a fresh 0..n-1 index. Column values and dtypes are unchanged.
    """
    if columnas_a_eliminar:
        df = df.drop(columns=columnas_a_eliminar)
    if 'listeners' in df.columns:
        # The counts may still be strings when this runs; compare them as
        # numbers so that '9' does not outrank '100'. The key only affects the
        # ordering, not the stored values.
        df = df.sort_values(
            by='listeners',
            ascending=False,
            key=lambda columna: pd.to_numeric(columna, errors='coerce'),
        )
    df = df.reset_index(drop=True)
    return df

In [None]:
# Download the global top-artists chart and display the raw frame.
df_top_artist = get_top_artists()
df_top_artist

In [None]:
# Drop the listed columns and sort the artists by listeners (descending).
df_top_artist_filtered = limpiar_y_ordenar(df_top_artist, ["mbid", "url", "streamable", "image"])
df_top_artist_filtered

In [None]:
# Inspect column dtypes and non-null counts before converting the count columns.
df_top_artist_filtered.info()

In [None]:
# Cast the two count columns to float, one column at a time, then re-check the dtypes.
for _columna in ("playcount", "listeners"):
    df_top_artist_filtered[_columna] = df_top_artist_filtered[_columna].astype(float)
df_top_artist_filtered.info()

In [None]:
# Mean play count across the top artists.
media_playcount = df_top_artist_filtered["playcount"].mean()
media_playcount

In [None]:
# Mean listener count across the top artists.
media_listeners = df_top_artist_filtered["listeners"].mean()
media_listeners

In [None]:
# Median play count across the top artists.
mediana_playcount = df_top_artist_filtered["playcount"].median()
mediana_playcount

In [None]:
# Median listener count across the top artists.
mediana_listeners = df_top_artist_filtered["listeners"].median()
mediana_listeners

In [None]:
# Play count per artist, with the median (red) and the mean (green) drawn as
# dashed horizontal reference lines.
sns.barplot(data=df_top_artist_filtered, x="name", y="playcount")
plt.xticks(rotation=90)
plt.axhline(
    y=mediana_playcount,
    linestyle='--',
    color='red',
    label=f'Mediana: {mediana_playcount}',
)
plt.axhline(
    y=media_playcount,
    linestyle='--',
    color='green',
    label=f'Media: {media_playcount}',
)
plt.legend()
plt.show()

In [None]:
# Listener count per artist, with the median (red) and the mean (green) drawn
# as dashed horizontal reference lines.
sns.barplot(data=df_top_artist_filtered, x="name", y="listeners")
plt.xticks(rotation=90)
plt.axhline(
    y=mediana_listeners,
    linestyle='--',
    color='red',
    label=f'Mediana: {mediana_listeners}',
)
plt.axhline(
    y=media_listeners,
    linestyle='--',
    color='green',
    label=f'Media: {media_listeners}',
)
plt.legend()
plt.show()

In [None]:
# Bar chart of play count against listener count; x tick labels are rotated
# so the listener values stay readable.
sns.barplot(
    data=df_top_artist_filtered,
    y="playcount",
    x="listeners",
)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pearson correlation between listeners and play count of the top artists;
# element [0] of the pearsonr result is the correlation coefficient.
listeners= df_top_artist_filtered["listeners"]
playcount= df_top_artist_filtered["playcount"]
stats.pearsonr(listeners, playcount)[0]

In [None]:
# Line plot of play count (y) against listener count (x) for the top artists.
sns.set(style='whitegrid')
sns.lineplot(data=df_top_artist_filtered, x='listeners', y='playcount', marker='o')

# The axis labels were swapped; they now match the plotted columns.
plt.xlabel('Listeners')
plt.ylabel('Playcount')
plt.tight_layout()
plt.show()
# Author's reading of the chart: an artist's number of listeners does not
# directly determine its number of plays.

In [None]:
# Top tracks per country (LIMIT results each), requested in the listed order.
(
    df_tracks_españa,
    df_tracks_france,
    df_tracks_germany,
    df_tracks_uk,
    df_tracks_usa,
    df_tracks_rusia,
) = [
    get_top_tracks_by_country(nombre_pais)
    for nombre_pais in (
        "Spain",
        "France",
        "Germany",
        "United Kingdom",
        "United States",
        "Russian Federation",
    )
]

df_tracks_españa

In [None]:
# Clean every per-country frame with the same list of columns to drop; each
# call receives its own fresh list.
(
    df_tracks_españa_filtered,
    df_tracks_france_filtered,
    df_tracks_germany_filtered,
    df_tracks_uk_filtered,
    df_tracks_usa_filtered,
    df_tracks_rusia_filtered,
) = [
    limpiar_y_ordenar(
        df_pais,
        ["mbid", "url", "image", "streamable.#text", "streamable.fulltrack", "artist.mbid", "artist.url", "@attr.rank"],
    )
    for df_pais in (
        df_tracks_españa,
        df_tracks_france,
        df_tracks_germany,
        df_tracks_uk,
        df_tracks_usa,
        df_tracks_rusia,
    )
]

df_tracks_españa_filtered

In [None]:
# Cast duration and listeners to float in each per-country frame (in place).
for _df_pais in (
    df_tracks_españa_filtered,
    df_tracks_france_filtered,
    df_tracks_germany_filtered,
    df_tracks_uk_filtered,
    df_tracks_usa_filtered,
    df_tracks_rusia_filtered,
):
    _df_pais[["duration", "listeners"]] = _df_pais[["duration", "listeners"]].astype(float)

df_tracks_españa_filtered.info()

In [None]:
# Tag every row with its country so the frames can be told apart after concatenation.
for _df_pais, _etiqueta in (
    (df_tracks_españa_filtered, 'España'),
    (df_tracks_france_filtered, "France"),
    (df_tracks_germany_filtered, "Germany"),
    (df_tracks_uk_filtered, "United Kingdom"),
    (df_tracks_usa_filtered, "United States"),
    (df_tracks_rusia_filtered, "Russian Federation"),
):
    _df_pais['pais'] = _etiqueta

df_tracks_españa_filtered

In [None]:
# Stack the six per-country frames row-wise; the original row indices are kept.
df_tracks_concat = pd.concat(
    [
        df_tracks_españa_filtered,
        df_tracks_france_filtered,
        df_tracks_germany_filtered,
        df_tracks_uk_filtered,
        df_tracks_usa_filtered,
        df_tracks_rusia_filtered,
    ],
    axis=0,
)
df_tracks_concat

In [None]:
# Order the combined tracks by listeners, highest first.
df_tracks_concat_sorted = df_tracks_concat.sort_values("listeners", ascending = False)
df_tracks_concat_sorted

In [None]:
# Mean listeners of the top tracks, computed per country.
(
    media_españa,
    media_france,
    media_germany,
    media_uk,
    media_usa,
    media_rusia,
) = [
    df_pais["listeners"].mean()
    for df_pais in (
        df_tracks_españa_filtered,
        df_tracks_france_filtered,
        df_tracks_germany_filtered,
        df_tracks_uk_filtered,
        df_tracks_usa_filtered,
        df_tracks_rusia_filtered,
    )
]

print(f"La media de oyentes en España es {media_españa}")
print(f"La media de oyentes en France es {media_france}")
print(f"La media de oyentes en Germany es {media_germany}")
print(f"La media de oyentes en United Kingdom es {media_uk}")
print(f"La media de oyentes en United States es {media_usa}")
print(f"La media de oyentes en Rusia es {media_rusia}")

In [None]:
# Median listeners of the top tracks, computed per country.
(
    mediana_españa,
    mediana_france,
    mediana_germany,
    mediana_uk,
    mediana_usa,
    mediana_rusia,
) = [
    df_pais["listeners"].median()
    for df_pais in (
        df_tracks_españa_filtered,
        df_tracks_france_filtered,
        df_tracks_germany_filtered,
        df_tracks_uk_filtered,
        df_tracks_usa_filtered,
        df_tracks_rusia_filtered,
    )
]

print(f"La mediana de oyentes en España es {mediana_españa}")
print(f"La mediana de oyentes en France es {mediana_france}")
print(f"La mediana de oyentes en Germany es {mediana_germany}")
print(f"La mediana de oyentes en United Kingdom es {mediana_uk}")
print(f"La mediana de oyentes en United States es {mediana_usa}")
print(f"La mediana de oyentes en Rusia es {mediana_rusia}")

In [None]:
# Number of rows per artist in the combined per-country track table.
canciones_por_artista = df_tracks_concat_sorted['artist.name'].value_counts()
canciones_por_artista 

In [None]:
# Bar chart of the per-artist row counts; small rotated labels keep the artist names legible.
sns.barplot(x = canciones_por_artista.index, y = canciones_por_artista.values)
plt.xticks(rotation=90, fontsize=8)
plt.show()

In [None]:
# One horizontal bar chart of listeners per track for each country, laid out
# on a 2x3 grid (room for six countries).
paises = df_tracks_concat_sorted['pais'].unique()

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

# `paises` holds the array and `pais` the current country, so the loop no
# longer overwrites the collection it is iterating over.
for i, pais in enumerate(paises):
    datos_pais = df_tracks_concat_sorted[df_tracks_concat_sorted['pais'] == pais]
    sns.barplot(x='listeners', y='name', data=datos_pais, ax=axes[i])
    axes[i].set_title(f'Oyentes de {pais}')
    axes[i].set_xlabel('Oyentes')
    axes[i].set_ylabel('Canción')

plt.tight_layout()
plt.show()

In [None]:
# Replace non-positive durations (presumably placeholders for an unknown
# length -- confirm) with the mean duration of the whole table.
# The mean is computed once here instead of once per row inside the lambda.
# NOTE(review): as before, the mean still includes those non-positive values;
# consider averaging only the positive durations.
duracion_media = df_tracks_concat_sorted["duration"].mean()
df_duration_media = df_tracks_concat_sorted["duration"].map(
    lambda x: x if x > 0 else duracion_media
)

# Joint density of (imputed) duration versus listeners.
sns.kdeplot(x = df_duration_media,
            y = df_tracks_concat_sorted["listeners"])

plt.xticks(rotation=90)
plt.show()

In [None]:
# Per country: how many of its top tracks belong to each artist.
(
    artista_pais_esp,
    artista_pais_fr,
    artista_pais_ger,
    artista_pais_uk,
    artista_pais_usa,
    artista_pais_rusia,
) = [
    df_pais['artist.name'].value_counts()
    for df_pais in (
        df_tracks_españa_filtered,
        df_tracks_france_filtered,
        df_tracks_germany_filtered,
        df_tracks_uk_filtered,
        df_tracks_usa_filtered,
        df_tracks_rusia_filtered,
    )
]

print(f"España \n {artista_pais_esp}")
print(f"Francia \n {artista_pais_fr}")
print(f"Germany \n {artista_pais_ger}")
print(f"United Kingdom \n {artista_pais_uk}")
print(f"United States \n {artista_pais_usa}")
print(f"Rusia \n {artista_pais_rusia}")

In [None]:
# Histogram (with KDE overlay) of listeners across all per-country tracks.
sns.histplot(df_tracks_concat_sorted["listeners"], color = "green", kde = True)
plt.show()

# Skewness of the same column, displayed as the cell output.
stats.skew(df_tracks_concat_sorted["listeners"])
# Author's note: negative skew (tail to the left).
# NOTE(review): confirm against the value shown above -- a positive result would mean a right tail instead.

In [None]:
## Tracks per artist: one request per artist, in the listed order.
(
    df_lana,
    df_ic3peak,
    df_billie,
    df_sia,
    df_estopa,
    df_badbunny,
) = [
    get_top_tracks_by_artist(nombre_artista)
    for nombre_artista in (
        "Lana del Rey",
        "Ic3peak",
        "Billie Eilish",
        "Sia",
        "Estopa",
        "Bad Bunny",
    )
]

In [None]:
# Clean each artist frame. Every frame drops the same five base columns plus
# an artist-specific tail of id columns.
# NOTE(review): the tails differ per artist, presumably to match the columns
# each response actually contains -- confirm.
(
    df_lana_filtered,
    df_ic3peak_filtered,
    df_billie_filtered,
    df_sia_filtered,
    df_estopa_filtered,
    df_badbunny_filtered,
) = [
    limpiar_y_ordenar(
        df_artista,
        ["url", "streamable", "image", "artist.url", "@attr.rank"] + columnas_extra,
    )
    for df_artista, columnas_extra in (
        (df_lana, ["mbid", "artist.mbid"]),
        (df_ic3peak, ["artist.mbid"]),
        (df_billie, []),
        (df_sia, ["mbid", "artist.mbid"]),
        (df_estopa, ["mbid", "artist.mbid"]),
        (df_badbunny, []),
    )
]

In [None]:
# Cast playcount and listeners to float in each artist frame (in place).
for _df_artista in (
    df_lana_filtered,
    df_ic3peak_filtered,
    df_billie_filtered,
    df_sia_filtered,
    df_estopa_filtered,
    df_badbunny_filtered,
):
    _df_artista[["playcount", "listeners"]] = _df_artista[["playcount", "listeners"]].astype(float)

In [None]:
# Combine the six artist frames into one table with a fresh index, then rank
# every track by listeners (highest first).
df_concatenated = pd.concat(
    [
        df_lana_filtered,
        df_ic3peak_filtered,
        df_billie_filtered,
        df_sia_filtered,
        df_estopa_filtered,
        df_badbunny_filtered,
    ],
    ignore_index=True,
)
df_top_canciones = (
    df_concatenated
    .sort_values(by="listeners", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Total plays per artist over the fetched tracks, largest first.
reproducciones_por_artista = df_top_canciones.groupby('artist.name')['playcount'].sum().sort_values(ascending=False)

# Total listeners per artist over the fetched tracks, largest first.
oyentes_por_artista = df_top_canciones.groupby('artist.name')['listeners'].sum().sort_values(ascending=False)

In [None]:
## Horizontal bar chart of total plays per artist
sns.barplot(
    y=reproducciones_por_artista.index,
    x=reproducciones_por_artista.values,
)
plt.title('Reproducciones por Artista')
plt.ylabel('Artista')
plt.xlabel('Reproducciones')
plt.show()

## Horizontal bar chart of total listeners per artist
sns.barplot(
    y=oyentes_por_artista.index,
    x=oyentes_por_artista.values,
)
plt.title('Oyentes por Artista')
plt.ylabel('Artista')
plt.xlabel('Oyentes')
plt.show()

In [None]:
# The ten tracks with the highest play count across all fetched artists.
top_canciones = df_top_canciones.sort_values(by='playcount', ascending=False).head(10)

# Horizontal bars coloured by artist.
sns.barplot(x='playcount', y='name', data=top_canciones, hue='artist.name')
plt.title('Top 10 Canciones por Reproducciones')
plt.xlabel('Reproducciones')
plt.ylabel('Canción')
plt.legend(title='Artista')
plt.show()


In [None]:
## Pearson correlation between listeners and play count at track level;
## element [0] of the pearsonr result is the correlation coefficient.
listeners= df_top_canciones["listeners"]
playcount= df_top_canciones["playcount"]
stats.pearsonr(listeners, playcount)[0]

In [None]:
## Relationship between listeners and plays: one point per track, coloured by artist.
sns.scatterplot(data=df_top_canciones, x='listeners', y='playcount', hue='artist.name')
plt.title('Oyentes vs Reproducciones por Canción')
plt.xlabel('Oyentes')
plt.ylabel('Reproducciones')
plt.legend(title='Artista')
plt.show()

In [None]:
# Plays per listener for every track (element-wise playcount / listeners).
df_top_canciones['Reproducciones_por_Oyente'] = df_top_canciones['playcount'].div(
    df_top_canciones['listeners']
)

# Scatter of that ratio against total plays, coloured by artist.
sns.scatterplot(
    data=df_top_canciones,
    y='playcount',
    x='Reproducciones_por_Oyente',
    hue='artist.name',
)
plt.title('Reproducciones por Oyente vs Reproducciones Totales')
plt.ylabel('Reproducciones Totales')
plt.xlabel('Reproducciones por Oyente')
plt.legend(title='Artista')
plt.show()

In [None]:
mediana = df_top_canciones['Reproducciones_por_Oyente'].median()

## Interquartile range of the plays-per-listener ratio
Q1 = df_top_canciones['Reproducciones_por_Oyente'].quantile(0.25)
Q3 = df_top_canciones['Reproducciones_por_Oyente'].quantile(0.75)
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR

## Flag column: 'Outlier' when the ratio lies outside the fences, else 'Normal'
df_top_canciones['Es_Outlier'] = np.where(
    (df_top_canciones['Reproducciones_por_Oyente'] < limite_inferior)
    | (df_top_canciones['Reproducciones_por_Oyente'] > limite_superior),
    'Outlier',
    'Normal',
)

sns.histplot(
    data=df_top_canciones,
    x='Reproducciones_por_Oyente',
    hue='Es_Outlier',
    multiple='stack',
    bins=30,
    palette={'Normal': 'skyblue', 'Outlier': 'orange'},
)

## Median marker
plt.axvline(x=mediana, linestyle='--', color='red', label=f'Mediana: {mediana:.2f}')

# Titles and labels
plt.title('Distribución de Reproducciones por Oyente con Outliers')
plt.xlabel('Reproducciones por Oyente')
plt.ylabel('Frecuencia')
plt.legend()
plt.show()

In [None]:
# One horizontal bar chart of plays per track for each artist, on a 2x3 grid.
artistas = df_top_canciones['artist.name'].unique()

fig, axes = plt.subplots(figsize=(18, 10), nrows=2, ncols=3)
axes = axes.flatten()

for i, artista in enumerate(artistas):
    # Rows belonging to the current artist only.
    datos_artista = df_top_canciones.loc[df_top_canciones['artist.name'] == artista]
    sns.barplot(data=datos_artista, x='playcount', y='name', ax=axes[i])
    axes[i].set(
        title=f'Reproducciones de {artista}',
        xlabel='Reproducciones',
        ylabel='Canción',
    )

plt.tight_layout()
plt.show()

In [None]:
## Distribution of plays per track (20 bins, KDE overlay)
sns.histplot(data=df_top_canciones, x='playcount', kde=True, bins=20)
plt.title('Distribución de Reproducciones')
plt.ylabel('Frecuencia')
plt.xlabel('Reproducciones')
plt.show()

## Distribution of listeners per track (20 bins, KDE overlay)
sns.histplot(data=df_top_canciones, x='listeners', kde=True, bins=20)
plt.title('Distribución de Oyentes')
plt.ylabel('Frecuencia')
plt.xlabel('Oyentes')
plt.show()

In [None]:
# log1p-transformed copies of the count columns (log(1 + x)).
df_top_canciones['Reproducciones_Log'] = np.log1p(df_top_canciones['playcount'])
df_top_canciones['Oyentes_Log'] = np.log1p(df_top_canciones['listeners'])

## Distribution of log-transformed plays.
## Title and x label now state the log scale; they previously duplicated the
## labels of the untransformed histograms.
sns.histplot(df_top_canciones['Reproducciones_Log'], bins=20, kde=True)
plt.title('Distribución de Reproducciones (escala logarítmica)')
plt.xlabel('log(1 + Reproducciones)')
plt.ylabel('Frecuencia')
plt.show()

## Distribution of log-transformed listeners, labelled the same way.
sns.histplot(df_top_canciones['Oyentes_Log'], bins=20, kde=True)
plt.title('Distribución de Oyentes (escala logarítmica)')
plt.xlabel('log(1 + Oyentes)')
plt.ylabel('Frecuencia')
plt.show()