In [None]:
import os
import sys
from pathlib import Path

# Project root folder. A hard-coded absolute path only works on one machine,
# so a non-empty SPOTIFY_ROCK_DIR environment variable takes precedence; the
# original location is kept as the fallback so existing setups still work.
BASE_DIR = Path(os.environ.get("SPOTIFY_ROCK_DIR") or r"D:\Python\spotify_rock")
SRC_DIR = BASE_DIR / "src"

# Put src/ at the front of sys.path so the project-local packages can be
# imported; the membership test keeps re-runs of this cell from inserting
# the same entry twice.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

import pandas as pd
from data.loading import load_spotify_raw

# df is whatever load_spotify_raw() returns; the rest of the notebook uses it
# as a pandas DataFrame. Print its shape and preview the first rows.
df = load_spotify_raw()
print(df.shape)
df.head()

In [None]:
# Column dtypes and non-null counts first ...
df.info()

# ... then the number of missing values per column, largest first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Show the column labels of df (handy for checking the exact names used below).
df.columns

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting look: matplotlib's "ggplot" style sheet first, then the
# seaborn default theme. The seaborn theme is applied second, so wherever both
# touch the same rcParams the seaborn values win.
plt.style.use("ggplot")
sns.set_theme()

In [None]:
features = ["Danceability", "Energy", "Valence", "Tempo", "Duration_ms"]

# One histogram (30 bins, KDE overlay) per audio feature on a 2x3 grid;
# only five of the six slots are used.
plt.figure(figsize=(14, 8))
for slot, feature in enumerate(features, start=1):
    ax = plt.subplot(2, 3, slot)
    sns.histplot(df[feature], kde=True, bins=30, ax=ax)
    ax.set_title(feature)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the "Stream" column on a logarithmic x axis (the column
# contains very large values).
#
# The bins are computed in log space via log_scale=True. Building 50 linear
# bins and only afterwards switching the axis to log squeezes almost all bars
# into the right edge and stretches the first one, which misrepresents the
# distribution.
#
# Missing and non-positive values have no logarithm, so they are filtered out
# before plotting (NaN > 0 evaluates to False).
positive_streams = df.loc[df["Stream"] > 0, "Stream"]

plt.figure(figsize=(14, 4))
sns.histplot(positive_streams, bins=50, log_scale=True)
plt.title("Distribución de Streams (Spotify)")
plt.show()

In [None]:
# Top 20 artists by number of tracks in the dataset
tracks_per_artist = df["Artist"].value_counts()
tracks_per_artist.head(20)

In [None]:
# Sample tracks for one specific artist
sample_columns = ["Artist", "Track", "Danceability", "Energy", "Valence", "Tempo"]
is_gorillaz = df["Artist"] == "Gorillaz"
df.loc[is_gorillaz, sample_columns].head(10)

In [None]:
# Hand-picked artists treated as "rock" for this exploration.
rock_artists = [
    "Red Hot Chili Peppers",
    "Metallica",
    "Linkin Park",
    "Radiohead",
    "AC/DC",
    "Gorillaz",
]

# True where the row's artist is in the list above; computed once and reused
# for both halves of the split.
rock_mask = df["Artist"].isin(rock_artists)

# Independent copies, so later edits to either subset cannot touch df.
df_rock = df[rock_mask].copy()
df_no_rock = df[~rock_mask].copy()

(df_rock.shape, df_no_rock.shape)

In [None]:
audio_cols = ["Danceability", "Energy", "Valence", "Tempo", "Duration_ms"]

# describe() for the audio columns of each group, transposed so that every
# feature is a row and every statistic a column.
rock_stats = df_rock[audio_cols].describe().transpose()
no_rock_stats = df_no_rock[audio_cols].describe().transpose()

(rock_stats, no_rock_stats)

In [None]:
# Side-by-side boxplots of Energy and Tempo, grouped by whether the artist is
# in rock_artists. The boolean grouping yields the categories False/True at
# x positions 0/1, which are relabelled "No rock"/"Rock".
artist_is_rock = df["Artist"].isin(rock_artists)
panels = [
    ("Energy", "Energy: Rock vs No Rock"),
    ("Tempo", "Tempo: Rock vs No Rock"),
]

plt.figure(figsize=(10, 4))
for slot, (feature, heading) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, slot)
    sns.boxplot(data=df, x=artist_is_rock, y=feature, ax=ax)
    plt.xticks([0, 1], ["No rock", "Rock"])
    ax.set_title(heading)

plt.tight_layout()
plt.show()

In [None]:
audio_cols = ["Danceability", "Energy", "Valence", "Tempo", "Duration_ms"]

# Rock vs non-rock boxplot for every audio feature on a 2x3 grid (five of the
# six slots are used). The grouping mask is built once and shared by all panels.
artist_is_rock = df["Artist"].isin(rock_artists)

plt.figure(figsize=(14, 8))
for slot, feature in enumerate(audio_cols, start=1):
    ax = plt.subplot(2, 3, slot)
    sns.boxplot(data=df, x=artist_is_rock, y=feature, ax=ax)
    plt.xticks([0, 1], ["No rock", "Rock"])
    ax.set_title(feature)
plt.tight_layout()
plt.show()


In [None]:
from analysis.rock_features import (
    DEFAULT_ROCK_ARTISTS,
    split_rock_nonrock,
    audio_stats_by_group,
)
from viz.plots_rock import boxplots_audio_rock_vs_nonrock

# Rebind df_rock, df_no_rock, rock_stats and no_rock_stats (first built by
# hand in earlier cells) using the project helpers from analysis.rock_features.
# NOTE(review): no artist list is passed here, so split_rock_nonrock presumably
# falls back to DEFAULT_ROCK_ARTISTS — confirm in the helper's signature.
df_rock, df_no_rock = split_rock_nonrock(df)
rock_stats, no_rock_stats = audio_stats_by_group(df_rock, df_no_rock)

# Display both results as a tuple.
rock_stats, no_rock_stats

In [None]:
# Draw the rock vs non-rock comparison through the project plotting helper,
# passing the full frame and the default rock-artist list.
# NOTE(review): presumably reproduces the manual boxplot grid from an earlier
# cell — confirm against viz.plots_rock.
boxplots_audio_rock_vs_nonrock(
    df=df,
    rock_artists=DEFAULT_ROCK_ARTISTS,
)

In [16]:
from analysis.eda_general import (
    summarize_nulls,
    numeric_describe,
    top_artists_by_streams,
    top_tracks_by_streams,
)

In [None]:
# Missing-value summary from the project helper; show the first 15 rows.
# NOTE(review): assumes summarize_nulls returns a pandas object supporting
# .head() — ordering of the rows is defined by the helper, confirm there.
nulls = summarize_nulls(df)
nulls.head(15)

In [None]:
# Descriptive statistics from the project helper, displayed in full.
# NOTE(review): presumably restricted to numeric columns, judging by the
# helper's name — confirm in analysis.eda_general.
numeric_stats = numeric_describe(df)
numeric_stats

In [None]:
# Ask the project helper for 15 artists ranked by streams and display them.
# NOTE(review): how streams are aggregated per artist is decided inside
# top_artists_by_streams — confirm there.
top_artists = top_artists_by_streams(df, n=15)
top_artists

In [None]:
# Ask the project helper for 15 tracks ranked by streams and display them.
top_tracks = top_tracks_by_streams(df, n=15)
top_tracks

In [None]:
corr_cols = ["Danceability", "Energy", "Valence", "Tempo", "Duration_ms", "Stream"]

# Pairwise correlation matrix (DataFrame.corr default method) of the audio
# features and the stream count.
corr = df[corr_cols].corr()

# Annotated heatmap. The colour range is pinned to [-1, 1], the full range of
# a correlation coefficient, so the midpoint of the diverging "coolwarm"
# palette always corresponds to zero correlation. With the automatic range
# (min/max of the data) the neutral colour lands on an arbitrary value and
# equally strong positive and negative correlations get different intensities.
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", vmin=-1, vmax=1)
plt.title("Correlaciones entre audio features y Streams")
plt.show()

In [None]:
# Horizontal bar chart of the ten artists returned by the project helper.
# The plotting call reads the "Stream" and "Artist" columns of its result.
top_artists = top_artists_by_streams(df, n=10)

plt.figure(figsize=(10, 5))
# hue mirrors the y variable only to give every bar its own palette colour;
# the legend would just repeat the axis labels, so it is switched off.
bar_style = dict(hue="Artist", palette="viridis", legend=False)
ax = sns.barplot(data=top_artists, x="Stream", y="Artist", **bar_style)
ax.set_title("Top 10 artistas por Streams (Spotify)")
ax.set_xlabel("Streams totales")
ax.set_ylabel("Artista")
plt.show()

In [None]:
from analysis.rock_features import DEFAULT_ROCK_ARTISTS

# Add a boolean "is_rock" column: True where the artist is in
# DEFAULT_ROCK_ARTISTS. This modifies df in place (re-running the cell just
# overwrites the column with the same values); the scatter plots in the next
# cell read this column.
df["is_rock"] = df["Artist"].isin(DEFAULT_ROCK_ARTISTS)
# Number of tracks per flag value.
df["is_rock"].value_counts()

In [None]:
# Scatter plots of an audio feature against stream count (log y axis), with
# rock tracks highlighted in red over a faint grey background of all other
# tracks.
#
# The subsets are selected once with the boolean "is_rock" column itself
# (and its negation) rather than by comparing it to True/False, and the two
# formerly copy-pasted panels are produced by a single loop so that their
# styling cannot drift apart.
rock_tracks = df[df["is_rock"]]
other_tracks = df[~df["is_rock"]]

panels = [
    ("Energy", "Energy vs Streams (log)"),
    ("Valence", "Valence vs Streams (log)"),
]

plt.figure(figsize=(12, 5))
for slot, (feature, heading) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, slot)
    # Background layer first so the rock points are drawn on top of it.
    sns.scatterplot(
        data=other_tracks,
        x=feature,
        y="Stream",
        color="lightgray",
        alpha=0.3,
        s=15,
        label="No rock",
        ax=ax,
    )
    sns.scatterplot(
        data=rock_tracks,
        x=feature,
        y="Stream",
        color="red",
        alpha=0.8,
        s=40,
        label="Rock",
        ax=ax,
    )
    ax.set_yscale("log")
    ax.set_title(heading)
    ax.legend()

plt.tight_layout()
plt.show()