In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw Spotify 2024 dataset; the file is decoded as Latin-1.
raw_csv_path = '../data/most-spotify-2024.csv'
data = pd.read_csv(raw_csv_path, encoding='latin1')

In [None]:
# Work on a copy so the frame loaded into `data` is left untouched.
data2 = data.copy()

# Rows that repeat an earlier ('Track', 'Artist') pair count as duplicates.
duplicate_mask = data2.duplicated(subset=['Track', 'Artist'])
print("\nCantidad de datos duplicados:")
print(duplicate_mask.sum())

# Keep only the first occurrence of each ('Track', 'Artist') pair.
data2 = data2.drop_duplicates(subset=['Track','Artist'])

In [None]:
# Missing values per column after de-duplication, largest count first.
print("\nCantidad de valores faltantes por columna :")
missing_per_column = data2.isna().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Columns considered unnecessary for the cleaned dataset.
cols_drop = [
    'TIDAL Popularity',
    'Soundcloud Streams',
    'SiriusXM Spins',
    'Pandora Track Stations',
    'Amazon Playlist Count',
    'Deezer Playlist Count',
    'Deezer Playlist Reach',
    'YouTube Playlist Reach'
]

# Drop all listed columns in a single call instead of looping with
# inplace=True. errors="ignore" skips any name that is not present in the
# frame, so the cell still succeeds when re-run or when a column is absent.
data2 = data2.drop(columns=cols_drop, errors="ignore")

In [None]:
# Parse 'Release Date' into datetimes; values that cannot be parsed become NaT.
release_dates = pd.to_datetime(data2["Release Date"], errors="coerce")
data2["Release Date"] = release_dates

# Derive the year and month of release from the parsed dates.
data2["Release Year"] = release_dates.dt.year
data2["Release Month"] = release_dates.dt.month

# Preview the parsed date next to the derived columns.
data2[["Track", "Release Date", "Release Year", "Release Month"]].head()

In [None]:
# Columns that should hold numbers but may contain comma-formatted text.
cols_to_fix = [
    'Spotify Streams',
    'Spotify Playlist Count',
    'Spotify Playlist Reach',
    'YouTube Views',
    'YouTube Likes',
    'TikTok Posts',
    'TikTok Likes',
    'TikTok Views',
    'AirPlay Spins',
    'Pandora Streams',
    'Shazam Counts',
    'All Time Rank',
]

# For each listed column that exists in the frame: remove the commas and parse
# the result as a number. Anything that still fails to parse becomes NaN.
columns_present = [name for name in cols_to_fix if name in data2.columns]
for column in columns_present:
    without_commas = data2[column].astype(str).str.replace(",", "", regex=False)
    data2[column] = pd.to_numeric(without_commas, errors='coerce')

# Replace missing values in the numeric columns with each column's median.
# NOTE(review): num_cols is every numeric column in the frame, not only the
# ones listed in cols_to_fix (e.g. 'Release Year' / 'Release Month' are
# included when present) -- confirm that imputing all of them is intended.
num_cols = data2.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    column_medians = data2[num_cols].median()
    data2[num_cols] = data2[num_cols].fillna(column_medians)

# Report how many NaN remain in each numeric column after the fill.
print("\nCantidad de valores NaN por columna numérica:")
remaining_nan = data2[num_cols].isna().sum()
print(remaining_nan)

In [None]:
# Disabled alternative: fill any NaN in the numeric columns with 0.
# Per the original author's note it is not needed, because the NaN in those
# columns had already been replaced with the median.
# The snippet is kept inside a bare string literal, so it is never executed.
"""
num_cols = data2.select_dtypes(include=[np.number]).columns
data2[num_cols] = data2[num_cols].fillna(0)
"""

In [None]:
# Save the cleaned frame as CSV, leaving the row index out of the file.
clean_csv_path = "../data/data_limpia.csv"
data2.to_csv(clean_csv_path, index=False)