In [6]:
import os

import pandas as pd
import requests

# Function to get Spotify access token
def get_spotify_token(client_id, client_secret):
    """Request an access token from the Spotify Accounts service.

    Posts a ``client_credentials`` grant to the token endpoint using the
    supplied application credentials.

    Args:
        client_id: Client ID of the Spotify application.
        client_secret: Client secret of the Spotify application.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status
            (for example when the credentials are rejected).
        KeyError: If a successful response carries no ``access_token`` field.
    """
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_response = requests.post(auth_url, data={
        'grant_type': 'client_credentials',
        # Forward the caller's credentials; sending blanks can never authenticate.
        'client_id': client_id,
        'client_secret': client_secret,
    })
    # Surface the HTTP failure itself instead of a KeyError on the missing token.
    auth_response.raise_for_status()
    auth_data = auth_response.json()
    return auth_data['access_token']

# Function to search for a track and get its ID
def search_track(track_name, artist_name, token):
    """Search Spotify for a track and return the ID of the first hit.

    Args:
        track_name: Title of the track to look for.
        artist_name: Artist used as an ``artist:`` filter in the query. When it
            is not a non-blank string (for example ``None`` or a NaN read from
            a CSV), the filter is omitted and only the title is searched.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        The ``id`` of the first item under ``tracks.items`` in the response, or
        ``None`` when the response has no such item.
    """
    if isinstance(artist_name, str) and artist_name.strip():
        query = f"{track_name} artist:{artist_name}"
    else:
        # A missing artist would otherwise be rendered as "artist:nan" / "artist:None".
        query = f"{track_name}"

    # Let requests build the query string so characters such as '&', '#' or '+'
    # in a title are percent-encoded instead of corrupting the URL.
    response = requests.get(
        'https://api.spotify.com/v1/search',
        params={'q': query, 'type': 'track'},
        headers={'Authorization': f'Bearer {token}'},
    )
    json_data = response.json()
    try:
        return json_data['tracks']['items'][0]['id']
    except (KeyError, IndexError):
        # No 'tracks' payload (e.g. an error body) or an empty result list.
        return None

# Function to get track details
def get_track_details(track_id, token):
    """Fetch a track from Spotify and return the URL of its first album image.

    Args:
        track_id: Spotify ID of the track to look up.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        The ``url`` of the first entry in ``album.images`` of the response, or
        ``None`` when the response has no album, no images, or no URL.
    """
    url = f"https://api.spotify.com/v1/tracks/{track_id}"
    response = requests.get(url, headers={
        'Authorization': f'Bearer {token}'
    })
    json_data = response.json()
    try:
        return json_data['album']['images'][0]['url']
    except (KeyError, IndexError):
        # An error payload or an album without artwork should not abort a
        # caller that is looping over many tracks.
        return None

# Spotify API credentials are read from the environment so that secrets never
# have to be pasted into the notebook. Set SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before running this cell; a missing variable raises
# KeyError naming it.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

# Get Access Token
access_token = get_spotify_token(client_id, client_secret)

# Read the source dataset
df_spotify = pd.read_csv('Spotify_2024.csv', encoding='ISO-8859-1')

# Create the target column up front so it exists even if no track is matched.
if 'image_url' not in df_spotify.columns:
    df_spotify['image_url'] = None

# Look up every (track, artist) pair and store the album image URL on its row
for i, track_name, artist_name in zip(df_spotify.index, df_spotify['Track'], df_spotify['Artist']):
    track_id = search_track(track_name, artist_name, access_token)
    if track_id:
        image_url = get_track_details(track_id, access_token)
        df_spotify.at[i, 'image_url'] = image_url

# Save the updated DataFrame
df_spotify.to_csv('updated_file.csv', index=False)


In [16]:
# Read the updated CSV file back from disk
df_updated = pd.read_csv('updated_file.csv')

# Show the first rows as a quick check
print(df_updated.head())

                        Track                    Album Name          Artist  \
0         MILLION DOLLAR BABY  Million Dollar Baby - Single   Tommy Richman   
1                 Not Like Us                   Not Like Us  Kendrick Lamar   
2  i like the way you kiss me    I like the way you kiss me         Artemas   
3                     Flowers              Flowers - Single     Miley Cyrus   
4                     Houdini                       Houdini          Eminem   

  Release Date          ISRC All Time Rank  Track Score Spotify Streams  \
0    4/26/2024  QM24S2402528             1        725.4     390,470,936   
1     5/4/2024  USUG12400910             2        545.9     323,703,884   
2    3/19/2024  QZJ842400387             3        538.4     601,309,283   
3    1/12/2023  USSM12209777             4        444.9   2,031,280,633   
4    5/31/2024  USUG12403398             5        423.3     107,034,922   

  Spotify Playlist Count Spotify Playlist Reach  ...  Deezer Playlist Coun

In [25]:
# Count the missing values in each column of the DataFrame
print(df_updated.isnull().sum())

Track                            0
Album Name                       0
Artist                           5
Release Date                     0
ISRC                             0
All Time Rank                    0
Track Score                      0
Spotify Streams                113
Spotify Playlist Count          70
Spotify Playlist Reach          72
Spotify Popularity             804
YouTube Views                  308
YouTube Likes                  315
TikTok Posts                  1173
TikTok Likes                   980
TikTok Views                   981
YouTube Playlist Reach        1009
Apple Music Playlist Count     561
AirPlay Spins                  498
SiriusXM Spins                2123
Deezer Playlist Count          921
Deezer Playlist Reach          928
Amazon Playlist Count         1055
Pandora Streams               1106
Pandora Track Stations        1268
Soundcloud Streams            3333
Shazam Counts                  577
TIDAL Popularity              4600
Explicit Track      

In [27]:
# Load the enriched dataset. 'updated_file.csv' is produced by DataFrame.to_csv
# without an explicit encoding (UTF-8 by default), so it is decoded as UTF-8
# here; decoding it as ISO-8859-1 would garble every non-ASCII character.
df = pd.read_csv('updated_file.csv', encoding='utf-8')

# Data-cleaning helper
def clean_data(df):
    """Clean the text columns and fill or drop missing values, in place.

    Steps, in order:
      1. Strip non-ASCII characters and surrounding whitespace from the string
         values of 'Track', 'Album Name' and 'Artist'.
      2. Replace missing 'Artist' values with 'Desconocido' and missing
         'Spotify Streams' values with 0. No numeric conversion is applied to
         'Spotify Streams'; existing values keep their original representation.
      3. Convert 'Track Score' to numeric, turning unparseable values into NaN.
      4. Drop the rows whose 'Track' is missing.

    Args:
        df: DataFrame containing at least the columns named above. It is
            modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def clean_text(text):
        """Return ``text`` without non-ASCII characters or outer whitespace.

        Non-string values (such as NaN) are returned unchanged.
        """
        if isinstance(text, str):
            return ''.join(c for c in text if ord(c) < 128).strip()
        return text

    # Apply the text clean-up to the relevant columns
    for column in ('Track', 'Album Name', 'Artist'):
        df[column] = df[column].apply(clean_text)

    # Replace missing values. The former "NaN in 'Spotify Streams'" report that
    # followed this step could never trigger once the column had been filled,
    # so it has been removed.
    df['Artist'] = df['Artist'].fillna('Desconocido')
    df['Spotify Streams'] = df['Spotify Streams'].fillna(0)

    # Convert to numeric, coercing unparseable values to NaN
    df['Track Score'] = pd.to_numeric(df['Track Score'], errors='coerce')

    # Only rows with a missing 'Track' are removed. This stays in place because
    # callers keep using the frame they passed in.
    df.dropna(subset=['Track'], inplace=True)

    return df

# Clean the DataFrame
df_cleaned = clean_data(df)

# Show the shape of the cleaned DataFrame
print(df_cleaned.shape)

# Count the missing values left in each column of the cleaned DataFrame
print(df_cleaned.isnull().sum())

(4600, 30)
Track                            0
Album Name                       0
Artist                           0
Release Date                     0
ISRC                             0
All Time Rank                    0
Track Score                      0
Spotify Streams                  0
Spotify Playlist Count          70
Spotify Playlist Reach          72
Spotify Popularity             804
YouTube Views                  308
YouTube Likes                  315
TikTok Posts                  1173
TikTok Likes                   980
TikTok Views                   981
YouTube Playlist Reach        1009
Apple Music Playlist Count     561
AirPlay Spins                  498
SiriusXM Spins                2123
Deezer Playlist Count          921
Deezer Playlist Reach          928
Amazon Playlist Count         1055
Pandora Streams               1106
Pandora Track Stations        1268
Soundcloud Streams            3333
Shazam Counts                  577
TIDAL Popularity              4600
Explicit 

In [28]:
# Columns treated as metrics: their missing entries are replaced with 0
numeric_columns = [
    'Spotify Streams',
    'Spotify Playlist Count',
    'Spotify Playlist Reach',
    'Spotify Popularity',
    'YouTube Views',
    'YouTube Likes',
    'TikTok Posts',
    'TikTok Likes',
    'TikTok Views',
    'YouTube Playlist Reach',
    'Apple Music Playlist Count',
    'AirPlay Spins',
    'SiriusXM Spins',
    'Deezer Playlist Count',
    'Deezer Playlist Reach',
    'Amazon Playlist Count',
    'Pandora Streams',
    'Pandora Track Stations',
    'Soundcloud Streams',
    'Shazam Counts',
    'TIDAL Popularity',
]

# Descriptive columns: their missing entries are replaced with 'No Data'
text_columns = ['Track', 'Album Name', 'Artist', 'ISRC']

# Parse 'Release Date' as datetimes; values that cannot be parsed become NaT
release_dates = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Date'] = release_dates

# Fill the gaps of each column group with its placeholder value
filled_numeric = df[numeric_columns].fillna(0)
df[numeric_columns] = filled_numeric

filled_text = df[text_columns].fillna('No Data')
df[text_columns] = filled_text

# Report the remaining missing values per column
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

# Report the shape of the DataFrame after filling
print(df.shape)


Track                           0
Album Name                      0
Artist                          0
Release Date                    0
ISRC                            0
All Time Rank                   0
Track Score                     0
Spotify Streams                 0
Spotify Playlist Count          0
Spotify Playlist Reach          0
Spotify Popularity              0
YouTube Views                   0
YouTube Likes                   0
TikTok Posts                    0
TikTok Likes                    0
TikTok Views                    0
YouTube Playlist Reach          0
Apple Music Playlist Count      0
AirPlay Spins                   0
SiriusXM Spins                  0
Deezer Playlist Count           0
Deezer Playlist Reach           0
Amazon Playlist Count           0
Pandora Streams                 0
Pandora Track Stations          0
Soundcloud Streams              0
Shazam Counts                   0
TIDAL Popularity                0
Explicit Track                  0
image_url     

In [29]:
# Keep only the rows that have a value in 'image_url'
df_cleaned = df.loc[df['image_url'].notna()]

# Report the shape of the DataFrame after removing those rows
print(df_cleaned.shape)

# Confirm how many missing 'image_url' values remain
print(df_cleaned['image_url'].isnull().sum())

(4037, 30)
0


In [31]:
# Select every row whose 'Track' value appears more than once (all occurrences)
duplicated_tracks = df_cleaned.loc[df_cleaned.duplicated(subset=['Track'], keep=False)]
if duplicated_tracks.empty:
    print("No se encontraron duplicados en los nombres de los tracks.")
else:
    print("Se encontraron duplicados en los nombres de los tracks:")
    # Track and artist are enough to inspect the clashes; add columns if needed
    print(duplicated_tracks[['Track', 'Artist']])

Se encontraron duplicados en los nombres de los tracks:
                          Track         Artist
3                       Flowers    Miley Cyrus
4                       Houdini         Eminem
5                   Lovin On Me    Jack Harlow
7                     Gata Only     FloyyMenor
8          Danza Kuduro - Cover  MUSIC LAB JPN
...                         ...            ...
4508             Happy Birthday        YolyBoy
4522    Smells Like Teen Spirit        Nirvana
4539  Cake By The Ocean - Cover  MUSIC LAB JPN
4570                        You     Willy Paul
4590                   Daylight   Harry Styles

[350 rows x 2 columns]


In [32]:
# Remove repeated entries, keeping the first occurrence. A row counts as a
# duplicate only when both the title and the artist match: different artists can
# release songs with the same title, and keying on 'Track' alone would discard them.
df_cleaned = df_cleaned.drop_duplicates(subset=['Track', 'Artist'], keep='first')

In [33]:
# Save the cleaned, de-duplicated DataFrame to a CSV file
df_cleaned.to_csv('spotify_cleaned.csv', index=False, encoding='utf-8')

In [34]:
# List the column labels of the final DataFrame
print(df_cleaned.columns)

Index(['Track', 'Album Name', 'Artist', 'Release Date', 'ISRC',
       'All Time Rank', 'Track Score', 'Spotify Streams',
       'Spotify Playlist Count', 'Spotify Playlist Reach',
       'Spotify Popularity', 'YouTube Views', 'YouTube Likes', 'TikTok Posts',
       'TikTok Likes', 'TikTok Views', 'YouTube Playlist Reach',
       'Apple Music Playlist Count', 'AirPlay Spins', 'SiriusXM Spins',
       'Deezer Playlist Count', 'Deezer Playlist Reach',
       'Amazon Playlist Count', 'Pandora Streams', 'Pandora Track Stations',
       'Soundcloud Streams', 'Shazam Counts', 'TIDAL Popularity',
       'Explicit Track', 'image_url'],
      dtype='object')


: 