In [48]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import lyricsgenius
import numpy as np

class songScraper:
    """
    A class for scraping songs off spotify API and getting their lyrics from Genius API


    """

    def __init__(self, cid, secret, genius_api_key):
        """
        Initializes the SongScraper class.

        :param cid: Spotify API client ID.
        :param secret: Spotify API client secret.
        :param genius_api_key: Genius API access token.
        """
        self.spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=cid, client_secret=secret))
        self.genius = lyricsgenius.Genius(genius_api_key)

    def get_artists(self, genre="Rock Nacional", artist_n=5, song_n=5, market="AR",artist_popularity=15):
        """
        Retrieves songs for a specific genre.

        :param genre: music genre to search. Default "Rock Nacional".
        :param artist_n: number of artists to return. Default 5.
        :param song_n: number of songs to return for each artist. Default 5.
        :param popularity: lower threshold of popularity for artists. Default 15.
        :param market: market for the artists. Default "AR".

        :return: DataFrame containing column witt audio features.
        """
        offset=1
        size = artist_n
        # Request de search endpoint query=genre, tipo playlist.
        response = self.spotify.search(q=genre, type='playlist', market=market, limit=None, offset=offset)

        #Extraigo el id de las playlists que devuelve response
        playlists = [playlist['id'] for playlist in response['playlists']['items']]

        #Creo una lista vacia que se convertirá en el df final y un set de artistas para comprobar repeticiones
        artists_data = []
        seen_artist_ids = set()

        # While loop para obtener la cantidad de artistas especificada en size

        while size is None or len(artists_data) < size:
            #itero sobre cada playlist ID
            for playlist_id in playlists:
                results = self.spotify.playlist_tracks(playlist_id)
                tracks = results['items']

                # Itero sobre todos los tracks de la playlist
                for track in tracks:
                    track_info = track['track']

                    # Itero sobre los artistas de cada track
                    for artist in track_info['artists']:
                        artist_id = artist['id']

                        # Compruebo repeticiones
                        if artist_id not in seen_artist_ids:
                            seen_artist_ids.add(artist_id)


                            # Intento obtener más información del artista
                            try:
                                artist_data = self.spotify.artist(artist_id)
                            except:
                                artist_data = None

                            # Compruebo si el artista produce rock nacional
                            #if artist_data is not None and ("argentine rock" in artist_data["genres"] or "rock nacional" in artist_data["genres"]):

                            #Filtro los artistas por un umbral de popularidad
                            if artist_data['popularity'] > artist_popularity:

                                # Extraigo el nombre, id, genre y popularidad del artista.
                                artist_info = {
                                    'Artist': artist['name'],
                                    'Artist_ID': artist_id,
                                    "Artist_genres": artist_data['genres'],
                                    "Artist_popularity": artist_data['popularity']
                                }
                                artists_data.append(artist_info)

                        if size is not None and len(artists_data) >= size:
                            break

                    if size is not None and len(artists_data) >= size:
                        break

                if size is not None and len(artists_data) >= size:
                    break

        df = pd.DataFrame(artists_data)
        songs = []
        song_id = []
        songs_release = []
        songs_popularity = []
        for i in df["Artist"]:
            #Obtengo n tracks para artista
            results = self.spotify.search(q=f"artist:{i}",  type='track', offset=0, limit=song_n)

            #Extraigo canción, id de canción, release y popularidad.
            songs.append([result["name"] for result in results["tracks"]["items"]])
            song_id.append([result["id"] for result in results["tracks"]["items"]])
            songs_release.append([result["album"]["release_date"] for result in results["tracks"]["items"]])
            songs_popularity.append([result["popularity"] for result in results["tracks"]["items"]])
        df["Track"] = songs
        df["Track_ID"] = song_id
        df["Track_release_date"] = songs_release
        df["Track_popularity"] = songs_popularity
        df = df.explode(["Track","Track_ID","Track_release_date","Track_popularity"])

        return df

    def get_songs(self, n=50, genre="", market="AR"):
        """
       Fetches song data, including track information, artist details, and audio features.

       :param n: Number of songs to fetch. Limit: 1000. See get_artists method for a bigger number of songs.
       :param genre: Music genre to fetch.
       :param market: Market on which to fetch songs. Defaults to AR.
       :return: DataFrame containing song data.
       """
        offset = 0  # You can adjust the offset as needed
        songs_data = []

        while len(songs_data) < n:
            results = self.spotify.search(q=f'genre:"{genre}"', type='track', limit=50, offset=offset, market=market)
            for track in results['tracks']['items']:
                track_name = track['name']
                track_id = track["id"]
                track_release_date = track['album']['release_date']
                track_popularity = track['popularity']
                artist_data = track['artists'][0]  # Assuming a single artist for simplicity
                artist_id = artist_data['id']
                artist_name = artist_data['name']
                artist_popularity = self.get_artist_popularity(artist_id)
                artist_genres = self.get_artist_genres(artist_id)
                album_name = track['album']['name']
                songs_data.append({'Track': track_name,"Track_ID": track_id ,"Track_release_date": track_release_date,"Track_popularity": track_popularity,'Artist': artist_name, 'Artist_ID': artist_id, 'Artist_Popularity': artist_popularity, 'Artist_Genres': artist_genres, 'Album': album_name})

                if len(songs_data) >= n:
                    break

            offset += 50

        return pd.DataFrame(songs_data)

    def get_artist_popularity(self, artist_id):
        try:
            artist_data = self.spotify.artist(artist_id)
            return artist_data['popularity']
        except:
            return "nan"

    def get_artist_genres(self, artist_id):
        artist_data = self.spotify.artist(artist_id)
        return artist_data['genres']

    def get_audio_features(self, df):
        """
        Retrieves audio features for a song using its Spotify track ID.

        :param df: dataframe with column with "Track_ID".
        :return: DataFrame containing column witt audio features.
        """
    # Extraigo los audio features para cada canción
        features = []
        for i in df["Track_ID"]:
            try:
                feature = self.spotify.audio_features(i)
                if feature:
                    feature = {k: feature[0][k] for k in list(feature[0])[:11]}
                else:
                    feature = "nan"
            except:
                feature = "nan"
            features.append(feature)
        df["features_dict"] = features
        return df

    def get_lyrics(self, df):
        """
        Fetches lyrics for songs in a DataFrame using the Genius API.

        :param df: DataFrame containing song data.
        :return: DataFrame with added "Lyrics" column.
        """
        lyrics = []

        total_rows = len(df)
        for idx, row in df.iterrows():
            progress = f"Fetching Lyrics: {idx+1}/{total_rows}"
            print(progress, end="\r")  # Print with carriage return to overwrite previous line

            try:
                song = self.genius.search_song(row["Track"], row["Artist"])
                if song:
                    lyrics.append(song.lyrics)
                else:
                    lyrics.append("nan")
            except Exception as e:
                print(f"Error occurred: {e}")
                lyrics.append("nan")

        print()  # Print a newline after all iterations are done
        df["Lyrics"] = lyrics
        return df

    def get_id(self,artist_name):
        try:
            # Search for the artist
            results = self.spotify.search(q=artist_name, type='artist', limit=1)

            # Extract the artist ID if found
            if 'artists' in results and 'items' in results['artists'] and len(results['artists']['items']) > 0:
                artist_id = results['artists']['items'][0]['id']
                return artist_id
            else:
                print(f"Artist '{artist_name}' not found.")
                return None

        except spotipy.SpotifyException as e:
            print(f"Spotipy error: {e}")
            return None


In [49]:
music = songScraper(cid="82c6e48354394b06956482e5ca4491b3",secret="71dcf5d5e38d451589cd5b521184928d",genius_api_key="0PiAxG73oHP3bj_nFttt-mdd25RMUO8d2kElPJHRUfSdnIMKKu7Il-lRO7dddwPR")

In [19]:
df = pd.read_csv("https://gefero.github.io/factor_data_track_CSS/M4/practica_integradora_1/data/tango_rock.csv")
df["artista"]

0                 A-Tirador Láser
1                 A-Tirador Láser
2                 A-Tirador Láser
3                 A-Tirador Láser
4                 A-Tirador Láser
                   ...           
21873        jorge padula perkins
21874    francisco garcia jimenez
21875    enrique santos discepolo
21876                 maximo orsi
21877              enrique lorenz
Name: artista, Length: 21878, dtype: object

In [40]:
id = pd.DataFrame(df.groupby("artista")["artista"].first())

id["id"] = id["artista"].apply(music.get_id)
id

Artist 'Cualquier Limón' not found.
Artist 'Dum69' not found.
Artist 'Hallibour Fiberglass Sereneiders' not found.


Unnamed: 0_level_0,artista,id
artista,Unnamed: 1_level_1,Unnamed: 2_level_1
A-Tirador Láser,A-Tirador Láser,7pHZKUfpbFMZoMaBOg2jwH
A.N.I.M.A.L.,A.N.I.M.A.L.,6eoz7BtTcC5Q303xQtSgj9
Abril a Mil,Abril a Mil,0X9HIxFmynjrSOzbBv7uoM
Abterno,Abterno,1iXLcpr2SlUwrU2oCP8nI9
Acorazado Potemkin,Acorazado Potemkin,4QDSqX4vtsftXfEAlKBVXR
...,...,...
yacare,yacare,0G6VQf773ZIIYkKs8QnW1V
yaguaron,yaguaron,1CajkwEgJac9j9alNJJxTQ
yaravi,yaravi,0g3UV4KTsBXySBgxKtNUFG
yelma baldi,yelma baldi,7kwCkEJ384PWm0UQW3hxjS


In [50]:
id["popularidad_spotify"] = id["id"].apply(music.get_artist_popularity)

In [54]:
id.sort_values(by="popularidad_spotify",ascending=False)

TypeError: '<' not supported between instances of 'int' and 'str'

In [59]:
id.to_excel(r"C:\Users\PC\Desktop\Popularidad_canciones.xlsx")

In [58]:
id['popularidad_spotify'] = id['popularidad_spotify'].replace('nan', np.nan)
id.sort_values(by="popularidad_spotify",ascending=False)

Unnamed: 0_level_0,artista,id,popularidad_spotify
artista,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Plum,Plum,12GqGscKJx3aE4t07u7eVZ,93.0
Pez,Pez,12GqGscKJx3aE4t07u7eVZ,93.0
ZAS,ZAS,40ZNYROS4zLfyyBSs2PGe2,89.0
Malón,Malón,246dkjvS1zLTtiykXe5h60,89.0
Africana,Africana,66CXWjxzNUsdJxJ2JdwvnR,87.0
...,...,...,...
juan augusto,juan augusto,4QxUPd2C4apTMIer7cx4t0,0.0
anibal de iturriaga,anibal de iturriaga,1zTMhx7WaaQ3Ox9UZFhJMW,0.0
Cualquier Limón,Cualquier Limón,,
Dum69,Dum69,,


In [63]:
df

Unnamed: 0,link,fecha,artista,titulo,letra,genero
0,https://rock.com.ar/artistas/93/letras/12635,2004.0,A-Tirador Láser,¿Despertarás?,Siento que se aleja / un poco antes de descarg...,rock
1,https://rock.com.ar/artistas/93/letras/4456,1998.0,A-Tirador Láser,Acao,Amor eterno me invernas bajo la espada santo m...,rock
2,https://rock.com.ar/artistas/93/letras/12643,2002.0,A-Tirador Láser,Admiración,"Vida es vida aqui, no hay temor al fin / vamos...",rock
3,https://rock.com.ar/artistas/93/letras/12629,2004.0,A-Tirador Láser,Algo nuevo,"Veo que todos debieron partir, / junto con tod...",rock
4,https://rock.com.ar/artistas/93/letras/4447,1996.0,A-Tirador Láser,Armas del bien,El tiempo se vio entregado / y el cielo mando ...,rock
...,...,...,...,...,...,...
21873,http://www.todotango.com/musica/tema/8916/Y-ve...,,jorge padula perkins,y veo llover,version para cantante masculino | estoy mirand...,tango
21874,http://www.todotango.com/musica/tema/123/Ya-es...,1934.0,francisco garcia jimenez,ya estamos iguales,mi noche es tu noche | mi llanto tu llanto | m...,tango
21875,http://www.todotango.com/musica/tema/167/Yira-...,1930.0,enrique santos discepolo,yira yira,cuando la suerte qu' es grela | fayando y faya...,tango
21876,http://www.todotango.com/musica/tema/2716/Yo-s...,1934.0,maximo orsi,yo soy aquel muchacho,ya paso la primavera con sus flores | golondri...,tango


In [68]:
id.drop(columns="artista").reset_index()

Unnamed: 0,artista,id,popularidad_spotify
0,A-Tirador Láser,7pHZKUfpbFMZoMaBOg2jwH,6.0
1,A.N.I.M.A.L.,6eoz7BtTcC5Q303xQtSgj9,35.0
2,Abril a Mil,0X9HIxFmynjrSOzbBv7uoM,0.0
3,Abterno,1iXLcpr2SlUwrU2oCP8nI9,38.0
4,Acorazado Potemkin,4QDSqX4vtsftXfEAlKBVXR,14.0
...,...,...,...
1791,yacare,0G6VQf773ZIIYkKs8QnW1V,21.0
1792,yaguaron,1CajkwEgJac9j9alNJJxTQ,62.0
1793,yaravi,0g3UV4KTsBXySBgxKtNUFG,45.0
1794,yelma baldi,7kwCkEJ384PWm0UQW3hxjS,50.0


In [71]:
merge.to_excel(r"C:\Users\PC\Desktop\Popularidad_canciones_completo.xlsx")
