In [None]:
import os
import glob
import pandas as pd
import csv
import requests
import time
import random
from bs4 import BeautifulSoup

In [None]:
# Output: CSV file that will hold the songs enriched with lyrics.
top_songs_filename = os.path.join('data', 'raw', 'top_songs_with_lyrics.csv')
# Input: folder that contains the seed CSV files.
seed_path = os.path.join('data', 'seed')

# Read data

In [None]:
# Collect the seed CSVs in sorted order: glob returns paths in arbitrary,
# filesystem-dependent order, and the de-duplication done further down keeps
# the first occurrence, so an unsorted list makes the result vary between runs.
all_files = sorted(glob.glob(os.path.join(seed_path, "*.csv")))
if not all_files:
    # pd.concat only reports "No objects to concatenate" on empty input;
    # fail with a message that names the folder instead.
    raise FileNotFoundError(f"No seed CSV files found in {seed_path!r}")

# ignore_index gives every song a unique row label instead of repeating
# each file's own 0..n labels in the combined frame.
song_data = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
song_data.head()

In [None]:
# Upper-case artist and title so differently-cased copies of the same song
# compare equal, then keep only the first row of each (title, artists) pair.
song_data = song_data.assign(
    artists=song_data['artists'].str.upper(),
    title=song_data['title'].str.upper(),
)
song_data = song_data.drop_duplicates(subset=['title', 'artists'], keep='first')

song_data.head()

# Get lyrics from the Musixmatch API

In [None]:
# TODO: skip the songs that have no lyrics
# Reference on how to use the Musixmatch API: https://youtu.be/WFRzKmpepj4
base_url= 'https://api.musixmatch.com/ws/1.1/'

# Credentials should not live in the notebook: the MUSIXMATCH_API_KEY
# environment variable takes precedence. The literal is kept only as a
# fallback so existing runs keep working.
# NOTE(review): this key is committed in plain text; it should be rotated and
# the fallback removed.
api_key = os.environ.get('MUSIXMATCH_API_KEY') or "3950aca40989ee9f7d05723278e3fdfd"

In [None]:
def get_lyric(row):
    """Fetch the lyrics of one song from Musixmatch's ``matcher.lyrics.get`` endpoint.

    Args:
        row: Mapping-like record (a DataFrame row when used through
            ``DataFrame.apply(..., axis=1)``) providing ``'title'`` and
            ``'artists'``.

    Returns:
        The same ``row``. When the API reports status 200 it gains the keys
        ``'lyric'``, ``'explicit'`` and ``'lyric_provider'``; otherwise it is
        returned untouched.

    Raises:
        requests.RequestException: on network failure or when the request
            exceeds the timeout.
    """
    # rstrip avoids the "//" produced by joining a base URL that ends in "/".
    url = f"{base_url.rstrip('/')}/matcher.lyrics.get"
    # Let requests build the query string so titles/artists containing
    # "&", "#", "+", ... are percent-encoded instead of corrupting the query.
    params = {
        'format': 'json',
        'callback': 'callback',
        'q_track': row['title'],
        'q_artist': row['artists'],
        'apikey': api_key,
    }

    # A timeout keeps one stalled connection from hanging the whole apply().
    response = requests.get(url, params=params, timeout=30)
    data = response.json()  # plain JSON; a JSONP reply would need extra processing
    if data['message']['header']['status_code'] == 200:
        lyrics = data['message']['body']['lyrics']
        # Strip the licensing disclaimer text from the lyrics body...
        lyric = lyrics['lyrics_body'].replace('******* This Lyrics is NOT for Commercial use *******', '')

        # ...and the numeric tag that accompanies it (presumably a tracking id
        # that may differ between responses -- TODO confirm).
        row['lyric'] = lyric.replace('(1409620732275)',' ')
        row['explicit'] = lyrics['explicit']
        row['lyric_provider'] = 'MusicMatch'
    return row

def get_music_genre(row):
    """Fetch rating and primary genre of one song from Musixmatch's ``matcher.track.get``.

    Args:
        row: Mapping-like record (a DataFrame row when used through
            ``DataFrame.apply(..., axis=1)``) providing ``'title'`` and
            ``'artists'``.

    Returns:
        The same ``row``. When the API reports status 200 it gains
        ``'musixmatch_rating'`` and, if the track lists at least one primary
        genre, ``'genre'`` (extended name of the first genre); otherwise it is
        returned untouched.

    Raises:
        requests.RequestException: on network failure or when the request
            exceeds the timeout.
    """
    # rstrip avoids the "//" produced by joining a base URL that ends in "/".
    url = f"{base_url.rstrip('/')}/matcher.track.get"
    # Let requests build the query string so titles/artists containing
    # "&", "#", "+", ... are percent-encoded instead of corrupting the query.
    params = {
        'format': 'json',
        'callback': 'callback',
        'q_track': row['title'],
        'q_artist': row['artists'],
        'apikey': api_key,
    }

    # A timeout keeps one stalled connection from hanging the whole apply().
    response = requests.get(url, params=params, timeout=30)
    data = response.json()
    if data['message']['header']['status_code'] == 200:
        track = data['message']['body']['track']
        row['musixmatch_rating'] = track['track_rating']

        genres = track['primary_genres']['music_genre_list']
        if genres:  # the genre list can be empty; leave 'genre' unset then
            row['genre'] = genres[0]['music_genre']['music_genre_name_extended']
    return row

In [None]:
# Row-wise enrichment with Musixmatch data (one HTTP request per song and
# per step): lyrics first, then rating and genre.
song_data = song_data.apply(get_lyric, axis='columns')
song_data = song_data.apply(get_music_genre, axis='columns')

# Get lyrics from Google search results

In [None]:
def get_google_lyrics(row):
    """Scrape the lyrics box of a Google search result page for one song.

    Args:
        row: Mapping-like record (a DataFrame row when used through
            ``DataFrame.apply(..., axis=1)``) providing ``'title'`` and
            ``'artists'``.

    Returns:
        The same ``row``. When the result page contains a lyrics box,
        ``'lyric'`` is set to the scraped text (one verse per line, stanzas
        separated by a blank line) and ``'lyric_provider'`` to ``'Google'``.
        The row is returned untouched when Google answers with a non-200
        status or the page has no lyrics box.

    Raises:
        requests.RequestException: on network failure or when the request
            exceeds the timeout.
    """
    # Pretend to be desktop Chrome; otherwise Google does not send the
    # g-expandable-content elements that hold the lyrics.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    # Pass the search text through ``params`` so requests percent-encodes it;
    # an artist or title containing "&" or "#" would otherwise cut the query short.
    query = f"{row['artists']}-{row['title']}"
    page = requests.get(
        "https://www.google.com/search",
        params={'q': query},
        headers=headers,
        timeout=30,  # do not let one stalled connection hang the whole apply()
    )

    # Anything other than 200 is treated as Google refusing the request.
    if page.status_code != 200:
        print('Blocked: ', page.status_code)
        return row

    html = BeautifulSoup(page.text, 'html.parser')

    # Lyrics box as rendered by Google (the original comment attributes it to LyricFind).
    lyrics_select = html.select("div[data-lyricid] > g-expandable-content:nth-child(2)")
    if lyrics_select:
        lyrics_container = lyrics_select[0]

        stanzas = []
        for prose in lyrics_container.select("div"):
            # One line per verse, plus an empty line closing the stanza.
            verses = ''.join(verse.text + '\n' for verse in prose.select("span"))
            stanzas.append(verses + '\n')

        row['lyric'] = ''.join(stanzas)
        row['lyric_provider'] = 'Google'
    # TODO: handle the other lyrics providers Google may embed

    # Random delay added so the IP does not get blocked; it still gets
    # blocked, however.
    time.sleep(random.uniform(1, 3))
    return row

In [None]:
# Row-wise pass over every song: scrape Google's lyrics box (one request per row).
song_data = song_data.apply(get_google_lyrics, axis='columns')

In [None]:
# Preview the first 50 enriched rows to check the scraped columns by eye.
song_data.head(n=50)

# Export lyrics data

In [None]:
# Make sure the destination folder exists: to_csv does not create missing
# directories and would fail on a fresh checkout without data/raw.
output_dir = os.path.dirname(top_songs_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

song_data.to_csv(
    top_songs_filename,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC  # Quote every non-numeric field so commas and newlines inside lyrics stay in one cell
)