In [3]:
import requests
from bs4 import BeautifulSoup

bts_id = 47923


def extract_lyrics_links(artist_url: str) -> list[str]:
    response = requests.get(url=artist_url)
    print(f"Response from artist page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    links_ul = soup.find("ul", class_="listado-letras")
    links = [a["href"] for a in links_ul.find_all("a")]
    return links

def extract_song_lyric(song_url: str) -> str:
    response = requests.get(url=song_url)
    response.encoding = "utf-8"
    print(f"Response from lyrics page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    song_title = soup.find("h1").get_text()
    print(f"Title: {song_title}")
    header = soup.find("h2", string="LETRA")
    if header is None:
        header = soup.find("h2", string="LETRA EN ESPAÑOL")
        
    if header is None:
        return ""

    paragraphs = []

    for p in header.find_all_next("p"):
        if p.find_parent("div") is None:
            continue
        if p.find_parent("div").get("id") == "letra":
            paragraphs.append(p.get_text(separator="\n"))

    lyrics = "\n".join(p for p in paragraphs)
    return f"Título: {song_title} \n\n {lyrics}"

def extract_artist_lyrics(artist: str, artist_id: int):
    lyrics_url = f"https://www.musica.com/letras.asp?letras={artist_id}&orden=alf"
    links = extract_lyrics_links(lyrics_url)
    print(f"found {len(links)} songs")
    artist_header = f"{artist}\n===\n"
    lyrics = []
    for link in links:
        print(f"extracting song from: {link}")
        lyrics.append(extract_song_lyric(link))
    lyrics_str = "\n\n===\n\n".join(lyrics)
    artist_str = artist_header + lyrics_str
    with open(f"{artist_id}_{artist}.txt", "w", encoding="utf-8") as file:
        file.write(artist_str)
    return artist_str

In [4]:
# extract_artist_lyrics("Kjarkas", kjarkas_id)
extract_artist_lyrics("BTS", bts_id)

Response from artist page: 200
found 314 songs
extracting song from: https://www.musica.com/letras.asp?letra=2376498
Response from lyrics page: 200
Title: 離れないで (歌詞)
extracting song from: https://www.musica.com/letras.asp?letra=2291365
Response from lyrics page: 200
Title: 방탄소년단 Save Me
extracting song from: https://www.musica.com/letras.asp?letra=2354039
Response from lyrics page: 200
Title: 방탄소년단 Save Me en español
extracting song from: https://www.musica.com/letras.asp?letra=2286247
Response from lyrics page: 200
Title: 봄날 (Spring Day) Korean
extracting song from: https://www.musica.com/letras.asp?letra=2506678
Response from lyrics page: 200
Title: 00:00 (Zero O’Clock)
extracting song from: https://www.musica.com/letras.asp?letra=2506694
Response from lyrics page: 200
Title: 00:00 (Zero O’Clock) en español
extracting song from: https://www.musica.com/letras.asp?letra=2383772
Response from lyrics page: 200
Title: 134340 en español
extracting song from: https://www.musica.com/letras.a

'BTS\n===\nTítulo: 離れないで (歌詞) \n\n 離れないで (歌詞)\n君の言葉にはいくつもの意味が\nあるように間こえるから   What’s what?\n謎の足跡見つけたら Follow\n飲み込まれる My Shadow 光の裏側に潜めた\n会えない君だけが知る My Answer\n途切れる前に I wanna know everything\n時が流れるほど深まる君の\n過去と未来の間にいる僕は今\nI Don’t Leave Me, now Believe 走り出す\nNo Ending 君は僕の鼓動\n例えどんな雨に打たれても 例えどんな闇に消されても\n救い出すよ必ず 君は一人じゃない\n動くはずなし時計の金ナが今 一歩一歩踏み出すように moving on\nCalling me passing me すれ違い all of it, destiny\nそれでも女台まる one way race 離れ初めても that’s okay\n「何も心配しないでよ いや、何も」\n\n===\n\nTítulo: 방탄소년단 Save Me \n\n 방탄소년단 Save Me\n[Jimin] Nan sumswigo sipeo I bami sireo\nIjen kkaego sipeo kkumsogi sireo\nNae ane gathyeoseo nan jugeoisseo\nDon’t wanna be lonely\nJust wanna be yours\n[JungKook] Wae iri kkamkkamhan geonji\nNiga eoptneun I goseun\nWiheomhajanha manggajin nae moseup\nGuhaejwo nal nado nal jabeul su eopseo (su eopseo)\n[V] Nae simjangsoril deureobwa\nJemeotdaero neol bureujanha\n[Jin] I kkaman eodum sogeseo\nNeoneun ireoke bitnanikka\n[V] Geu soneul naemireojwo Save me Save me\nI need your love be

In [5]:
from dotenv import load_dotenv
import openai
import os
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    StorageContext,
    load_index_from_storage,
    Document,
    PromptTemplate
)
load_dotenv()

True

In [6]:
def get_artist_documents(filename: str) -> list[Document]:
    with open(filename) as file:
        data = file.read()
    songs = data.split("===")
    artist = songs.pop(0).strip()
    
    documents = [
        Document(
            text=song,
            metadata={
                "category":"music",
                "artist": artist,
            }
        )
        for song in songs
    ]    
    return documents
    

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
Settings.embed_model = embed_model

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [8]:
PERSIST_DIR = "lyrics_store2"

if not os.path.exists(PERSIST_DIR):
    documents = get_artist_documents("47923_BTS.txt")
    index = VectorStoreIndex.from_documents(documents, show_progress=True)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 
    index = load_index_from_storage(storage_context)

Parsing nodes:   0%|          | 0/314 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/327 [00:00<?, ?it/s]

In [9]:
query_engine = index.as_query_engine(verbose=True)

In [10]:
qa_template_str = """
    You are an expert in K-Pop music, your task is to guide and teach the user 
    about your field. Answer the user queries only with supported data in your context.
    Your context may contain complete lyrics or parts of them in different languages, but
    your answer will always be in Spanish. 

    Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, 
    answer the query with detailed source information, include direct quotes and use bullet lists in your 
    answers, in one of the bullets detail the tone/sentiment of the song.
    Query: {query_str}
    Answer: 
"""
qa_template = PromptTemplate(qa_template_str)

In [11]:
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_template}
)

In [12]:
response = query_engine.query("cuales canciones de BTS hablan de amor?")

In [13]:
print(response)

- La canción "Fake Love" de BTS habla sobre el amor, específicamente sobre un amor falso y las emociones contradictorias que puede generar. Algunas partes de la letra que reflejan esto son:
  - "Estoy tan harto de este falso amor, falso amor, falso amor"
  - "Intento moldear una mentira bonita para ti"
  - "El amor es una locura, el amor es una locura"
  - Tonos/sentimientos de la canción: melancólico, decepcionado, confundido.

- Otra canción de BTS que habla sobre el amor es "Waste It On Me". Aunque la canción aborda más la idea de si el amor es una pérdida de tiempo, también refleja la temática del amor. Algunas partes de la letra que lo muestran son:
  - "So if love is nothing more, then just a waste of your time?"
  - "Waste it on me, waste it on me"
  - Tonos/sentimientos de la canción: reflexivo, cuestionador, esperanzador.


In [14]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='845bf103-f759-4b71-900d-ce53120ef34d', embedding=None, metadata={'category': 'music', 'artist': 'BTS'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e46d213a-9ef6-463a-9fd1-5b3dd0b4183b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'category': 'music', 'artist': 'BTS'}, hash='0577ae1459c28fbdbb436c27328128a77245662562c141c9fb938fdf55c3f293')}, text="Título: Fake Love en español \n\n Fake Love\n[Intro: V, Jungkook]\nPor ti, podía fingir que estaba feliz cuando estaba triste\nPor ti, podía fingir que era fuerte cuando estaba herido\nDesearía que el amor fuera perfecto como el amor mismo\nDesearía que todas mis debilidades pudieran ser ocultadas\nCuidé de una flor que no puede florecer en un sueño que no puede volverse realidad\n[Hook: Jimin]\nEstoy tan harto de este falso amor, falso amor, falso amor\nLo siento mucho pero es falso amor, falso amor, falso amor\n

In [15]:
chat_engine = index.as_chat_engine(verbose=True)

In [16]:
response = chat_engine.chat("que canciones de BTS hablan de amor?")

Added user message to memory: que canciones de BTS hablan de amor?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Canciones de BTS que hablan de amor"}
Got output: Fake Love, Waste It On Me



In [17]:
print(response)

Algunas canciones de BTS que hablan de amor son "Fake Love" y "Waste It On Me".


In [18]:
print(chat_engine.chat("y cuales hablan de amor propio?"))

Added user message to memory: y cuales hablan de amor propio?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Canciones de BTS que hablan de amor propio"}
Got output: Singularity

Una canción de BTS que habla de amor propio es "Singularity".
