# YouTube audio fetcher

In [55]:
from pytubefix import YouTube, Search

from pathlib import Path

data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

## Fetch url from title-artist

In [56]:
title = "Як ніколи"
artist = "Океан Ельзи"

results = Search(f"{title} {artist}")

for video in results.videos[:5]:
    print(f'Title: {video.title}')
    print(f'URL: {video.watch_url}')
    print(f'Duration: {video.length} sec')
    print(f'Views: {video.views}')
    print('---')

Title: Океан Ельзи - Як ніколи | Yak Nikoly (official video)
URL: https://youtube.com/watch?v=-lJKm0mL4B0
Duration: 285 sec
Views: 2903815
---
Title: Океан Ельзи—Як ніколи I і як ніколи у небі зима лютує, то болю тебе бракує
URL: https://youtube.com/watch?v=n6UAcosOKgo
Duration: 264 sec
Views: 94039
---
Title: Океан Ельзи - Як ніколи (Nick de Grand Edit)
URL: https://youtube.com/watch?v=tHyP444Aksw
Duration: 176 sec
Views: 16865
---
Title: Океан Ельзи - Як ніколи (Прем'єра 2024) | У небі зима лютує, до болю тебе бракує, сумую без тебе я
URL: https://youtube.com/watch?v=wBzba-5kTl8
Duration: 309 sec
Views: 223444
---
Title: Ніколи
URL: https://youtube.com/watch?v=0A0Sdw2MM3g
Duration: 200 sec
Views: 133558
---


## Download audio from url

In [15]:
url = "https://www.youtube.com/watch?v=-lJKm0mL4B0"
yt = YouTube(url)
print(yt.title)

Океан Ельзи - Як ніколи | Yak Nikoly (official video)


In [25]:
save_path = f"{yt.title}.mp3"
yt.streams.filter(only_audio=True).first().download(output_path=str(data_dir), filename=save_path)  

'c:\\DATA\\UCU\\thesis\\data\\Океан Ельзи - Як ніколи  Yak Nikoly (official video).mp3'

# Last.Fm artist-title list parser

In [4]:
import requests
from bs4 import BeautifulSoup

In [5]:
def get_soup(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.text, "html.parser")

## Ukrainian artists page (artists list)

In [8]:
page = 1
url = f"https://www.last.fm/tag/ukrainian/artists?page={page}"
headers = {"User-Agent": "Mozilla/5.0"}  # Last.fm may block bots without a user agent
response = requests.get(url, headers=headers)
response.text



In [3]:
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
artist_names = [a.text for a in soup.select(".big-artist-list-title a")]
artist_names

['Океан Ельзи',
 'Go_A',
 'Бумбокс',
 'Скрябін',
 'KALUSH',
 'The Hardkiss',
 'glichery',
 'alyona alyona',
 'Poshlaya Molly',
 'Valentin Strykalo',
 'AShamaluevMusic',
 'Mykola Dmytrovych Leontovych',
 'daKooka',
 'Океан Ельзи',
 'Kazka',
 'Lama',
 'Pencil Legs',
 'Один В Каное',
 'Сметана band',
 'O.Torvald',
 '4Wheel']

## Artist main page (popularity, genres)

In [10]:
artist = "Океан Ельзи"
url = f"https://www.last.fm/music/{artist}"
soup = get_soup(url)

In [16]:
listener_count = soup.select_one(".header-metadata-tnew-display abbr").get("title")
scrobble_count = soup.select(".header-metadata-tnew-display abbr")[1].get("title")

print(f"Listeners: {listener_count}")
print(f"Scrobbles: {scrobble_count}")

Listeners: 203,045
Scrobbles: 13,552,013


In [15]:
genre_tags = [a.text for a in soup.select(".tags-list .tag a")]
genre_tags

['rock', 'ukrainian', 'ukrainian rock', 'indie', 'pop-rock']

## Artist tags page (tags)

In [17]:
artist = "Океан Ельзи"
url = f"https://www.last.fm/music/{artist}/+tags"
soup = get_soup(url)

In [22]:
tags = [a.text for a in soup.select(".big-tags-item-name a")]
tags

['rock',
 'ukrainian',
 'ukrainian rock',
 'indie',
 'pop-rock',
 'pop rock',
 'alternative rock']

## Artist all tracks page (track list)

In [12]:
artist = "Океан Ельзи"
page = 90
url = f"https://www.last.fm/music/{artist}/+tracks?page={page}"
soup = get_soup(url)

In [13]:
track_names = [a.text for a in soup.select(".chartlist-name a")]
track_names

['07_Поiзд "чужа Любов"',
 'Місто весни (feat. Один в каное)',
 'Vidpusti',
 '11_Невидима сiмя',
 'Ой, чий то кiнь стоїть',
 'Nikoly',
 'Fialky',
 'Відповідь',
 'Ночі i дні',
 'Бодегіта (Земля 2013)',
 '09_Невидимая сiмя',
 'Сюзи',
 'Невидима сімья',
 '10_Ото була весна',
 'Веселi часи',
 'Oto Bula Vesna',
 "Кам'яний ліс (Live)",
 'Kavachai',
 "Khochu napytys' toboyu",
 'Я не сдамся без боя',
 'І я на небi',
 'Сьюзi',
 '06_Сумна мелодия',
 'Тильки там де нас нема',
 '13_Колиска вiтру',
 'Твої зеленi очi',
 "Jak Ostannij Den'",
 '11_Вiддам',
 'Дзвони',
 'Silent night',
 'Там, де нас нема (Danilkin Де ви є remix)',
 'Ой, чий то кінь стоїть (narodna pisnya)',
 '02_Susy',
 'Etud',
 '10_Ластiвка з мого мiста',
 '09_Вiсiм',
 'Кошка',
 'Коли тебе нема (OST Брат 2)',
 'Просто менi',
 'Ty i ja',
 'Suzy',
 'Come To Me Baby',
 '06 Квитка',
 'Ya Do Tebe',
 'Лелеки // Міра',
 'Vyshche neba',
 'Bez Bou',
 '911 (тихий океан version)',
 'Мовчати',
 'В годi вже']

## Track page (tags, popularity, duration)

In [6]:
artist = "Океан Ельзи"
title = "Не питай"
url = f"https://www.last.fm/music/{artist}/_/{title}"
soup = get_soup(url)

In [7]:
track_tags = [a.text for a in soup.select(".tags-list .tag a")]
track_tags

['ukrainian', 'rock', 'love', 'sad', 'rock', 'ukrainian', 'ukrainian rock']

In [9]:
song_listener_count = soup.select(".header-metadata-tnew-display abbr")[0].get("title")
song_scrobble_count = soup.select(".header-metadata-tnew-display abbr")[1].get("title")

print(f"Listeners: {song_listener_count}")
print(f"Scrobbles: {song_scrobble_count}")

Listeners: 39,876
Scrobbles: 303,396


In [51]:
duration = soup.select_one(".catalogue-metadata-description").text.strip()
duration

'2:39'

# Fetching missing data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import random
from loguru import logger
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Initialize Selenium WebDriver once
options = Options()
options.add_argument("--headless")  # Run in headless mode for speed
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=options)

def get_soup(url):
    sleep(random.uniform(0.2, 0.6))  # Small random delay to avoid rate limiting

    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200 or response.text.strip() == "":
        logger.warning(f"Fallback to Selenium for {url}")
        driver.get(url)
        html = driver.page_source
    else:
        html = response.text

    return BeautifulSoup(html, "html.parser")

def soup_select_wrapper(soup, selector):
    try:
        return soup.select(selector)
    except Exception as e:
        logger.error(f"Error selecting {selector}: {e}")
        return None
    
# Remember to close the WebDriver at the end
import atexit
atexit.register(driver.quit)

<bound method ChromiumDriver.quit of <selenium.webdriver.chrome.webdriver.WebDriver (session="521444f61a51512b74ce37a1a1960645")>>

## Artists info

Exploring the artists whose data wasn't fetched with the current version of the script (possibly contain a different html structure)

In [4]:
import pandas as pd

artists_df = pd.read_csv("metadata/artists_10_pages.csv")
artists_df.head()

Unnamed: 0,artist,artist_listeners,artist_scrobbles,artist_tags
0,Океан Ельзи,203060,13552898,"['rock', 'ukrainian', 'ukrainian rock', 'indie..."
1,Go_A,189946,4088960,"['ukrainian', 'folktronica', 'folk', 'electron..."
2,Бумбокс,174374,6424069,"['ukrainian', 'funk', 'reggae', 'hip-hop', 'fu..."
3,Скрябін,58581,2793323,[]
4,KALUSH,106178,1850491,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'eu..."


In [5]:
artists_df[artists_df["artist_listeners"].isnull()]

Unnamed: 0,artist,artist_listeners,artist_scrobbles,artist_tags
64,Laud,,,[]
65,XARAKTER,,,[]
67,Lюк,,,[]
68,Анна,,,[]
69,Mistmorn,,,[]
...,...,...,...,...
193,Bad,,,[]
196,Лея,,,[]
199,Веремій,,,[]
201,Dens,,,[]


A modified artist info fetching func + added Selenium in ```get_soup(url)```:

In [6]:
def get_artist_info(artist):
    # Artist popularity
    url = f"https://www.last.fm/music/{artist}"
    soup = get_soup(url)
    select_res = soup_select_wrapper(soup, ".header-metadata-tnew-display abbr")

    listener_count = select_res[0].get("title") if select_res else None
    scrobble_count = select_res[1].get("title") if select_res else None

    # Artist tags
    url = f"https://www.last.fm/music/{artist}/+tags"
    soup = get_soup(url)
    tags = [a.text for a in soup_select_wrapper(soup, ".big-tags-item-name a") or []]

    return {
        "listeners": listener_count, 
        "scrobbles": scrobble_count,
        "tags": tags
        }

In [7]:
empty_artists = artists_df[artists_df["artist_listeners"].isnull()]["artist"].tolist()
empty_artists[:5]

['Laud', 'XARAKTER', 'Lюк', 'Анна', 'Mistmorn']

In [27]:
empty_artists = artists_df[artists_df["artist_tags"] == "[]"]["artist"].tolist()
empty_artists[:5]

['D4C', 'BohoMan', 'GXNRC']

In [28]:
still_empty_artists = []

for i, artist in enumerate(empty_artists):
    if (artists_df.loc[artists_df["artist"] == artist, "artist_tags"].values[0]) != "[]":
        logger.info(f"Already have info for {artist}")
        continue
    sleep(random.uniform(0, 1))
    artist_info = get_artist_info(artist)

    if artist_info["listeners"] is None and artist_info["scrobbles"] is None and not artist_info["tags"]:
        logger.error(f"Failed to get info for {artist}")
        still_empty_artists.append(artist)
        continue
    logger.info(f"Got info for {artist}")
    artists_df.loc[artists_df["artist"] == artist, "artist_listeners"] = artist_info["listeners"]
    artists_df.loc[artists_df["artist"] == artist, "artist_scrobbles"] = artist_info["scrobbles"]
    artists_df.loc[artists_df["artist"] == artist, "artist_tags"] = str(artist_info["tags"])

    if i % 10 == 0:
        artists_df.to_csv("metadata/artists_10_pages_v2.csv", index=False)
        logger.info(f"Saved progress at {i}")

artists_df.to_csv("metadata/artists_10_pages_v2.csv", index=False)

[32m2025-03-23 13:38:08.853[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for D4C[0m
[32m2025-03-23 13:38:08.862[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mSaved progress at 0[0m
[32m2025-03-23 13:38:10.966[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for BohoMan[0m
[32m2025-03-23 13:38:12.816[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for GXNRC[0m


A few are still with empty tag list, but this is due to the their absence in the original Last.FM page:

In [25]:
artist = 'BohoMan'

get_artist_info(artist)

{'listeners': '20,024', 'scrobbles': '77,073', 'tags': []}

## Songs info

In [41]:
import urllib.parse

def construct_lastfm_url(artist, title):
    base_url = "https://www.last.fm/music"

    # Replace spaces with + for both artist and title
    artist_formatted = urllib.parse.quote(artist.replace(" ", "+"), safe="+")
    title_formatted = urllib.parse.quote(title.replace(" ", "+"), safe="+")  # Keep `+`, encode `/`

    return f"{base_url}/{artist_formatted}/_/{title_formatted}"


def get_artist_tracks(artist, pages=1):
    # TODO: handle accessing unavailable pages
    tracks = []
    for page in range(1, pages + 1):
        url = f"https://www.last.fm/music/{artist}/+tracks?page={page}"
        soup = get_soup(url)
        track_names = [a.text for a in soup_select_wrapper(soup, ".chartlist-name a") or []]
        tracks.extend(track_names)
    return tracks


def get_track_info(track, artist):
    # Track popularity
    # url = f"https://www.last.fm/music/{artist}/_/{track}"
    url = construct_lastfm_url(artist, track)
    soup = get_soup(url)
    select_res = soup_select_wrapper(soup, ".header-metadata-tnew-display abbr")

    listener_count = select_res[0].get("title") if select_res else None
    scrobble_count = select_res[1].get("title") if select_res else None

    # Track tags
    tags = [a.text for a in soup_select_wrapper(soup, ".tags-list .tag a") or []]

    # Track duration
    duration = soup_select_wrapper(soup, ".catalogue-metadata-description")
    duration = duration[0].text.strip() if duration else None  # TODO: parse string to seconds

    return {
        "listeners": listener_count, 
        "scrobbles": scrobble_count,
        "tags": tags,
        "duration": duration
        }


In [None]:
songs_df = pd.read_csv("metadata/songs_10_2_pages.csv")

In [34]:
missing_artists_songs = artists_df[~artists_df["artist"].isin(songs_df["artist"])]["artist"].tolist()
missing_artists_songs[:5]

['KALUSH', 'glichery', 'AShamaluevMusic', 'daKooka', 'Kazka']

In [39]:
for artist in missing_artists_songs:
    sleep(random.uniform(1, 5))
    tracks = get_artist_tracks(artist, pages=2)
    if not tracks:
        logger.error(f"No tracks found for {artist}")
        continue
    logger.info(f"Got {len(tracks)} tracks for {artist}")
    for track in tracks:
        sleep(random.uniform(1, 5))
        track_info = get_track_info(track, artist)
        if track_info["listeners"] is None:
            logger.error(f"Failed to get info for {track} by {artist}")
            continue
        logger.info(f"Got info for {track} by {artist}")
        new_row = pd.DataFrame([{
            "title": track,
            "artist": artist,
            "title_listeners": track_info["listeners"],
            "title_scrobbles": track_info["scrobbles"],
            "title_tags": str(track_info["tags"]),
            "title_duration": track_info["duration"]
        }])
        songs_df = pd.concat([songs_df, new_row], ignore_index=True)

[32m2025-02-21 21:48:58.432[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mGot 100 tracks for KALUSH[0m
[32m2025-02-21 21:49:02.380[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for Stefania (Kalush Orchestra) by KALUSH[0m
[32m2025-02-21 21:49:06.959[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for Додому (feat. Skofka) by KALUSH[0m
[32m2025-02-21 21:49:12.481[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for Калуські вечорниці (feat. Tember Blanche) by KALUSH[0m
[32m2025-02-21 21:49:15.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for Гори (Gory) by KALUSH[0m
[32m2025-02-21 21:49:21.058[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mGot info for Зорi by KALUSH[0m
[32m2025-02-21 21:49:25.551[0m | [1mINFO    [0m | [36m__main__[0m:[36m<m

ConnectTimeout: HTTPSConnectionPool(host='www.last.fm', port=443): Max retries exceeded with url: /music/bris/_/License%20Plate (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000027088A38A30>, 'Connection to www.last.fm timed out. (connect timeout=None)'))

In [40]:
songs_df.to_csv("metadata/more_sooongs.csv", index=False)

Now let's fill the missing info about the tracks in the previously fetch dataframe:

In [30]:
songs_df = pd.read_csv("metadata/songs_10_2_pages.csv")
songs_df.head()

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
0,Без бою,Океан Ельзи,67241,500416,"['ukrainian', 'rock', 'pop rock', 'alternative...",4:21
1,Обійми,Океан Ельзи,40990,282632,"['ukrainian', 'blues', 'rock', 'alternative ro...",3:44
2,Не питай,Океан Ельзи,39876,303396,"['ukrainian', 'rock', 'love', 'sad', 'rock', '...",2:39
3,Коли тебе нема,Океан Ельзи,37672,239250,"['rock', 'ukrainian', 'rock', 'ukrainian', 'uk...",3:18
4,911,Океан Ельзи,35998,235613,"['rock', 'ukrainian', 'indie rock', 'rock', 'u...",3:29


In [33]:
missing_info_songs = songs_df[songs_df["title_listeners"].isnull() | songs_df["title_tags"].isnull()]
missing_info_songs

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
86,SHUM - Eurovision 2021 - Ukraine / Karaoke Ver...,Go_A,,,[],
116,Діагноз,Бумбокс,,,[],
117,Hip-Hop,Бумбокс,,,[],
119,Почути,Бумбокс,,,[],
120,Супер-пупер,Бумбокс,,,[],
...,...,...,...,...,...,...
9561,Mudaki,Сергій Жадан Та Собаки В Космосі,,,[],
9563,Хвилинку,Сергій Жадан Та Собаки В Космосі,,,[],
9565,Званімір Бобан,Сергій Жадан Та Собаки В Космосі,,,[],
9568,Інстаграм,Сергій Жадан Та Собаки В Космосі,,,[],


In [35]:
for index, row in missing_info_songs.iterrows():
    artist = row["artist"]
    title = row["title"]

    logger.info(f"Fetching info for track: {title} by {artist}")
    sleep(random.uniform(0, 1))  # Add a delay to avoid rate-limiting
    track_info = get_track_info(title, artist)

    if track_info["listeners"] is None or track_info["scrobbles"] is None or not track_info["tags"]:
        logger.error(f"Failed to get info for {title} by {artist}")
        continue

    logger.info(f"Got info for {title} by {artist}")
    songs_df.loc[index, "title_listeners"] = track_info["listeners"]
    songs_df.loc[index, "title_scrobbles"] = track_info["scrobbles"]
    songs_df.loc[index, "title_tags"] = str(track_info["tags"])
    songs_df.loc[index, "title_duration"] = track_info["duration"]

    if index % 50 == 0:
        songs_df.to_csv("metadata/songs_10_2_pages_v2.csv", index=False)
        logger.info(f"Saved progress at index {index}")

songs_df.to_csv("metadata/songs_10_2_pages_v2.csv", index=False)

[32m2025-03-23 13:59:49.293[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: SHUM - Eurovision 2021 - Ukraine / Karaoke Version by Go_A[0m
[32m2025-03-23 13:59:53.787[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [31m[1mFailed to get info for SHUM - Eurovision 2021 - Ukraine / Karaoke Version by Go_A[0m
[32m2025-03-23 13:59:53.792[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: Діагноз by Бумбокс[0m
[32m2025-03-23 13:59:56.407[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mGot info for Діагноз by Бумбокс[0m
[32m2025-03-23 13:59:56.410[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: Hip-Hop by Бумбокс[0m
[32m2025-03-23 13:59:58.853[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mGot info for Hip-Hop by Бумбокс[0m
[3

Some songs are still missing the info due to bad url construction, thus we modified it and here are the results:

In [36]:
still_missing_info_songs = songs_df[songs_df["title_listeners"].isnull() | songs_df["title_tags"].isnull()]
still_missing_info_songs

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
86,SHUM - Eurovision 2021 - Ukraine / Karaoke Ver...,Go_A,,,[],
242,Carol of the Bells / God Rest Ye Merry Gentlemen,Mykola Dmytrovych Leontovych,,,[],
249,Leontovich / Arr Prizeman: Carol of the Bells,Mykola Dmytrovych Leontovych,,,[],
256,Leontovych / Arr. Prizeman: Carol of the Bells,Mykola Dmytrovych Leontovych,,,[],
262,Carol of the Bells/God Rest Ye Merry Gentelmen,Mykola Dmytrovych Leontovych,,,[],
...,...,...,...,...,...,...
9406,Я все сказав+,Dazzle Dreams,,,[],
9431,"Шейк май бейбі Де Ти, моє Сонце?",Dazzle Dreams,,,[],
9451,Ya vse skazav+,Dazzle Dreams,,,[],
9533,За#бали,Сергій Жадан Та Собаки В Космосі,,,[],


In [43]:
for index, row in still_missing_info_songs.iterrows():
    artist = row["artist"]
    title = row["title"]

    logger.info(f"Fetching info for track: {title} by {artist}")
    sleep(random.uniform(0, 1))  # Add a delay to avoid rate-limiting
    track_info = get_track_info(title, artist)

    if track_info["listeners"] is None or track_info["scrobbles"] is None or not track_info["tags"]:
        logger.error(f"Failed to get info for {title} by {artist}")
        continue

    logger.info(f"Got info for {title} by {artist}")
    songs_df.loc[index, "title_listeners"] = track_info["listeners"]
    songs_df.loc[index, "title_scrobbles"] = track_info["scrobbles"]
    songs_df.loc[index, "title_tags"] = str(track_info["tags"])
    songs_df.loc[index, "title_duration"] = track_info["duration"]

    if index % 50 == 0:
        songs_df.to_csv("metadata/songs_10_2_pages_v2.csv", index=False)
        logger.info(f"Saved progress at index {index}")

songs_df.to_csv("metadata/songs_10_2_pages_v2.csv", index=False)

[32m2025-03-23 14:45:16.887[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: SHUM - Eurovision 2021 - Ukraine / Karaoke Version by Go_A[0m
[32m2025-03-23 14:45:17.743[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mGot info for SHUM - Eurovision 2021 - Ukraine / Karaoke Version by Go_A[0m
[32m2025-03-23 14:45:17.743[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: Carol of the Bells / God Rest Ye Merry Gentlemen by Mykola Dmytrovych Leontovych[0m
[32m2025-03-23 14:45:18.911[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mGot info for Carol of the Bells / God Rest Ye Merry Gentlemen by Mykola Dmytrovych Leontovych[0m
[32m2025-03-23 14:45:18.912[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mFetching info for track: Leontovich / Arr Prizeman: Carol of the Bells by Mykola Dmytrovych

We still failed to fetch the data for some songs (probably due to special symbols), however they are not relevant for us and we will filter them out later anyway:

In [46]:
songs_df[songs_df["title_listeners"].isnull() | songs_df["title_tags"].isnull()]

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
685,PORNKHAB KRISMAS KLAB + lyrics,Poshlaya Molly,,,[],
931,y = x + 4z,Corn Wave,,,[],
1664,Човен LYRICS + ENG SUB,Odyn v kanoe,,,[],
2044,18+,kavabanga Depo kolibri,,,[],
2287,CTRL+Zzz,Poshlaja Molli,,,[],
2330,ПОШЛАЯ МОЛЛИ - CTRL+Zzz,Poshlaja Molli,,,[],
2450,Рассвет (slowed + reverb),Какая Разница,,,[],
3575,Я тобі брехала (slowed + reverb),Klavdia Petrivna,,,[],
3580,Знайди мене (𝖘𝖑𝖔𝖜𝖊𝖉 + 𝖗𝖊𝖛𝖊𝖗𝖇),Klavdia Petrivna,,,[],
3609,Знайди мене (Slowed + Reverb & remix by KAVA),Klavdia Petrivna,,,[],


## *.csv finalization

Now let's prepare the final *.csv files with artists and songs data and save them to the ```data``` dir:

In [66]:
os.makedirs("data", exist_ok=True)

Save artists list (first 10 pages from LastFM):

In [57]:
artists_df = pd.read_csv("metadata/artists_10_pages_v2.csv")
artists_df.head()

Unnamed: 0,artist,artist_listeners,artist_scrobbles,artist_tags
0,Океан Ельзи,203060,13552898,"['rock', 'ukrainian', 'ukrainian rock', 'indie..."
1,Go_A,189946,4088960,"['ukrainian', 'folktronica', 'folk', 'electron..."
2,Бумбокс,174374,6424069,"['ukrainian', 'funk', 'reggae', 'hip-hop', 'fu..."
3,Скрябін,59090,2821158,"['ukrainian', 'synthpop', 'pop-rock', 'electro..."
4,KALUSH,106178,1850491,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'eu..."


In [67]:
artists_df.to_csv("data/artists_v1.csv", index=False)

Prepare and save songs list:

In [None]:
songs_df = pd.read_csv("metadata/songs_10_2_pages_v2.csv")
more_songs_df = pd.read_csv("metadata/more_sooongs.csv")

In [None]:
songs_df[~songs_df["title"].isin(more_songs_df["title"])]

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration


In [68]:
print("Number of new rows:", len(more_songs_df) - len(songs_df))

new_songs_df = more_songs_df[~more_songs_df.set_index(['title', 'artist']).index.isin(songs_df.set_index(['title', 'artist']).index)]
new_songs_df

Number of new rows: 1283


Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
9571,Stefania (Kalush Orchestra),KALUSH,87761,1066420,"['rap', 'folk', 'ukrainian', 'eurovision', 'eu...",3:00
9572,Додому (feat. Skofka),KALUSH,9175,77198,"['pop', 'ukrainian', 'rap', 'hip-hop']",3:36
9573,Калуські вечорниці (feat. Tember Blanche),KALUSH,7394,61796,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'uk...",3:00
9574,Гори (Gory),KALUSH,7241,47316,"['rap', 'pop rap', 'nfs', '2019', 'ukrainian',...",3:04
9575,Зорi,KALUSH,6000,44853,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'uk...",3:20
...,...,...,...,...,...,...
10849,BIG Bloody,bris,2366,22081,"['punk', 'hardcore', 'rap', 'hip-hop', 'emo', ...",Add lyrics on Musixmatch
10850,Me Important,bris,2350,22318,"['sacremento', 'punk', 'hardcore', 'rap']","I get that re-up, hit my plug, and go and buy ..."
10851,Main Course,bris,2260,18749,"['punk', 'hardcore', 'rap', 'hip-hop', 'emo', ...",3:14
10852,Sparked a Fuse,bris,2070,18948,"['punk', 'hardcore', 'rap', 'hip-hop', 'emo', ...",So we don't need a clue (Like they always do)


In [69]:
all_songs_df = pd.concat([songs_df, new_songs_df], ignore_index=True)
all_songs_df.to_csv("data/songs_v1.csv", index=False)

assert len(all_songs_df) == len(songs_df) + len(new_songs_df)

# Fetched data summary

In [82]:
artists_df = pd.read_csv("data/artists_v1.csv")
artists_df.head()

Unnamed: 0,artist,artist_listeners,artist_scrobbles,artist_tags
0,Океан Ельзи,203060,13552898,"['rock', 'ukrainian', 'ukrainian rock', 'indie..."
1,Go_A,189946,4088960,"['ukrainian', 'folktronica', 'folk', 'electron..."
2,Бумбокс,174374,6424069,"['ukrainian', 'funk', 'reggae', 'hip-hop', 'fu..."
3,Скрябін,59090,2821158,"['ukrainian', 'synthpop', 'pop-rock', 'electro..."
4,KALUSH,106178,1850491,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'eu..."


In [83]:
songs_df = pd.read_csv("data/songs_v1.csv")
songs_df.head()

Unnamed: 0,title,artist,title_listeners,title_scrobbles,title_tags,title_duration
0,Без бою,Океан Ельзи,67241,500416,"['ukrainian', 'rock', 'pop rock', 'alternative...",4:21
1,Обійми,Океан Ельзи,40990,282632,"['ukrainian', 'blues', 'rock', 'alternative ro...",3:44
2,Не питай,Океан Ельзи,39876,303396,"['ukrainian', 'rock', 'love', 'sad', 'rock', '...",2:39
3,Коли тебе нема,Океан Ельзи,37672,239250,"['rock', 'ukrainian', 'rock', 'ukrainian', 'uk...",3:18
4,911,Океан Ельзи,35998,235613,"['rock', 'ukrainian', 'indie rock', 'rock', 'u...",3:29


In [None]:
artists_songs = songs_df["artist"].unique()
artists_artists = artists_df["artist"].unique()

print(f"Number of artists based on songs: {len(artists_songs)}")
print(f"Number of artists based on artists: {len(artists_artists)}")

Number of artists based on songs: 114
Number of artists based on artists: 205


In [81]:
songs_per_artist = songs_df["artist"].value_counts()

print(f"Total number of songs: {len(songs_df)}\n")
print(f"Number of songs per artist:\n{songs_per_artist.describe()}")

Total number of songs: 10854

Number of songs per artist:
count    114.000000
mean      95.210526
std       15.364868
min       11.000000
25%       99.000000
50%      100.000000
75%      100.000000
max      149.000000
Name: artist, dtype: float64


The list of artists for which we didn't fetch any songs (yet):

In [78]:
artists_with_no_songs = set(artists_artists) - set(artists_songs)
artists_with_no_songs

{'-deTach-',
 '100лиця',
 '5’nizza',
 'AKINE',
 'Alyosha',
 'Antytila',
 'Ariadna',
 'BaWn',
 'Bad',
 'BohoMan',
 'BƱBBLE',
 'CHEEV',
 'Cepasa',
 'D4C',
 'DZIDZIO',
 'Dens',
 'ETOLUBOV',
 'Folknery',
 'GXNRC',
 'Gaitana',
 'Godo',
 'Grebz',
 'Howtodie',
 'KRUTЬ',
 'Khayat',
 'Kola',
 'Kulakostas',
 'Latexfauna',
 'Laud',
 'Lely45',
 'Lюк',
 'MamaRika',
 'Mistmorn',
 'NAZVA',
 'OTOY',
 'Parfeniuk',
 'QARPA',
 'Rozhden',
 'Schmalgauzen',
 'Singleton',
 'Siuzanna',
 'SobaKKa',
 'Sudno',
 'Sx1nxwy',
 'TAYANNA',
 'TNMK',
 'Tik',
 'Txpa',
 'Vagonovozhatye',
 'XARAKTER',
 'YAKTAK',
 'Zapaska',
 'ZeFear',
 'Ziferblat',
 'badactress',
 'drwncvnt',
 'krbk',
 'takizava',
 'vioria',
 'Анна',
 'Апатія',
 'Брюссель',
 'Веремій',
 'Вій',
 'Гайдамаки',
 'Гайтана',
 'Дмц',
 'Карна',
 'Колос',
 'Крихітка',
 'Лея',
 'Лилу45',
 'Люсі',
 'Лівінстон',
 'Мандри',
 'Марний',
 'Мері',
 "Мотор'ролла",
 'Мур',
 'Руслана',
 'С.к.а.й.',
 'Сюзанна',
 'Тнмк',
 'Тонка',
 'Тік',
 'Фліт',
 'Фіолет',
 'Юркеш',
 'анастим