In [1]:
import pickle
import re

import requests
import tqdm
from bs4 import BeautifulSoup

### Парсим Everynoise

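Достаём из сервиса **Everynoise** идентификаторы Spotify (Spotify ID) исполнителей для каждого жанра, доступного на Everynoise. В коде ниже отбираются только жанры, в названии которых встречается «rap» или «hip hop» / «hip-hop».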

In [2]:
def get_genre_pages(timeout=30):
    """Collect the Everynoise genre pages of rap / hip-hop genres.

    Downloads the Everynoise front page, walks every ``div`` with class
    ``genre scanme`` and keeps the ones whose name matches the rap / hip-hop
    pattern below.

    :param timeout: seconds to wait for the server before giving up
    :return: list of ``(genre_full_name, href)`` tuples; ``href`` is copied
        verbatim from the genre's link and has to be prefixed with the site
        URL before it can be requested
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = "https://everynoise.com/"
    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "lxml")

    links = []

    for genre_div in soup.find_all("div", class_="genre scanme"):
        genre_full_name = genre_div.text[:-2]  # to remove the "» " part

        if not re.search("(^rap| rap|-rap|hip hop|hip-hop)", genre_full_name):
            continue

        anchor = genre_div.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            # A genre entry without a link cannot be followed; skip it.
            continue

        links.append((genre_full_name, href))

    return links


def get_artist_names(genre_page_url, timeout=30):
    """Collect the artists listed on a single Everynoise genre page.

    :param genre_page_url: address of the genre page relative to
        ``https://everynoise.com/``
    :param timeout: seconds to wait for the server before giving up
    :return: list of ``(everynoise_artist_name, href)`` tuples, one per
        ``div`` with class ``genre scanme`` that carries a link
    :raises requests.HTTPError: if the server answers with an error status
    """

    full_address = "https://everynoise.com/" + genre_page_url
    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(full_address, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty artist list.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "lxml")

    artists_ids = []

    for name_div in soup.find_all("div", class_="genre scanme"):
        anchor = name_div.find("a")
        # NOTE(review): the notebook treats this href as the Spotify artist id;
        # its exact format is not visible here — confirm against the page markup.
        spotify_artist_id = anchor.get("href") if anchor is not None else None
        if spotify_artist_id is None:
            # An entry without a link carries no id; skip it.
            continue

        # Drops the last two characters of the div text — presumably a
        # trailing "» " marker; confirm against the page markup.
        everynoise_artist_name = name_div.text[:-2]
        artists_ids.append((everynoise_artist_name, spotify_artist_id))

    return artists_ids

In [3]:
# (genre_full_name, href) pairs of the genre pages selected by get_genre_pages().
links = get_genre_pages()

In [4]:
# Maps a genre's full name to the list of artists collected from its page.
genre_names = {}

In [7]:
# Download every selected genre page (one HTTP request per genre) and file
# its artist list under the genre's full name.
for genre_name, href in tqdm.tqdm(links):
    artists_ids = get_artist_names(href)
    genre_names[genre_name] = artists_ids

100%|██████████| 424/424 [02:18<00:00,  3.07it/s]


In [8]:
# Persist the genre -> artists mapping so the crawl does not have to be repeated.
with open("everynoise_genres_artists_ids.pickle", mode="wb") as out_file:
    pickle.dump(genre_names, out_file)