# Filtering Ukrainian artists

In [4]:
import pandas as pd
import os

songs_df = pd.read_csv(os.path.join("data", "songs_v1.csv"))
print("Length before filtering: ", len(songs_df))

Length before filtering:  10854


1. Drop duplicates:

In [69]:
songs_df = songs_df.drop_duplicates(subset=["title", "artist"])
print("Length after dropping duplicates: ", len(songs_df))

Length after dropping duplicates:  10804


2. Filter non ukr artists and dublicates of the same artist:

In [70]:
ARTISTS_BLACKLIST = [
    "Poshlaya Molly", "Poshlaja Molli", "–°–º–µ—Ç–∞–Ω–∞ band", "Corn Wave", "–ü–æ—Ç–∞–ø –ò –ù–∞—Å—Ç—è –ö–∞–º–µ–Ω—Å–∫–∏—Ö", 
    "–°–≤–µ—Ç–ª–∞–Ω–∞ –õ–æ–±–æ–¥–∞", "Valentin Strykalo", "–í–µ—Ä–∫–∞ –°–µ—Ä–¥—é—á–∫–∞", "L—ék", "kavabanga Depo kolibri", 
    "Quest Pistols Show", "–ö–∞–∫–∞—è –†–∞–∑–Ω–∏—Ü–∞", "Green Grey", "jockii druce", "–ú–∏—à–∞ –ú–∞—Ä–≤–∏–Ω", 
    "–û–ª—è –ü–æ–ª—è–∫–æ–≤–∞", "–ú–∏—Ö–∞–∏–ª –ë—É–±–ª–∏–∫", "–ê–ª—ë–Ω–∞ –í–∏–Ω–Ω–∏—Ü–∫–∞—è", "–ù–∞—Ç–∞–ª—å—è –ú–æ–≥–∏–ª–µ–≤—Å–∫–∞—è",
    "–¢–∞–∏—Å–∏—è –ü–æ–≤–∞–ª–∏–π", "–°–µ—Ä–≥—ñ–π –ñ–∞–¥–∞–Ω –¢–∞ –°–æ–±–∞–∫–∏ –í –ö–æ—Å–º–æ—Å—ñ", "Poshlaya Molly",
    "Valentin Strykalo", "INGRET", "Svitlana Nianio", "Vasily Richter",
    "MARUV & Boosin", "–°—É—Å—ñ–¥–∏ –°—Ç–µ—Ä–ø–ª—è—Ç—å", "The Feels", "Grandma's smuzi", 
    "–¢–∞–±—É–ª–∞ –†–∞—Å–∞", "–û–ª–µ–≥ –ö–µ–Ω–∑–æ–≤", "Potap & Nastya", "–í–∏—Ç–∞–ª–∏–π –ß–∏—Ä–≤–∞", "TELLY GRAVE",
    "–ê–Ω–∞—Å—Ç–∞—Å–∏—è –ü—Ä–∏—Ö–æ–¥—å–∫–æ", "glichery", "AShamaluevMusic", "daKooka", "4Wheel", "KLIM", "CLONNEX", 
    "bris", "Pencil Legs", "Mykola Dmytrovych Leontovych", "Odyn v kanoe", 
    "–°–≤—è—Ç–æ—Å–ª–∞–≤ –í–∞–∫–∞—Ä—á—É–∫", "DJ Jedy", "–ú–∞—Ä—ñ—è –ß–∞–π–∫–æ–≤—Å—å–∫–∞", "The Ukrainians", "Los Colorados", 
    "–¢—ñ–Ω–∞ –ö–∞—Ä–æ–ª—å", "–Ü—Ä–∏–Ω–∞ –ë—ñ–ª–∏–∫", "national radio"
    ]

songs_df = songs_df[~songs_df["artist"].isin(ARTISTS_BLACKLIST)]
print("Length after removing blacklisted artists: ", len(songs_df))

Length after removing blacklisted artists:  5872


3. Remove songs that have brackets in the title (probably remixes of the original songs) or are too long:

In [71]:
songs_df = songs_df[~songs_df["title"].str.contains("\(") | ~songs_df["title"].str.contains("\)")]
songs_df = songs_df[~songs_df["title"].str.contains("\[") | ~songs_df["title"].str.contains("\]")]
songs_df = songs_df[~songs_df["title"].str.contains("\|")]
songs_df = songs_df[~songs_df["title"].str.contains("\.")]
songs_df = songs_df[songs_df["title"].str.len() < 30]

print("Length after filtering based on song title: ", len(songs_df))

Length after filtering based on song title:  4417


4. Filter non ukr artists based on the songs' tags:

Note: don't include russian here as a considerable amount of ukr songs have a missleading russian tag

In [72]:
TAGS_BLACKLIST = ["finnish", "japanese"]
songs_df = songs_df[~songs_df["title_tags"].str.contains("|".join(TAGS_BLACKLIST))]
print("Length after filtering based on title tags: ", len(songs_df))

Length after filtering based on title tags:  4341


Save the filtered songs to a new *.csv file:

In [None]:
songs_df.to_csv(os.path.join("data", "songs_filtered_v1.csv"), index=False)

## Data summary after filtering

In [93]:
import pandas as pd
import os

filtered_songs_df = pd.read_csv(os.path.join("data", "songs_filtered_v1.csv"))

In [97]:
print("Number of ukr artists: ", len(filtered_songs_df["artist"].unique()))

Number of ukr artists:  62


In [98]:
songs_per_artist = filtered_songs_df["artist"].value_counts()

print(f"Total number of songs: {len(filtered_songs_df)}\n")
print(f"Number of songs per artist:\n{songs_per_artist.describe()}")

Total number of songs: 4341

Number of songs per artist:
count     62.000000
mean      70.016129
std       21.063909
min       18.000000
25%       53.000000
50%       75.500000
75%       87.750000
max      100.000000
Name: artist, dtype: float64


Save the current blacklist version to a *.txt file":

In [None]:
with open(os.path.join("configs", "artists_blacklist_v1.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(ARTISTS_BLACKLIST))

## Concat with YT fetched results and update summary

In [None]:
import pandas as pd
import os

filtered_songs_df = pd.read_csv(os.path.join("data", "songs_filtered_v1.csv"))
len(filtered_songs_df)

4341

In [None]:
audios_5_df = pd.read_csv(os.path.join("metadata", "yt_songs_5_2_pages_filtered.csv"))
audios_10_df = pd.read_csv(os.path.join("metadata", "yt_songs_10_2_pages_filtered.csv"))
audios_more_df = pd.read_csv(os.path.join("metadata", "yt_songs_more_sooongs_filtered.csv"))

audios_df = pd.concat([audios_5_df, audios_10_df, audios_more_df], ignore_index=True)

print("Total: ", len(audios_df))
print("No yt_url: ", len(audios_df[audios_df["yt_url"].isnull()]))

Total:  4068
No yt_url:  602


In [89]:
merged_df = pd.merge(filtered_songs_df, audios_df, on=["title", "artist"], how="left")

print("Columns: ", merged_df.columns.values)
print("Total: ", len(merged_df))
print("No yt_url: ", len(merged_df[merged_df["yt_url"].isnull()]))

Columns:  ['title' 'artist' 'title_listeners_x' 'title_scrobbles_x' 'title_tags_x'
 'title_duration_x' 'title_listeners_y' 'title_scrobbles_y' 'title_tags_y'
 'title_duration_y' 'yt_title' 'yt_url' 'yt_duration' 'yt_views'
 'audio_path']
Total:  4341
No yt_url:  886


In [None]:
merged_df[["title", "artist", "audio_path", "yt_title", "yt_url", "yt_duration", "yt_views"]]\
    .to_csv(os.path.join("data", "yt_songs_filtered_v1.csv"), index=False)

Updated summary after fetching the missing songs:

In [3]:
import pandas as pd
import os

filtered_songs_df = pd.read_csv(os.path.join("data", "yt_songs_filtered_v1.csv"))

print("Columns: ", filtered_songs_df.columns.values)
print("Total: ", len(filtered_songs_df))
print("No yt_url: ", len(filtered_songs_df[filtered_songs_df["audio_path"].isnull()]))

Columns:  ['title' 'artist' 'audio_path' 'yt_title' 'yt_url' 'yt_duration'
 'yt_views']
Total:  4341
No yt_url:  60


# Exploring non unique audio_path

In [5]:
import pandas as pd
import os

yt_df = pd.read_csv(os.path.join("data", "yt_songs_filtered_v1.csv"))

len(yt_df["audio_path"].unique())

3227

In [21]:
non_unique_paths = yt_df["audio_path"].value_counts()[yt_df["audio_path"].value_counts() > 1].index.tolist()
non_unique_paths

['audio\\–ù—É–º–µ—Ä 482 - –î–æ–±—Ä–∏–π —Ä–∞–Ω–æ–∫ –£–∫—Ä–∞—ó–Ω–æ - (–û—Ñ—ñ—Ü—ñ–π–Ω–∏–π –∫–ª—ñ–ø- 2015).mp3',
 'audio\\–Ü–≥–æ—Ä –ö–∞–π–¥–∞—à - —Ü—å–æ–≥–æ –≤–∞—Ä—Ç—É—î –∫–æ—Ö–∞–Ω–Ω—è (official mood video).mp3',
 'audio\\–î–∏–º–Ω–∞ –°—É–ºi—à - –ó–ª–∞–º–∞–Ωi.mp3',
 'audio\\–°–µ—Ä—Ü–µ–≤–∏–π –ù–∞–ø–∞–¥ - –°—É–±–∫—É–ª—å—Ç—É—Ä–∞.mp3',
 'audio\\–ù—É–º–µ—Ä 482 - –í–∞–∂–ª–∏–≤–∞ | Official Video.mp3',
 'audio\\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3',
 "audio\\–á—ó –¥—É—à—ñ –∑—ñ–≤'—è–ª—ñ –∫–≤—ñ—Ç–∏.mp3",
 'audio\\‚óæ–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó ‚óæ –í–æ–Ω–∞ ‚óæ.mp3',
 'audio\\–õ–µ–Ω—Ç–∞ –ó–∞ –õ–µ–Ω—Ç–æ—é.mp3',
 'audio\\–õ—ñ–Ω—ñ—è –ú–∞–Ω–Ω–µ—Ä–≥–µ–π–º–∞ ‚Äì –î–µ —Ç–≤–æ—è –ª—ñ–Ω—ñ—è?.mp3',
 'audio\\–•–æ–ª–æ–¥–Ω–µ –°–æ–Ω—Ü–µ - –¢—ñ–Ω—å –ö–æ—Ö–∞–Ω–Ω—è (remastered HQ).mp3',
 'audio\\–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –ü–æ–¥–æ–±–∞—î—Ç—å—Å—è, —è–∫ —Ç–∏ —ñ–¥–µ—à.mp3',
 'audio\\–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –í–ï–°–ù–ê.mp3',
 'audio\\–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –ë—É–ª–∏ –Ω–∞ —Å–µ–ª—ñ.m

It seems like we have two cases:
1. It is literaly the same song but with slightly different title name in the LastFM platform

In [8]:
yt_df[yt_df["audio_path"] == 'audio\\–ê–ª–æ-–∞–ª–µ.mp3']

Unnamed: 0,title,artist,audio_path,yt_title,yt_url,yt_duration,yt_views
752,–ê–ª–æ-–∞–ª–µ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ê–ª–æ-–∞–ª–µ.mp3,–ê–ª–æ-–∞–ª–µ,https://youtube.com/watch?v=xH6cxIXtGWI,222.0,172200.0
805,–ê–ª–æ –∞–ª–µ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ê–ª–æ-–∞–ª–µ.mp3,–ê–ª–æ-–∞–ª–µ,https://youtube.com/watch?v=xH6cxIXtGWI,222.0,172201.0
812,"–ê–ª–ª–æ, –∞–ª–µ",–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ê–ª–æ-–∞–ª–µ.mp3,–ê–ª–æ-–∞–ª–µ,https://youtube.com/watch?v=xH6cxIXtGWI,222.0,172201.0
813,"–¢–≤–æ—ó –∞–ª–æ, –º–æ—ó –∞–ª–µ",–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ê–ª–æ-–∞–ª–µ.mp3,–ê–ª–æ-–∞–ª–µ,https://youtube.com/watch?v=xH6cxIXtGWI,222.0,172201.0
818,"–ê–ª–æ, –∞–ª–µ",–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ê–ª–æ-–∞–ª–µ.mp3,–ê–ª–æ-–∞–ª–µ,https://youtube.com/watch?v=xH6cxIXtGWI,222.0,172201.0


2. Those are different songs, but the script failed to search the relevant YT audio (in some cases it doesn't even exist on YT)

In [73]:
yt_df[yt_df["audio_path"] == 'audio\\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à.mp3']

Unnamed: 0,title,artist,audio_path,yt_title,yt_url,yt_duration,yt_views,title_norm
1659,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2045107.0,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à
1685,–õ—ñ—Ç–æ,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2045107.0,–õ—ñ—Ç–æ
1704,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ªi—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2045107.0,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à
1705,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ªi—Ç–∞ –∑–∞—Ü–≤i—Ç–µ—à,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2045107.0,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à
1725,"–¢—ñ, —â–æ –∑–∞–±—É–ª–∏",–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2045108.0,"–¢—ñ, —â–æ –∑–∞–±—É–ª–∏"
3260,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,–¢–∞—Ä–∞—Å –ß—É–±–∞–π,audio\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ...,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à,https://youtube.com/watch?v=ergjjMQywHU,217.0,2050516.0,–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à


We need to perform two further postprocessing steps:
1. Normalize artist and titles and filter duplicates
2. Prepare a column which will indicate the need for re-downloading the audio (or setting None if it doesn't exist on YT)

## Artist normalization

In [14]:
print("Number of unique artists:", len(yt_df['artist'].unique()))
yt_df['artist'].unique()

Number of unique artists: 60


array(['–û–∫–µ–∞–Ω –ï–ª—å–∑–∏', 'Go_A', '–ë—É–º–±–æ–∫—Å', 'The Hardkiss', 'alyona alyona',
       '–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ', '–î—Ä—É–≥–∞ –†—ñ–∫–∞', 'Vivienne Mort', '–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞',
       '–ú–µ—Ä—Ç–≤–∏–π –ü—ñ–≤–µ–Ω—å', '–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å', '–ñ–∞–¥–∞–Ω —ñ —Å–æ–±–∞–∫–∏',
       '–ë—Ä–∞—Ç–∏ –ì–∞–¥—é–∫—ñ–Ω–∏', 'The Unsleeping', 'Artem Pivovarov',
       '–°–¢–†–£–ö–¢–£–†–ê –©–ê–°–¢–Ø', '–ö–æ–º—É –í–Ω–∏–∑', '–î–∏–º–Ω–∞ –°—É–º—ñ—à', 'Klavdia Petrivna',
       'Tember Blanche', 'Khrystyna Soloviy', 'dity inzheneriv',
       'PROBASS ‚àÜ HARDI', '–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó', '–•–æ–ª–æ–¥–Ω–µ –°–æ–Ω—Ü–µ', 'Sad Novelist',
       '–°–∫—Ä—è–±—ñ–Ω', '–ü–µ—Ä–∫–∞–ª–∞–±–∞', 'Zlata Ognevich', 'Mad Heads XL',
       'Kalush Orchestra', '–ü—ñ–∫–∫–∞—Ä–¥—ñ–π—Å—å–∫–∞ –¢–µ—Ä—Ü—ñ—è', 'BLOOMS CORDA',
       '–ù–∏—Ü–æ –ü–æ—Ç–≤–æ—Ä–Ω–æ', '–§–∞–∫—Ç–∏—á–Ω–æ –°–∞–º—ñ', 'renie cares', '–û–±—ñ–π–º–∏ –î–æ—â—É',
       '–õ—ñ–Ω—ñ—è –ú–∞–Ω–Ω–µ—Ä–≥–µ–π–º–∞', '–ê—Ä—Å–µ–Ω –ú—ñ—Ä–∑–æ—è–Ω', '–ú–∞—à–∞ –

Duplicates in the artists list:
- Zhadan i Sobaky - –ñ–∞–¥–∞–Ω —ñ —Å–æ–±–∞–∫–∏
- –í–æ–ø–ªi –íi–¥–æ–ø–ª—è—Å–æ–≤–∞ - –í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞
- Kalush Orchestra - Kalush (UPD: separate artists)

In [6]:
def normalize_artist(df):
    df['artist'] = df['artist'].replace('–í–æ–ø–ªi –íi–¥–æ–ø–ª—è—Å–æ–≤–∞', '–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞')
    df['artist'] = df['artist'].replace('Zhadan i Sobaky', '–ñ–∞–¥–∞–Ω —ñ —Å–æ–±–∞–∫–∏')
    return df

In [None]:
yt_df = normalize_artist(yt_df)
print("Number of unique artists after normalization:", len(yt_df['artist'].unique()))
yt_df['artist'].unique()

Number of unique artists after normalization: 60


array(['–û–∫–µ–∞–Ω –ï–ª—å–∑–∏', 'Go_A', '–ë—É–º–±–æ–∫—Å', 'The Hardkiss', 'alyona alyona',
       '–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ', '–î—Ä—É–≥–∞ –†—ñ–∫–∞', 'Vivienne Mort', '–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞',
       '–ú–µ—Ä—Ç–≤–∏–π –ü—ñ–≤–µ–Ω—å', '–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å', '–ñ–∞–¥–∞–Ω —ñ —Å–æ–±–∞–∫–∏',
       '–ë—Ä–∞—Ç–∏ –ì–∞–¥—é–∫—ñ–Ω–∏', 'The Unsleeping', 'Artem Pivovarov',
       '–°–¢–†–£–ö–¢–£–†–ê –©–ê–°–¢–Ø', '–ö–æ–º—É –í–Ω–∏–∑', '–î–∏–º–Ω–∞ –°—É–º—ñ—à', 'Klavdia Petrivna',
       'Tember Blanche', 'Khrystyna Soloviy', 'dity inzheneriv',
       'PROBASS ‚àÜ HARDI', '–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó', '–•–æ–ª–æ–¥–Ω–µ –°–æ–Ω—Ü–µ', 'Sad Novelist',
       '–°–∫—Ä—è–±—ñ–Ω', '–ü–µ—Ä–∫–∞–ª–∞–±–∞', 'Zlata Ognevich', 'Mad Heads XL',
       'Kalush Orchestra', '–ü—ñ–∫–∫–∞—Ä–¥—ñ–π—Å—å–∫–∞ –¢–µ—Ä—Ü—ñ—è', 'BLOOMS CORDA',
       '–ù–∏—Ü–æ –ü–æ—Ç–≤–æ—Ä–Ω–æ', '–§–∞–∫—Ç–∏—á–Ω–æ –°–∞–º—ñ', 'renie cares', '–û–±—ñ–π–º–∏ –î–æ—â—É',
       '–õ—ñ–Ω—ñ—è –ú–∞–Ω–Ω–µ—Ä–≥–µ–π–º–∞', '–ê—Ä—Å–µ–Ω –ú—ñ—Ä–∑–æ—è–Ω', '–ú–∞—à–∞ –

UPD: can be performed without hardcoding via ```cyrtranslit``` library:

In [17]:
import cyrtranslit

print(cyrtranslit.to_cyrillic("Zhadan i Sobaky", "ua"))
print(cyrtranslit.to_latin("–ñ–∞–¥–∞–Ω —ñ –°–æ–±–∞–∫–∏", "ua"))

–ó–≥–∞–¥–∞–Ω —ñ –°–æ–±–∞–∫–∏
≈Ωadan i Sobaky


## Title normalization

In [None]:
dup_sample = yt_df[yt_df["audio_path"] == "audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3"]
dup_sample

Unnamed: 0,title,artist,audio_path,yt_title,yt_url,yt_duration,yt_views
750,–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102050.0
771,–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102050.0
777,–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102050.0
782,–ù–∞ –ø–µ—Ä—à–æ–º—É –ºi—Å—Üi,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102050.0
800,Ty na pershomu misci,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102051.0
801,–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102051.0
814,–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ,https://youtube.com/watch?v=RT6BXmAArYc,212.0,102051.0


In [18]:
dup_sample["title"].value_counts()

–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ        1
–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ     1
–ù–∞ –ø–µ—Ä—à–æ–º—É  –º—ñ—Å—Ü—ñ       1
–ù–∞ –ø–µ—Ä—à–æ–º—É –ºi—Å—Üi        1
Ty na pershomu misci    1
–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ        1
–ù–∞ –ø–µ—Ä—à–æ–º—É  –º—ñ—Å—Ç—ñ       1
Name: title, dtype: int64

### Latin to cyrillic
Handle the case with mix latin and cyrillic letters and convert all to cyrillic ([GitHub](https://github.com/opendatakosovo/cyrillic-transliteration)):

In [7]:
import cyrtranslit
import unicodedata

def fix_mixed_cyrillic(text):
    text = unicodedata.normalize("NFKC", text)
    return cyrtranslit.to_cyrillic(text, "ua")

assert "–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞" == fix_mixed_cyrillic("–í–æ–ø–ªi –í—ñ–¥–æ–ø–ª—è—Åo–≤–∞")

In [None]:
yt_df['title_norm'] = yt_df['title'].apply(fix_mixed_cyrillic)

yt_df[yt_df["audio_path"] == "audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3"]["title_norm"].value_counts()

–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ        2
–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ     1
–ù–∞ –ø–µ—Ä—à–æ–º—É  –º—ñ—Å—Ü—ñ       1
–¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ    1
–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ        1
–ù–∞ –ø–µ—Ä—à–æ–º—É  –º—ñ—Å—Ç—ñ       1
Name: title_norm, dtype: int64

### Brackets

In [8]:
import re

def remove_brackets(string: str):
    return re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', string)

some_str = "Hello [Official version]"
assert remove_brackets(some_str) == "Hello "

### White spaces
Remove trailing white spaces:

In [9]:
def normalize_whitespace(text):
    return re.sub(r"\s+", " ", text).strip()

assert "a b c" == normalize_whitespace(" a    b   c ")

In [52]:
yt_df['title_norm'] = yt_df['title_norm'].apply(normalize_whitespace)

yt_df[yt_df["audio_path"] == "audio\–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å - –ù–∞ –ü–µ—Ä—à–æ–º—É –ú—ñ—Å—Ü—ñ.mp3"]["title_norm"].value_counts()

–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ        3
–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ        2
–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ     1
–¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ    1
Name: title_norm, dtype: int64

### String similarity
Consider the level of string similarity. We choose partial_ratio as it considers substrings and calculates their similarity, which is more suitable in our case:

In [58]:
from rapidfuzz import fuzz

titles = [
    "–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ",
    "–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ",
    "–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ",
    "–¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ"
]

print("WRatio:")   
for i in range(len(titles)):
    for j in range(i + 1, len(titles)):
        print(f"Similarity({titles[i]} <-> {titles[j]}) = {fuzz.WRatio(titles[i], titles[j]):.2f}")

print("\nPartial ratio:")
for i in range(len(titles)):
    for j in range(i + 1, len(titles)):
        print(f"Similarity({titles[i]} <-> {titles[j]}) = {fuzz.partial_ratio(titles[i], titles[j]):.2f}")

WRatio:
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ) = 93.75
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ) = 85.71
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ) = 77.78
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ) = 80.00
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ) = 72.22
Similarity(–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ) = 92.31

Partial ratio:
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ) = 93.75
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ) = 96.77
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—Å–≥–æ–º—É –º—ñ—Å—Ü—ñ) = 87.50
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ) = 90.32
Similarity(–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ <-> –¢–∏ –Ω–∞ –ø–µ—Ä—

Checking the correspondence of the found YT title to the song title seems to be working pretty good (TODO: then the rest of the songs with high similarity score need to be merged into one):

In [74]:
song_titles = yt_df[yt_df["audio_path"] == 'audio\\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à.mp3']["title_norm"].values
yt_title = yt_df[yt_df["audio_path"] == 'audio\\–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à.mp3']["yt_title"].values[0]

for title in song_titles:
    print(f"Similarity({title} <-> {yt_title}) = {fuzz.WRatio(title, yt_title):.2f}")

Similarity(–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 95.00
Similarity(–õ—ñ—Ç–æ <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 30.00
Similarity(–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 95.00
Similarity(–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 95.00
Similarity(–¢—ñ, —â–æ –∑–∞–±—É–ª–∏ <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 32.88
Similarity(–¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à <-> –ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó - –¢–∏ –≤—Ç—Ä–µ—Ç—î —Ü—å–æ–≥–æ –ª—ñ—Ç–∞ –∑–∞—Ü–≤—ñ—Ç–µ—à) = 95.00


Some cornercases: here the "–¢–≤–æ—ó –∞–ª–æ, –º–æ—ó –∞–ª–µ" contains both words "–ê–ª–æ-–∞–ª–µ" and is probably still the same song despite the low similarity:

In [70]:
song_titles = yt_df[yt_df["audio_path"] == 'audio\\–ê–ª–æ-–∞–ª–µ.mp3']["title_norm"].values
yt_title = yt_df[yt_df["audio_path"] == 'audio\\–ê–ª–æ-–∞–ª–µ.mp3']["yt_title"].values[0]

for title in song_titles:
    print(f"Similarity({title} <-> {yt_title}) = {fuzz.WRatio(title, yt_title):.2f}")

Similarity(–ê–ª–æ-–∞–ª–µ <-> –ê–ª–æ-–∞–ª–µ) = 100.00
Similarity(–ê–ª–æ –∞–ª–µ <-> –ê–ª–æ-–∞–ª–µ) = 85.71
Similarity(–ê–ª–ª–æ, –∞–ª–µ <-> –ê–ª–æ-–∞–ª–µ) = 75.00
Similarity(–¢–≤–æ—ó –∞–ª–æ, –º–æ—ó –∞–ª–µ <-> –ê–ª–æ-–∞–ª–µ) = 55.38
Similarity(–ê–ª–æ, –∞–ª–µ <-> –ê–ª–æ-–∞–ª–µ) = 80.00


In [10]:
from rapidfuzz import fuzz

def similarity_score(str_1: str, str_2: str):
    str_1, str_2 = str(str_1).lower(), str(str_2).lower()
    similarity_score = fuzz.WRatio(str_1, str_2)
    return similarity_score

similarity_score("–ù–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ü—ñ", "–¢–∏ –Ω–∞ –ø–µ—Ä—à–æ–º—É –º—ñ—Å—Ç—ñ")

85.71428571428572

### Similarity score on our dataset

In [None]:
yt_df["title-yt_title score"] = yt_df.apply(lambda row: similarity_score(row["title"], row["yt_title"]), axis=1)
yt_df["title-yt_title score"].describe()

count    4341.000000
mean       81.891261
std        24.347289
min         0.000000
25%        85.500000
50%        90.000000
75%        96.000000
max       100.000000
Name: title-yt_title score, dtype: float64

In [93]:
yt_df[(yt_df["title-yt_title score"] < 70) & (yt_df["title-yt_title score"] > 60)][["artist", "title", "yt_title", "title-yt_title score", "yt_url"]]

Unnamed: 0,artist,title,yt_title,title-yt_title score,yt_url
368,–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ,–ü—ñ—à–∏,–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –ü—ñ—à—ã,67.5,https://youtube.com/watch?v=6L5-Psa4K_o
381,–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ,–¥–æ—âi,–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –î–æ—â—ñ,67.5,https://youtube.com/watch?v=JBfCO_t9pmk
458,–î—Ä—É–≥–∞ –†—ñ–∫–∞,–î–µ–Ω—å —ñ –Ω—ñ—á,–î—Ä—É–≥–∞ –†—ñ–∫–∞ - –î–µ–Ω–Ω—ñ—á,67.5,https://youtube.com/watch?v=eyacuWfV88c
629,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞,–°–≤i—Ç,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –°–≤—ñ—Ç,67.5,https://youtube.com/watch?v=t9Z7LR5uBS0
639,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞,–©–æ –±—É–ª–æ,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –ë—É–ª–∏ –Ω–∞ —Å–µ–ª—ñ,61.071429,https://youtube.com/watch?v=Kk6uv80Slj8
1305,–î–∏–º–Ω–∞ –°—É–º—ñ—à,–î–æ–ø–æ–±–∞—á–µ–Ω–Ω—è –ª—é–±–æ–≤,"–î–æ –ø–æ–±–∞—á–µ–Ω–Ω—è, –∫–æ—Ö–∞–Ω–Ω—è",68.421053,https://youtube.com/watch?v=wT0rh7_XhBc
1331,–í–æ–ø–ªi –íi–¥–æ–ø–ª—è—Å–æ–≤–∞,–°–≤i—Ç,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –°–≤—ñ—Ç,67.5,https://youtube.com/watch?v=t9Z7LR5uBS0
1336,–í–æ–ø–ªi –íi–¥–æ–ø–ª—è—Å–æ–≤–∞,–©–æ –±—É–ª–æ,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –ë—É–ª–∏ –Ω–∞ —Å–µ–ª—ñ,61.071429,https://youtube.com/watch?v=Kk6uv80Slj8
1452,Klavdia Petrivna,–ü–∞–º‚Äò—è—Ç–∞–∏ÃÜ,Klavdia Petrivna ‚Äî –ü–∞–º'—è—Ç–∞–π (—Ç–µ—Ö–Ω–æ),60.0,https://youtube.com/watch?v=iSnYw0jmwek
1542,Khrystyna Soloviy,–ö–æ–∞–ª–∞ - Instrumental,,60.0,


In [94]:
yt_df[(yt_df["title-yt_title score"] < 60) & (yt_df["title-yt_title score"] > 50)][["artist", "title", "yt_title", "title-yt_title score", "yt_url"]]

Unnamed: 0,artist,title,yt_title,title-yt_title score,yt_url
375,–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ,–®–∞—Ö–∏ - –ù–∞–∂–∏–≤–æ,–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –®–∞—Ö–∏,60.0,https://youtube.com/watch?v=AMgRGZehGeU
653,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞,–°–æ–ª–Ω–µ—á–Ω—ã–µ –¥–Ω–∏,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –°–æ–Ω—è—á–Ω—ñ –¥–Ω—ñ [Official Video],57.0,https://youtube.com/watch?v=Pm0h5on95Kw
745,–ú–µ—Ä—Ç–≤–∏–π –ü—ñ–≤–µ–Ω—å,Beautifull –∫–∞—Ä–ø–∞—Ç–∏,–ë'—é—Ç—ñ—Ñ—É–ª –ö–∞—Ä–ø–∞—Ç–∏,57.826087,https://youtube.com/watch?v=u-ymCwdgSIY
813,–ö—Ä–∏—Ö—ñ—Ç–∫–∞ –¶–∞—Ö–µ—Å,"–¢–≤–æ—ó –∞–ª–æ, –º–æ—ó –∞–ª–µ",–ê–ª–æ-–∞–ª–µ,55.384615,https://youtube.com/watch?v=xH6cxIXtGWI
1028,Artem Pivovarov,"–û, –ü–∞–Ω–Ω–æ!","–ê—Ä—Ç–µ–º –ü–∏–≤–æ–≤–∞—Ä–æ–≤ & The –í—É—Å–∞ - –û, –ü–∞–Ω–Ω–æ (feat. –î...",57.0,https://youtube.com/watch?v=7egk1oND8x8
1080,Artem Pivovarov,–î–µ–∂–∞–≤—é - Acoustic Version,–ê—Ä—Ç–µ–º –ü–∏–≤–æ–≤–∞—Ä–æ–≤ - –î–µ–∂–∞–≤—é (UA Version),55.16129,https://youtube.com/watch?v=D83kYtgLZpI
1222,–ö–æ–º—É –í–Ω–∏–∑,–õ—ñ—Ä–Ω–∏–∫,–ö–æ–º—É –í–Ω–∏–∑ - –õ—ñ—Ä–∞ (–£–∫—Ä–∞–∏–Ω–∞),51.3,https://youtube.com/watch?v=7hcatw4aTGo
1321,–í–æ–ø–ªi –íi–¥–æ–ø–ª—è—Å–æ–≤–∞,–°–æ–ª–Ω–µ—á–Ω—ã–µ –¥–Ω–∏,–í–æ–ø–ª—ñ –í—ñ–¥–æ–ø–ª—è—Å–æ–≤–∞ - –°–æ–Ω—è—á–Ω—ñ –¥–Ω—ñ [Official Video],57.0,https://youtube.com/watch?v=Pm0h5on95Kw
1450,Klavdia Petrivna,–£–∂–µ —Å–≤—ñ—Ç–∞—î –∞ –º–µ–Ω–µ –Ω–µ–º–∞—î,Klavdia Petrivna ‚Äî –£–∂–µ —Å–≤—ñ—Ç–∞—î,57.575758,https://youtube.com/watch?v=9kdk-xdKsE8
1718,–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó,–ê –≤–æ–Ω–∞ –ª–∏—à –≤–æ–Ω–∞,‚óæ–ü–ª–∞—á –Ñ—Ä–µ–º—ñ—ó ‚óæ –í–æ–Ω–∞ ‚óæ,54.285714,https://youtube.com/watch?v=cKIX5yZEm2g


In [None]:
yt_df[(yt_df["title-yt_title score"] < 50) & (yt_df["title-yt_title score"] > 40)][["artist", "title", "yt_title", "title-yt_title score", "yt_url"]]

Unnamed: 0,artist,title,yt_title,title-yt_title score,yt_url
78,Go_A,Tenera,Go-A - –ñ–ê–õ–¨–ú–ï–ù–Ü–ù–ê [ZHALMENINA] (Audio),45.0,https://youtube.com/watch?v=4vEeYQ9wwuc
80,Go_A,GoaShum,Go_A - SHUM - Ukraine üá∫üá¶ Official Music Video ...,40.714286,https://youtube.com/watch?v=U7-dxzp6Jvs
300,–û–∫–µ–∞–Ω –ï–ª—å–∑–∏,–öi—à–∫–∞,–û–∫–µ–∞–Ω –ï–ª—å–∑–∏ - –ö—ñ—à–∫–∞ | Kishka (official video),48.0,https://youtube.com/watch?v=QQ_GAf_g72I
366,–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ,–ü–∏—à–∏,–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –ü—ñ—à—ã,45.0,https://youtube.com/watch?v=6L5-Psa4K_o
382,–û–¥–∏–Ω –í –ö–∞–Ω–æ–µ,–øi—à–∏,–û–¥–∏–Ω –≤ –∫–∞–Ω–æ–µ - –ü—ñ—à—ã,45.0,https://youtube.com/watch?v=6L5-Psa4K_o
480,Vivienne Mort,–Ü–Ω—ñ–π,"Vivienne Mort ‚Äî –Ü–Ω—ñ–∏ÃÜ [single, 2017]",45.0,https://youtube.com/watch?v=8S9x47MLq0Q
549,Vivienne Mort,Ve—Åhir,Vivienne Mort ‚Äî –í–µ—á—ñ—Ä,45.0,https://youtube.com/watch?v=0uQny5u6q8Q
1046,Artem Pivovarov,–ú–∏—Ä–∞–∂,–ê–†–¢–ï–ú –ü–ò–í–û–í–ê–†–û–í - –ú–Ü–†–ê–ñ [UA Version] (–ü—Ä–µ–º'—î—Ä–∞...,48.0,https://youtube.com/watch?v=ZYYDVQNmrBs
1139,–°–¢–†–£–ö–¢–£–†–ê –©–ê–°–¢–Ø,üÑ¥¬†¬†–°–∏–¥–∞—Ç–∏–≤–∏,SadSvit - –°–∏–ª—É–µ—Ç–∏ (feat. –°–¢–†–£–ö–¢–£–†–ê –©–ê–°–¢–Ø) Lyri...,42.75,https://youtube.com/watch?v=NjDMzkRbSZI
1194,–ö–æ–º—É –í–Ω–∏–∑,–ü–æ–øi–ª,"#02. –ö–û–ú–£ –í–ù–ò–ó - –ü–æ–ø—ñ–ª. ""–†–µ—Ç—Ä–æ—Å–ø–µ–∫—Ç–∏–≤–∞"" —É –õ—å–≤—ñ...",48.0,https://youtube.com/watch?v=IElR7110ug0


In [95]:
yt_df[yt_df["title-yt_title score"] < 50][["artist", "title", "yt_title", "title-yt_title score"]]

Unnamed: 0,artist,title,yt_title,title-yt_title score
78,Go_A,Tenera,Go-A - –ñ–ê–õ–¨–ú–ï–ù–Ü–ù–ê [ZHALMENINA] (Audio),45.000000
80,Go_A,GoaShum,Go_A - SHUM - Ukraine üá∫üá¶ Official Music Video ...,40.714286
103,–ë—É–º–±–æ–∫—Å,TNT,–¢–ù–¢,0.000000
180,alyona alyona,–ú–∞–º–æ,alyona alyona & Jerry Heil - Teresa & Maria (L...,0.000000
198,alyona alyona,Zozulia,alyona alyona & Jerry Heil & Ginger Mane - –ó–æ–∑—É–ª—è,27.692308
...,...,...,...,...
4239,TVORCHI,You,TVORCHI - Heart Of Steel (Eurovision Version) ...,20.000000
4304,–ê–Ω—Ç–∏—Ç—ñ–ª–∞,–ê–º–æ—Ä–µ–º–æ—Ä–µ,–ê–ù–¢–ò–¢–Ü–õ–ê - –ù–ï–í–ò–î–ò–ú–ö–ê (Official HD Ukrainian vers),28.500000
4326,–ê–Ω—Ç–∏—Ç—ñ–ª–∞,–ú–∏–ª–∞,Ed Sheeran ‚Äì 2step ft Antytila [Official Video],0.000000
4337,–ê–Ω—Ç–∏—Ç—ñ–ª–∞,–ë–∞–ª–µ—Ä–∏–Ω–∞,Ed Sheeran ‚Äì 2step ft Antytila [Official Video],0.000000


# Final normalization and filtering

Final approach based on the findings from previous steps:

1. **Non-unique audio_path**: get rid of duplicates of the same song (leave only one song)
2. **Unrelevant audio_path**: mark songs whose title doesn't match the fetched YT audio title

## Dataset merging and preprocessing

First let's prepare the dataframe with metadata about the songs and merge it with the YT dataframe:

In [161]:
songs_df = pd.read_csv(os.path.join("data", "songs_filtered_v1.csv"))
yt_df = pd.read_csv(os.path.join("data", "yt_songs_filtered_v1.csv"))
merged_df = pd.merge(songs_df, yt_df, on=["title", "artist"], how="inner")

assert len(merged_df) == len(yt_df) == len(songs_df)
print("Number of samples before filtering: ", len(merged_df))
print("Columns: ", merged_df.columns.values)
print("Number of uniques audio_path:", len(merged_df["audio_path"].unique()))

Number of samples before filtering:  4341
Columns:  ['title' 'artist' 'title_listeners' 'title_scrobbles' 'title_tags'
 'title_duration' 'audio_path' 'yt_title' 'yt_url' 'yt_duration'
 'yt_views']
Number of uniques audio_path: 3227


In [162]:
merged_df[merged_df["artist"] =="Mad Heads XL"]["title_tags"].value_counts()

['ska', 'ukrainian', 'reggae', 'rock', 'ethnic', 'ska', 'ukrainian', 'reggae']    69
['ska', 'ska', 'ukrainian', 'reggae']                                              2
['ukrainian', 'reggae', 'ska', 'ukrainian', 'reggae']                              1
['ukrainian', 'positive', 'ska', 'ukrainian', 'reggae']                            1
['pop-rock', 'ska', 'ukrainian', 'reggae']                                         1
['ska', 'ska punk', 'ska', 'ukrainian', 'reggae']                                  1
['covers', 'positive', 'ska', 'ukrainian', 'reggae']                               1
['ukrainian', 'folk rock', 'folk song', 'ska', 'ukrainian', 'reggae']              1
['ukrainian rock', 'ska', 'ukrainian', 'reggae']                                   1
['feel good', 'ska', 'ukrainian', 'reggae']                                        1
['ska-punk', 'ska', 'ukrainian', 'reggae']                                         1
['ukrainian', 'punk rock', 'ska', 'ukrainian', 'reggae']         

Normalize the arist names and song titles:

In [163]:
import unicodedata

merged_df = normalize_artist(merged_df)
merged_df["title"] = merged_df["title"].apply(lambda x: normalize_whitespace(unicodedata.normalize("NFKC", x)))

Convert popularity and duration data to integers:

In [3]:
def str_to_int(s: str):
    if not pd.isnull(s):
        return int(float(str(s).replace(",", "")))

In [None]:
merged_df["title_listeners"] = merged_df["title_listeners"].apply(str_to_int).astype("Int64")
merged_df["title_scrobbles"] = merged_df["title_scrobbles"].apply(str_to_int).astype("Int64")
merged_df["yt_views"] = merged_df["yt_views"].apply(str_to_int).astype("Int64")
merged_df["yt_duration"] = merged_df["yt_duration"].apply(str_to_int).astype("Int64")

In [2]:
def duration_to_int(duration: str):
    if pd.isnull(duration):
        return None
    if isinstance(duration, int) or isinstance(duration, float):
        return int(duration)
    parts = duration.split(":")
    if parts[0].isdigit() and parts[1].isdigit():
        return int(parts[0]) * 60 + int(parts[1])

In [None]:
merged_df["title_duration"] = merged_df["title_duration"].apply(duration_to_int).astype("Int64")
merged_df["yt_duration"] = merged_df["yt_duration"].apply(duration_to_int).astype("Int64")

### Title genre tags

Preprocess tags: filter only relevant once and prepare column with "most common" genre tags for each song:

In [166]:
merged_df["title_tags"] = merged_df["title_tags"].apply(eval)

In [167]:
all_tags = set()
for tags in merged_df["title_tags"].values:
    all_tags.update(tags)

print("Total number of unique tags: ", len(all_tags))

Total number of unique tags:  437


In [168]:
# Save all tags to the file
with open('configs/tags_v1.txt', 'w') as f:
    all_tags = sorted(list(all_tags))
    for tag in all_tags:
        f.write(tag + '\n')

In [169]:
# Read the file with manually filtered genre tags
with open('configs/genre_tags_v1.txt', 'r') as f:
    genre_tags = f.read().split('\n')

print(genre_tags)

['', 'alt. rock', 'alternative', 'alternative metal', 'alternative rock', 'ambient', 'ambient pop', 'art rock', 'ballad', 'blues', 'britpop', 'cabaret', 'chanson', 'christmas', 'classic', 'classic rock', 'classical', 'comedy', 'country', 'dark electro', 'depressive rock', 'doom metal', 'ebm', 'edm', 'electric buzz', 'electrofolk', 'electronic', 'electronic rock', 'electronica', 'electropop', 'emotional rock', 'ethereal', 'ethnic', 'ethno', 'ethno-ska', 'experimental', 'experimental rock', 'finnish punk', 'finskii religioznij rock', 'folk', 'folk metal', 'folk pop', 'folk rock', 'folk song', 'folk-punk', 'folk-rock', 'folk-ska', 'folk/rock', 'folktronica', 'funk', 'funky-groove', 'gop rock', 'goth rock', 'gothic', 'gothic love metal', 'gothic metal', 'gothic rock', 'groove', 'grunge', 'gypsy punk', 'hard rock', 'hardcore punk', 'hip hop', 'hip-hop', 'hyperpop', 'indie', 'indie folk', 'indie pop', 'indie rock', 'indie-rock', 'indiepop', 'industrial metal', 'instrumental', 'italo disco', 

In [170]:
set(all_tags) - set(genre_tags)

{'#1',
 '00s',
 '1',
 '10 of 10 stars',
 '10s',
 '1994',
 '2 stars',
 '2000',
 '2004',
 '2005',
 '2007',
 '2010s',
 '2013',
 '2014',
 '2015',
 '2016',
 '2018',
 '2019',
 '2021',
 '2rika',
 '3',
 '3 stars',
 '4 stars',
 '4-5',
 '4dnb',
 '5 stars',
 '80s',
 '90s',
 'a cappella',
 'about me',
 'acapella',
 'acoustic',
 'acoustic guitar',
 'addictive',
 'all',
 'alone with the woods',
 'altair',
 'autumn2011',
 'awesome song',
 'ballade',
 'ballads',
 'bbpe',
 'beautiful',
 'besause i lovvvvve it',
 'booba',
 'boombox',
 'boring',
 'breakbeat',
 'buttons',
 'calm',
 'catherine and coskun',
 'chart may 2013',
 'chill out',
 'chill-out tunes',
 'chillout',
 'christmas 2019',
 'christmas songs',
 'cities',
 'cloudwalkers',
 'cool',
 'cosmos',
 'cover',
 'covers',
 'credo',
 'crossover',
 'dance',
 'dark',
 'dark psytrance',
 'darkwave',
 'dead rooster',
 'depressive',
 'digital',
 'dnb',
 'doom rock',
 'drive',
 'druga rika',
 'drum and bass',
 'drum n bass',
 'dwell',
 'eastbeat',
 'easy lis

In [171]:
def filter_tags(tags):
    return sorted(list(set([tag for tag in tags if tag in genre_tags])))

merged_df["title_tags"] = merged_df["title_tags"].apply(filter_tags)

In [172]:
def tag_list_to_common_genres(tag_list):
    common_genres = [
        'pop', 'rock', 'rap', 'jazz', 'folk', 'funk', 'metal', 'punk', 'blues', 'electro',
        'hip-hop', 'indie', 'classical', 'country', 'soul', 'edm', 'lo-fi', 'alternative',
        'reggae', 'groove',  'grunge', 'gothic', 'ska', 'disco'
    ]
    with open('configs/common_genre_tags_v1.txt', 'w') as f:
        for genre in common_genres:
            f.write(genre + '\n')
    common_tags = []
    for tag in tag_list:
        tag = tag.replace('hip hop', 'hip-hop')
        for genre in common_genres:
            if genre in tag:
                common_tags.append(genre)
    return sorted(list(set(common_tags)))

merged_df['genre_tags'] = merged_df['title_tags'].apply(tag_list_to_common_genres)
merged_df['genre_tags'].head(10)

0      [alternative, pop, rock]
1    [alternative, blues, rock]
2                        [rock]
3                        [rock]
4                 [indie, rock]
5            [indie, pop, rock]
6                        [rock]
7           [alternative, rock]
8                        [rock]
9                        [rock]
Name: genre_tags, dtype: object

## Filtering duplicates after title normalization

Normalize song titles and fetched yt audio titles: cyrillic --> (?) brackets --> white spaces:

In [173]:
def normalize_title(title: str):
    if pd.isnull(title):
        return title
    title = str(title).lower()
    return normalize_whitespace(fix_mixed_cyrillic(title))

merged_df["title_norm"] = merged_df["title"].apply(normalize_title)
merged_df["yt_title_norm"] = merged_df["yt_title"].apply(normalize_title)

Filter duplicates of the same song for each artist separately:

In [174]:
filtered_df = merged_df.dropna(subset=["title", "artist", "audio_path", "yt_url"])

filtered_df = (
    filtered_df.sort_values(by="title_scrobbles", ascending=False)
        .drop_duplicates(subset=["artist", "title_norm"], keep="first")
        .sort_index()  # Restore original order
)

print("Number of samples after filtering: ", len(filtered_df))

Number of samples after filtering:  3960


## Similarity score filtering

Now let's consider the similarity score to check the relevance of the fetched audio:

1) Calculate the similarity score (title, yt_title)
2) Mark songs with similarity score below the threshold as bad_audio (a.k.a the fetched audio doesn't match the song)


In [175]:
TITLE_YT_TITLE_SIMILARITY_THRESHOLD = 60

filtered_df["title-yt_title score"] = filtered_df.apply(lambda row: similarity_score(row["title_norm"], row["yt_title_norm"]), axis=1)
filtered_df["bad_audio"] = filtered_df["title-yt_title score"] < TITLE_YT_TITLE_SIMILARITY_THRESHOLD

3) Group dataset by audio_path (and artist) and consider those with count > 1 (a.k.a. possibly have duplicates)
4) Sort songs in the popularity order
5) Consider only songs with high similarity: mark all as duplicates except for one song with the highest popularity

In [176]:
audio_path_groups = filtered_df.groupby("audio_path").filter(lambda x: len(x) > 1)
print("Number of duplicated audio paths after filtering: ", len(audio_path_groups))

Number of duplicated audio paths after filtering:  1286


In [177]:
filtered_df["duplicate"] = False

for audio_path, group in audio_path_groups.groupby("audio_path"):
    for artist, artist_group in group.groupby("artist"):
        if len(artist_group) == 1:
            continue
        artist_group = artist_group.sort_values(by="title_scrobbles", ascending=False)
        most_popular_relevant_idx = False
        for idx, row in artist_group.iterrows():
            if row["title-yt_title score"] >= TITLE_YT_TITLE_SIMILARITY_THRESHOLD:
                if not most_popular_relevant_idx:
                    most_popular_relevant_idx = idx  # Keep the one with the highest title_scrobbles
                else:
                    filtered_df.loc[idx, "duplicate"] = True

In [178]:
print("Number of duplicates:", filtered_df["duplicate"].sum())
print("Number of bad audio:", filtered_df["bad_audio"].sum())

Number of duplicates: 499
Number of bad audio: 295


In [179]:
relevant_df = filtered_df[~filtered_df["duplicate"] & ~filtered_df["bad_audio"]]
print("Number of relevant samples: ", len(relevant_df))

Number of relevant samples:  3166


## Save merged dataset and update summary

In [180]:
columns_to_save = [
    "title", "artist", "title_listeners", "title_scrobbles", "title_tags", "genre_tags",
    "title_duration", "audio_path", "yt_title", "yt_url", "yt_duration", "yt_views"
]

relevant_df[columns_to_save].to_csv(os.path.join("data", "merged_songs_v1.csv"), index=False)

In [182]:
print("Number of ukr artists: ", len(relevant_df["artist"].unique()))

Number of ukr artists:  60


In [181]:
songs_per_artist = relevant_df["artist"].value_counts()

print(f"Total number of songs: {len(relevant_df)}\n")
print(f"Number of songs per artist:\n{songs_per_artist.describe()}")

Total number of songs: 3166

Number of songs per artist:
count    60.000000
mean     52.766667
std      19.916237
min      16.000000
25%      37.000000
50%      53.500000
75%      66.250000
max      94.000000
Name: artist, dtype: float64


## Artists df preprocessing

In [1]:
import pandas as pd
import os

artists_df = pd.read_csv(os.path.join("data", "artists_v1.csv"))
artists_df

Unnamed: 0,artist,artist_listeners,artist_scrobbles,artist_tags
0,–û–∫–µ–∞–Ω –ï–ª—å–∑–∏,203060,13552898,"['rock', 'ukrainian', 'ukrainian rock', 'indie..."
1,Go_A,189946,4088960,"['ukrainian', 'folktronica', 'folk', 'electron..."
2,–ë—É–º–±–æ–∫—Å,174374,6424069,"['ukrainian', 'funk', 'reggae', 'hip-hop', 'fu..."
3,–°–∫—Ä—è–±—ñ–Ω,59090,2821158,"['ukrainian', 'synthpop', 'pop-rock', 'electro..."
4,KALUSH,106178,1850491,"['ukrainian', 'rap', 'hip-hop', 'ukraine', 'eu..."
...,...,...,...,...
201,Dens,2485,66284,"['post-hardcore', 'christian', 'melodic hardco..."
202,TELLY GRAVE,4942,66814,"['rap', 'hip-hop', 'russian', 'ukraine', 'ukra..."
203,NAZVA,4640,60064,"['ukrainian', 'folk', 'folk pop', 'pop', 'ukra..."
204,Dazzle Dreams,5948,109380,"['ukrainian', 'electronic', 'electropop', 'amb..."


In [5]:
artists_df["artist_listeners"] = artists_df["artist_listeners"].apply(str_to_int).astype("Int64")
artists_df["artist_scrobbles"] = artists_df["artist_scrobbles"].apply(str_to_int).astype("Int64")

In [6]:
artists_df["artist_tags"] = artists_df["artist_tags"].apply(eval)

In [9]:
unique_artist_tags = set()

for tags in artists_df["artist_tags"].values:
    unique_artist_tags = unique_artist_tags.union(set(tags))

print("Number of unique artists tags:", len(unique_artist_tags))

Number of unique artists tags: 352


In [10]:
unique_artist_tags

{'00s',
 '10s',
 '2',
 '2000s',
 '2005',
 '2009',
 '2010s',
 '2017 single',
 '3',
 '4',
 '6 of 10 stars',
 '90s',
 'a cappella',
 'abstract hip-hop',
 'acapella',
 'acid jazz',
 'acoustic',
 'agri-alco-rock',
 'all',
 'alt rock',
 'alternative',
 'alternative metal',
 'alternative rock',
 'ambient',
 'ambient music',
 'anime',
 'anthem',
 'argentina',
 'art pop',
 'art rock',
 'austria',
 'authentic',
 'avant-folk',
 'avant-garde',
 'babkin',
 'band',
 'bands',
 'baroque pop',
 'based',
 'bayracore',
 'beats',
 'beautiful',
 'bedroom pop',
 'belarus',
 'belarusian',
 'best of preselection',
 'blues',
 'bonzaiprogressive',
 'booba',
 'bosnian',
 'british',
 'britpop',
 'cabaret',
 'celebration',
 'chanson',
 'childrens music',
 'chillhop',
 'chillout',
 'chiptune',
 'christian',
 'christian hardcore',
 'christmas',
 'christmas music',
 'christmas songs',
 'classical',
 'cloud rap',
 'club',
 'codeinoslav',
 'comedy',
 'cover',
 'covers',
 'cringecore',
 'cute',
 'cyber-folk',
 'dance',


In [11]:
artists_df.to_csv(os.path.join("data", "artists_v1.csv"), index=False)