In [4]:
import pandas as pd
from youtube_search import YoutubeSearch
from pytubefix import YouTube
import re
import os

### Load Scraping Result

In [5]:
# Read the semicolon-delimited scraping output into a DataFrame.
data = pd.read_csv(
    'data/scrapping_result/data.csv',
    delimiter=';',
)

# Preview the first rows as this cell's output.
data.head()

Unnamed: 0,nama,daerah,keyword,lirik
0,Kicir-Kicir,Jakarta,jakarta,
1,Ondel Ondel,jakarta,Jakarta,
2,Manuk Dadali,jawa barat,jawa barat,


### Search and Download Music Function

In [6]:
def search_yt(query, max_results=3):
    """Search YouTube and return the hits with an absolute ``url`` added.

    Args:
        query: Search text passed to ``YoutubeSearch``.
        max_results: Upper bound on the number of hits requested
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        A list of the result dicts produced by
        ``YoutubeSearch(...).to_dict()``, each extended with a ``'url'`` key
        built from its ``'url_suffix'``. Hits without a usable
        ``'url_suffix'`` are left out, because no watch URL can be built
        for them.
    """
    results = []

    for hit in YoutubeSearch(query, max_results=max_results).to_dict():
        suffix = hit.get('url_suffix')
        if not suffix:
            # Concatenating a missing suffix would raise TypeError and abort
            # the caller's whole download loop; skip the hit instead.
            continue

        hit['url'] = 'https://www.youtube.com' + suffix
        results.append(hit)

    return results

def normalized_yt_title(title):
    """Turn a video title into a lowercase, underscore-separated file stem.

    Spaces become underscores, every character outside ``a-z``, ``0-9`` and
    ``_`` is dropped, and runs of underscores are collapsed into one.
    Leading/trailing underscores are trimmed, and ``'untitled'`` is returned
    when nothing usable is left (for example a title made only of emoji or
    non-Latin script), so callers never end up with an empty file name.

    Args:
        title: Raw video title.

    Returns:
        A non-empty string containing only ``a-z``, ``0-9`` and ``_``.
    """
    text = title.lower().replace(' ', '_')
    text = re.sub(r'[^a-z0-9_]', '', text)
    # Removing symbols can leave doubled or dangling underscores behind.
    text = re.sub(r'_{2,}', '_', text).strip('_')

    return text or 'untitled'

def dl_video(query, output_path='datasets/songs', client='ANDROID'):
    """Download the audio-only stream of a YouTube video.

    Args:
        query: URL of the video, handed to ``YouTube``.
        output_path: Directory the file is written to (defaults to the
            previously hard-coded ``'datasets/songs'``).
        client: Client name passed as the second argument to ``YouTube``.
            The default keeps the original ``'ANDROID'``; pytubefix prints
            a warning that this client needs a PoToken for functional
            streams, so pass another client name supported by pytubefix if
            downloads fail.

    Returns:
        ``'<output_path>/<normalized title>.mp3'`` on success, or ``None``
        when the video has no audio-only stream or any step raised. Errors
        are printed rather than propagated, so one bad video does not stop
        a batch run.
    """
    try:
        yt = YouTube(query, client)
        print(f'Downloading {yt.title}...')

        normalized_title = normalized_yt_title(yt.title)

        ys = yt.streams.get_audio_only()
        if ys is None:
            # Say what went wrong instead of surfacing an opaque
            # "'NoneType' object has no attribute 'download'".
            print(f'No audio-only stream found for {query}')
            return None

        # NOTE(review): the returned path assumes `mp3=True` makes pytubefix
        # append ".mp3" to `filename` - confirm against the installed version.
        ys.download(mp3=True, output_path=output_path, filename=normalized_title)

        return f'{output_path}/{normalized_title}.mp3'
    except Exception as e:
        print(e)
        return None
    

In [7]:
# Longest accepted video length, in the "minutes.seconds" notation of the
# search results (5 means five minutes).
MAX_DURATION = 5


def _parse_duration(raw):
    """Convert a search-result duration into a ``minutes.seconds`` float.

    Accepts both ``"3.45"`` and ``"3:45"`` (the separator differs between
    result formats) as well as plain numbers. Values with more than two
    components (``"1:02:03"``, i.e. an hour or more) map to ``inf`` so they
    are always treated as too long. Returns ``None`` when the value cannot
    be interpreted.
    """
    parts = re.split(r'[:.]', str(raw).strip())
    if len(parts) > 2:
        return float('inf')
    try:
        return float('.'.join(parts))
    except ValueError:
        return None


dl_res = []

for _, row in data.iterrows():
    # search for songs with the name and region keywords
    keyword = f"{row['nama']} asal {row['daerah']}"

    # loop through the search results
    for song in search_yt(keyword):
        duration = _parse_duration(song['duration'])

        if duration is None:
            # An unreadable duration used to raise ValueError and abort the run.
            print(f"Duration of {song['title']} is unknown ({song['duration']!r}), skipping")
            continue

        if duration > MAX_DURATION:
            print(f"Duration of {song['title']} is too long")
            continue

        # download the audio; `path` is None when the download failed
        path = dl_video(song['url'])

        # record the attempt either way so failures stay visible in the table
        dl_res.append({
            'title': song['title'],
            'region': row['daerah'],
            'keyword': keyword,
            'duration': duration,
            'url': song['url'],
            'path': path
        })
        

The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading KICIR KICIR - Lagu dan Tari Nusantara - Lagu Anak...


The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading KICIR KICIR | Lagu Daerah Jakarta (Betawi) | Diva bernyanyi | Diva The Series Official...
Downloading Kicir Kicir Jakarta...


The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi | Budaya Indonesia | Dongeng Kita...


The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ONDEL BETAWI REMIX...
Downloading Lirik Lagu ONDEL ONDEL Benyamin Sueb | Lagu Betawi Asli | Lagu Daerah Provinsi DKI Jakarta...


The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading Lagu manuk dadali...


The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
The ANDROID client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209


Downloading Aty Surya - Manuk Dadali...
Downloading Lagu Daerah Manuk Dadali dan Lirik "Apa Ya Makna Lagu Daerah Kita?"...


### Save the list of songs to CSV

In [8]:
# One row per download attempt collected in `dl_res`.
df = pd.DataFrame(data=dl_res)

# Write the table as a semicolon-delimited file, without the row index.
df.to_csv(
    'data/results.csv',
    index=False,
    sep=';',
)

### Convert to WAV format

In [11]:
import os
import subprocess

# Create the output directory for converted files; no error if it already exists.
os.makedirs('datasets/wav_songs', exist_ok=True)

def convert_to_wav(path, out_dir='datasets/wav_songs'):
    """Convert an audio file to WAV with ffmpeg and return the new path.

    The output is ``<out_dir>/<input file stem>.wav``. Only the file
    extension is swapped, so a name that itself contains ``songs`` or
    ``.mp3`` is left intact. An already existing output file is reused
    without running ffmpeg again.

    Args:
        path: Path of the source audio file. ``None`` or any non-string
            value (e.g. NaN from a DataFrame) is treated as a missing file.
        out_dir: Directory for the converted file (defaults to the
            previously hard-coded ``'datasets/wav_songs'``).

    Returns:
        The WAV path on success, or ``None`` when there is no input path,
        ffmpeg could not be started, or ffmpeg exited with a non-zero code.
    """
    if not isinstance(path, str) or not path:
        print(f'File is not found: {path}')
        return None

    stem = os.path.splitext(os.path.basename(path))[0]
    wav_path = f'{out_dir}/{stem}.wav'

    if os.path.exists(wav_path):
        return wav_path

    print(f'Converting {path} to:\n{wav_path}')
    try:
        os.makedirs(out_dir, exist_ok=True)
        result = subprocess.run(
            ['ffmpeg', '-i', path, wav_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except Exception as e:
        # e.g. ffmpeg is not installed or not on PATH
        print(e)
        return None

    if result.returncode != 0:
        # ffmpeg reports failures through its exit code; without this check
        # a path to a missing or truncated file would be returned.
        tail = result.stderr.decode(errors='replace').strip().splitlines()[-1:]
        print(f"ffmpeg failed for {path}: {' '.join(tail)}")
        # Remove any partial output so the "already exists" shortcut above
        # does not hand it back on the next run.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        return None

    return wav_path

In [12]:
# Convert every downloaded file and keep the resulting WAV path
# (None where the download or the conversion failed) next to the source path.
df['wav_path'] = df['path'].map(convert_to_wav)

Converting datasets/songs/lirik_lagu_ondel_ondel_benyamin_sueb_lagu_betawi_asli_lagu_daerah_provinsi_dki_jakarta.mp3 to:
datasets/wav_songs/lirik_lagu_ondel_ondel_benyamin_sueb_lagu_betawi_asli_lagu_daerah_provinsi_dki_jakarta.wav
Converting datasets/songs/lagu_manuk_dadali.mp3 to:
datasets/wav_songs/lagu_manuk_dadali.wav


In [14]:
# Persist the table, now including the `wav_path` column.
df.to_csv('data/results_wav.csv', sep=';', index=False)

# Preview last: a notebook cell only renders its final expression, so a
# `df.head()` placed before `to_csv` produced no output.
df.head()