In [33]:
import pandas as pd
from youtube_search import YoutubeSearch
from pytubefix import YouTube
import re
import os

### Load Scraping Result

In [34]:
# Read the scraping result CSV (comma-separated) into a DataFrame.
data = pd.read_csv('data/scrapping_result/lagu_daerah.csv', sep=',')

# Print the column names, then preview the first rows as the cell's output.
print(data.columns)
data.head()

Index(['No', 'Nama Lagu', 'Asal Daerah'], dtype='object')


Unnamed: 0,No,Nama Lagu,Asal Daerah
0,1,Bungong Jeumpa,Aceh
1,2,Jambo – Jambo,Aceh
2,3,Lembah Alas,Aceh
3,4,Aceh Lon Sayang,Aceh
4,5,Tawar Sedenge,Aceh


### Search and Download Music Function

In [35]:
def search_yt(query):
    """Search YouTube for ``query`` and return the results as dicts.

    At most five results are requested. Every result dict gets an extra
    ``'url'`` entry: the YouTube base address followed by the result's
    ``'url_suffix'``.

    Args:
        query: Search text handed to ``YoutubeSearch``.

    Returns:
        The sequence produced by ``YoutubeSearch(...).to_dict()``, with the
        ``'url'`` key added to each item.
    """
    hits = YoutubeSearch(query, max_results=5).to_dict()

    # Attach the absolute watch URL to each hit in place.
    for hit in hits:
        hit['url'] = 'https://www.youtube.com' + hit['url_suffix']

    return hits

def normalized_yt_title(title):
    """Turn a video title into a lowercase, underscore-separated name.

    Steps, in order: lowercase the title, turn every space into ``_``,
    drop every character that is not ``a-z``, ``0-9`` or ``_``, and
    finally collapse runs of underscores into a single one.

    Args:
        title: The title string to normalize.

    Returns:
        The normalized string (may be empty, and may start or end with
        an underscore).
    """
    underscored = title.lower().replace(' ', '_')
    # Anything outside the allowed set is removed, not replaced.
    allowed_only = re.sub(r'[^a-z0-9_]', '', underscored)
    # Removing characters can leave adjacent underscores; squeeze them.
    return re.sub(r'_{2,}', '_', allowed_only)

def dl_video(query):
    """Download the audio-only stream of a video into ``datasets/songs``.

    The file name is the video title passed through
    ``normalized_yt_title``. Any exception raised along the way is printed
    and turned into a ``None`` result instead of propagating.

    Args:
        query: Value handed to ``YouTube(...)`` as its first argument; the
            download loop in this notebook passes the search result's
            ``'url'``.

    Returns:
        ``'datasets/songs/<normalized title>.mp3'`` when no exception was
        raised, otherwise ``None``.
    """
    try:
        # NOTE(review): 'IOS' is presumably the client pytubefix should use - confirm.
        yt = YouTube(query, 'IOS')
        print(f'Downloading {yt.title}...')

        normalized_title = normalized_yt_title(yt.title)

        audio_stream = yt.streams.get_audio_only()
        audio_stream.download(mp3=True, output_path='datasets/songs', filename=normalized_title)
    except Exception as err:
        # Best effort: report the problem and let the caller carry on.
        print(err)
        return None
    else:
        return f'datasets/songs/{normalized_title}.mp3'

In [36]:
def _duration_minutes(raw):
    """Convert a search-result duration into minutes.

    The search results carry the duration as display text. Depending on
    the locale the separator is a colon ("3:45") or a dot ("3.45"), and
    long videos have three fields ("1:02:33"). Passing such text straight
    to ``float`` either raises (colon form, three fields) or misreads the
    seconds as a decimal fraction (dot form), so the fields are parsed
    explicitly here.

    Args:
        raw: Duration text, a plain number, or None/empty when unknown.

    Returns:
        The duration in minutes as a float; 0.0 when unknown.

    Raises:
        ValueError: If the text is not one to three numeric fields.
    """
    if raw is None:
        return 0.0
    if isinstance(raw, (int, float)):
        return float(raw)

    text = str(raw).strip()
    if not text:
        return 0.0

    fields = re.split(r'[:.]', text)
    if len(fields) == 1:
        # A bare number is kept as-is, matching the previous float() behaviour.
        return float(fields[0])
    if len(fields) > 3:
        raise ValueError(f'Unrecognized duration: {raw!r}')

    # Fold [h,] m, s into seconds, then express the total in minutes.
    seconds = 0
    for field in fields:
        seconds = seconds * 60 + int(field)
    return seconds / 60


dl_res = []

for index, row in data[:30].iterrows():
    # Search for songs with the name and region keywords
    keyword = f"{row['Nama Lagu']} asal {row['Asal Daerah']}"
    searched_songs = search_yt(keyword)

    # Loop through the search results
    for song in searched_songs:
        try:
            # Get the duration in minutes, defaulting to 0 when the result has none
            duration = _duration_minutes(song.get('duration', 0))
            if duration <= 5:  # Keep only videos of at most five minutes
                # Download the video using dl_video function
                path = dl_video(song['url'])

                # Append the result to dl_res list; path is None when the
                # download failed, which convert_to_wav later tolerates.
                dl_res.append({
                    'title': song['title'],
                    'nama_lagu': row['Nama Lagu'],
                    'region': row['Asal Daerah'],
                    'keyword': keyword,
                    'duration': duration,
                    'url': song['url'],
                    'path': path
                })
            else:
                print(f"Duration of {song['title']} is too long")
        except Exception as e:
            # Best effort: one bad search result must not stop the whole run.
            print(f"Error processing song: {e}")



Downloading Bungong Jeumpa - Putri Ariani Cover (Lagu Daerah Aceh)...
Downloading Bungong Jeumpa | Lirik dan Terjemahan | Lagu Daerah Aceh | Dongeng Kita...
Downloading Tari Bungong Jeumpa Aceh |Rumah Belajar Mawinsya...
Downloading KAKA ALFARISI - BUNGONG JEUMPA (OFFICIAL VIDEO)...
Downloading TARI "BUNGONG JEUMPA" BERASAL DARI ACEH...
Duration of Jambo Jambo - Lagu Daerah Aceh (dengan Lirik) is too long
Downloading Jambo - Safira Amalia (Official Music Video)...
Duration of Video Lirik Lagu Daerah | Jambo-Jambo is too long
Downloading TARIAN ACEH JAMBO JAMBO...
Downloading Jambo...
Duration of LAGU ALAS ACEH TENGGRA SANIMAH SULAI - LEMBAH ALAS is too long
Downloading Lembah Alas - Lagu Daerah Aceh (dengan Lirik)...
Downloading LEMBAH ALAS - LAGU DAERAH ACEH - Enjoy Pesona Musik Indonesia...
Downloading Lembah Alas Lirik Lagu Alas...
Duration of Lembah alas lagu daerah is too long
Downloading Aceh Lon Sayang - Lagu Daerah Aceh (Lirik dan Terjemahan)...
Downloading Lagu Aceh Lon Sayang

### Save list of songs to csv

In [37]:
# Build a table from the per-video records collected in dl_res.
df = pd.DataFrame(dl_res)

# Write it as a semicolon-separated CSV, leaving out the index column.
df.to_csv('data/results.csv', sep=';', index=False)

### Convert to wav format

In [38]:
import os
import subprocess

# Make sure the WAV output directory exists; no error if it is already there.
os.makedirs('datasets/wav_songs', exist_ok=True)

def convert_to_wav(path):
    """Convert a downloaded audio file to WAV using ffmpeg.

    The output path is derived from ``path`` by replacing ``songs`` with
    ``wav_songs`` and ``.mp3`` with ``.wav``. Note that ``str.replace``
    touches every occurrence, not only the directory name. An output file
    that already exists is reused without running ffmpeg again.

    Args:
        path: Path of the source audio file, or None when there is none.

    Returns:
        The WAV path when the file already exists or the conversion
        succeeded. None when ``path`` is None, the source file is missing,
        ffmpeg exits with an error, or any exception is raised.
    """
    if path is None:
        print(f'File is not found: {path}')
        return None
    try:
        wav_path = path.replace('songs', 'wav_songs').replace('.mp3', '.wav')

        # Reuse a previous conversion.
        if os.path.exists(wav_path):
            return wav_path

        # Without a source file ffmpeg can only fail; report it up front.
        if not os.path.isfile(path):
            print(f'File is not found: {path}')
            return None

        print(f'Converting {path} to:\n{wav_path}')
        completed = subprocess.run(['ffmpeg', '-i', path, wav_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if completed.returncode != 0:
            # A failed run may leave a truncated file behind; remove it so the
            # "already exists" shortcut above never returns a broken WAV.
            if os.path.exists(wav_path):
                os.remove(wav_path)
            error_lines = completed.stderr.decode(errors='replace').strip().splitlines()
            reason = error_lines[-1] if error_lines else f'exit code {completed.returncode}'
            print(f'ffmpeg failed for {path}: {reason}')
            return None

        return wav_path
    except Exception as e:
        # Best effort (e.g. ffmpeg not installed, non-string path): report and skip.
        print(e)
        return None

In [39]:
# Convert every downloaded file and store the resulting WAV path per row;
# convert_to_wav returns None for rows whose 'path' is None.
df['wav_path'] = df['path'].apply(convert_to_wav)

Converting datasets/songs/bungong_jeumpa_putri_ariani_cover_lagu_daerah_aceh.mp3 to:
datasets/wav_songs/bungong_jeumpa_putri_ariani_cover_lagu_daerah_aceh.wav
Converting datasets/songs/bungong_jeumpa_lirik_dan_terjemahan_lagu_daerah_aceh_dongeng_kita.mp3 to:
datasets/wav_songs/bungong_jeumpa_lirik_dan_terjemahan_lagu_daerah_aceh_dongeng_kita.wav
Converting datasets/songs/tari_bungong_jeumpa_aceh_rumah_belajar_mawinsya.mp3 to:
datasets/wav_songs/tari_bungong_jeumpa_aceh_rumah_belajar_mawinsya.wav
Converting datasets/songs/kaka_alfarisi_bungong_jeumpa_official_video.mp3 to:
datasets/wav_songs/kaka_alfarisi_bungong_jeumpa_official_video.wav
Converting datasets/songs/tari_bungong_jeumpa_berasal_dari_aceh.mp3 to:
datasets/wav_songs/tari_bungong_jeumpa_berasal_dari_aceh.wav
Converting datasets/songs/jambo_safira_amalia_official_music_video.mp3 to:
datasets/wav_songs/jambo_safira_amalia_official_music_video.wav
Converting datasets/songs/tarian_aceh_jambo_jambo.mp3 to:
datasets/wav_songs/taria

In [40]:
# Persist the table, now including the 'wav_path' column, as a
# semicolon-separated CSV without the index column.
df.to_csv('data/results_wav.csv', sep=';', index=False)

# The preview goes last: a notebook cell only displays its final expression,
# so a head() call placed before to_csv() was silently discarded.
df.head()