In [88]:
import pandas as pd
from youtube_search import YoutubeSearch
from pytubefix import YouTube
import re

### Load Scraping Result

In [89]:
# Load the song list produced by the scraping step; the file uses ';' as its field separator.
data = pd.read_csv('data/data.csv', sep=';')

# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,nama,daerah,keyword,lirik
0,Kicir-Kicir,Jakarta,jakarta,
1,Ondel Ondel,jakarta,Jakarta,
2,Manuk Dadali,jawa barat,jawa barat,


### Search and Download Music Function

In [90]:
def search_yt(query):
    """Search YouTube for `query` and return at most three hits.

    Each hit is the dict produced by `YoutubeSearch(...).to_dict()`, extended
    in place with a `url` key: the hit's `url_suffix` prefixed with the
    YouTube host.
    """
    hits = YoutubeSearch(query, max_results=3).to_dict()

    for hit in hits:
        hit['url'] = 'https://www.youtube.com' + hit['url_suffix']

    return hits

def normalized_yt_title(title):
    """Turn a video title into a filesystem-friendly file stem.

    The title is lower-cased, spaces become underscores, every character
    outside `a-z`, `0-9` and `_` is dropped, and runs of underscores are
    collapsed to a single one.

    Args:
        title: The raw video title.

    Returns:
        The normalised stem. If nothing but underscores survives the
        clean-up (for example a title made only of emoji or non-Latin
        script), the placeholder `'untitled'` is returned so the caller never
        ends up with an empty or underscore-only file name.
    """
    text = title.lower()
    text = text.replace(' ', '_')
    text = re.sub(r'[^a-z0-9_]', '', text)
    text = re.sub(r'_{2,}', '_', text)

    # A stem with no letters or digits would be saved as ".mp3" / "_.mp3".
    if not text.strip('_'):
        return 'untitled'

    return text

def dl_video(query):
    """Download the audio-only stream of a YouTube video into `songs/`.

    Args:
        query: The video URL handed to `YouTube(...)`.

    Returns:
        The string `/songs/<normalized title>.mp3` on success, or None when
        any step raises (the exception is printed, not re-raised).

    Note:
        The returned string is built by hand and starts with a slash; it is
        not the path reported by the downloader. Later cells post-process it,
        so keep the format stable.
    """
    try:
        video = YouTube(query, 'MWEB')
        print(f'Downloading {video.title}...')

        stem = normalized_yt_title(video.title)

        audio_stream = video.streams.get_audio_only()
        # presumably mp3=True makes the downloader append ".mp3" to `filename` - confirm against pytubefix
        audio_stream.download(mp3=True, output_path='songs', filename=stem)
    except Exception as e:
        # Best effort: report the failure and let the caller record a missing path.
        print(e)
        return None
    else:
        return f'/songs/{stem}.mp3'
    

In [91]:
# One entry per row of `data`: the row's metadata plus every search hit that
# was looked up and (attempted to be) downloaded for it.
results = []

for _, song_row in data.iterrows():
    # Query is "<song name> asal <region>".
    hits = search_yt(f"{song_row['nama']} asal {song_row['daerah']}")

    downloaded = []
    entry = {
        'nama': song_row['nama'],
        'daerah': song_row['daerah'],
        'keyword': song_row['keyword'],
        'songs': downloaded
    }

    for hit in hits:
        # dl_video returns None on failure; the hit is still recorded.
        local_path = dl_video(hit['url'])
        downloaded.append({
            'title': hit['title'],
            'url': hit['url'],
            'path': local_path
        })

    results.append(entry)


Downloading KICIR KICIR | Lagu Daerah DKI Jakarta - Betawi | Budaya Indonesia | Dongeng Kita...
Downloading KARAOKE KICIR KICIR    Lagu Daerah Jakarta...
Downloading Kicir Kicir (Lagu Daerah Betawi)...
Downloading ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi | Budaya Indonesia | Dongeng Kita...
Downloading ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ONDEL BETAWI REMIX...
Downloading Sejarah Asal Muasal Nama ONDEL - ONDEL...
Downloading Lirik Lagu manuk Dadali...
Downloading Tari Manuk Dadali-Jawa Barat || "Keragaman Budaya Indonesia" XI IPS 4...
Downloading Aty Surya - Manuk Dadali...


### Data Selection

In [92]:
# One row per entry in `results` (the 'songs' column holds the nested list).
df = pd.DataFrame(results)

# Flatten the nested 'songs' lists into one row per downloaded hit.
song_records = []
for result in results:
    song_records.extend(result['songs'])
df_songs = pd.DataFrame(song_records)

In [93]:
# Preview the flattened song table (title, url, path).
df_songs.head()

Unnamed: 0,title,url,path
0,KICIR KICIR | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=lhZf7qGZzsE&pp...,/songs/kicir_kicir_lagu_daerah_dki_jakarta_bet...
1,KARAOKE KICIR KICIR Lagu Daerah Jakarta,https://www.youtube.com/watch?v=nbO8hjzAN20&pp...,/songs/karaoke_kicir_kicir_lagu_daerah_jakarta...
2,Kicir Kicir (Lagu Daerah Betawi),https://www.youtube.com/watch?v=CcFbCjM2jsM&pp...,/songs/kicir_kicir_lagu_daerah_betawi.mp3
3,ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=wardyOl-EHo&pp...,/songs/ondel_ondel_lagu_daerah_dki_jakarta_bet...
4,ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ...,https://www.youtube.com/watch?v=_X39qcf41ZU&pp...,/songs/ondel_ondel_lagu_daerah_dki_jakarta_ond...


In [94]:
import os

def convert_path(path):
    """Return `path` as an absolute path string, passing None through unchanged."""
    return None if path is None else str(os.path.abspath(path))

# Rewrite every stored path as an absolute path (None entries stay None).
df_songs['path'] = df_songs['path'].map(convert_path)

In [95]:
# Preview the song table after the paths were converted to absolute paths.
df_songs.head()

Unnamed: 0,title,url,path
0,KICIR KICIR | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=lhZf7qGZzsE&pp...,c:\songs\kicir_kicir_lagu_daerah_dki_jakarta_b...
1,KARAOKE KICIR KICIR Lagu Daerah Jakarta,https://www.youtube.com/watch?v=nbO8hjzAN20&pp...,c:\songs\karaoke_kicir_kicir_lagu_daerah_jakar...
2,Kicir Kicir (Lagu Daerah Betawi),https://www.youtube.com/watch?v=CcFbCjM2jsM&pp...,c:\songs\kicir_kicir_lagu_daerah_betawi.mp3
3,ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=wardyOl-EHo&pp...,c:\songs\ondel_ondel_lagu_daerah_dki_jakarta_b...
4,ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ...,https://www.youtube.com/watch?v=_X39qcf41ZU&pp...,c:\songs\ondel_ondel_lagu_daerah_dki_jakarta_o...


In [96]:
# Strip the drive/root prefix (e.g. the three characters "c:\") from a path.
def remove_first_three(path):
    """Make an absolute path relative by dropping its drive and root prefix.

    For the Windows paths this notebook produces (`c:\\songs\\x.mp3`) the
    result is the same as cutting the first three characters. Unlike a blind
    slice, it also handles a POSIX root (`/songs/x.mp3`) and leaves a path
    that is already relative untouched, so re-running the cell does not chew
    further characters off the front.

    Args:
        path: A path string, or None.

    Returns:
        `path` without an optional leading drive letter and leading path
        separators, or None when `path` is None.
    """
    if path is None:
        return None
    # Optional "X:" drive, then one or more leading "\" or "/" separators.
    return re.sub(r'^(?:[A-Za-z]:)?[\\/]+', '', path)

# Drop the leading prefix from every stored path (None entries stay None).
df_songs['path'] = df_songs['path'].map(remove_first_three)

In [97]:
# Preview the song table after the path prefix was removed.
df_songs.head()

Unnamed: 0,title,url,path
0,KICIR KICIR | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=lhZf7qGZzsE&pp...,songs\kicir_kicir_lagu_daerah_dki_jakarta_beta...
1,KARAOKE KICIR KICIR Lagu Daerah Jakarta,https://www.youtube.com/watch?v=nbO8hjzAN20&pp...,songs\karaoke_kicir_kicir_lagu_daerah_jakarta.mp3
2,Kicir Kicir (Lagu Daerah Betawi),https://www.youtube.com/watch?v=CcFbCjM2jsM&pp...,songs\kicir_kicir_lagu_daerah_betawi.mp3
3,ONDEL ONDEL | Lagu Daerah DKI Jakarta - Betawi...,https://www.youtube.com/watch?v=wardyOl-EHo&pp...,songs\ondel_ondel_lagu_daerah_dki_jakarta_beta...
4,ONDEL ONDEL 💞 LAGU DAERAH DKI JAKARTA | ONDEL ...,https://www.youtube.com/watch?v=_X39qcf41ZU&pp...,songs\ondel_ondel_lagu_daerah_dki_jakarta_onde...


### Mengubah MP3 menjadi WAV

In [104]:
# # mengubah mp3 menjadi wav
# import pydub
# from pydub import AudioSegment

# def mp3_to_wav(path):
#     if path is None:
#         return None
#     try:
#         sound = AudioSegment.from_mp3(path)
#         path = path.replace('.mp3', '.wav')
#         sound.export(path, format='wav')
#         return path
#     except Exception as e:
#         print(e)
#         return None

In [105]:
# df_songs['path'] = df_songs['path'].apply(mp3_to_wav)



[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified
[WinError 2] The system cannot find the file specified


#### Durasi Lagu
Masih error, path harus seperti ini:
`songs\senam kicir kicir lomba 2024.mp3`

In [103]:
import os

from pydub import AudioSegment
from pydub.utils import which

# which() returns None when ffmpeg is not on PATH. Assigning that None would
# overwrite whatever converter pydub picked on import, so only override the
# setting when ffmpeg was actually found, and say so loudly when it was not.
ffmpeg_path = which("ffmpeg")
if ffmpeg_path is None:
    print("ffmpeg was not found on PATH; pydub needs ffmpeg/ffprobe to decode audio, "
          "so loading will likely fail with 'cannot find the file specified'.")
else:
    AudioSegment.converter = ffmpeg_path

# One record per song that could be opened: title, path and duration in seconds.
list_songs = []

for _, row in df_songs.iterrows():
    path = row['path']
    # Failed downloads carry no path (None, or NaN if pandas converted it).
    if pd.isna(path):
        continue
    # Separate "audio file is missing" from "decoder is missing": both would
    # otherwise surface as the same OS-level file-not-found error.
    if not os.path.isfile(path):
        print(f"Error processing {path}: audio file not found (cwd: {os.getcwd()})")
        continue
    try:
        audio = AudioSegment.from_file(path)
        print(f'Processing {path}...')
        list_songs.append({
            'title': row['title'],
            'path': path,
            'duration': len(audio) / 1000  # len() is in milliseconds -> seconds
        })
    except Exception as e:
        # Best effort: report the failure and move on to the next song.
        print(f"Error processing {path}: {e}")

Error processing songs\kicir_kicir_lagu_daerah_dki_jakarta_betawi_budaya_indonesia_dongeng_kita.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\karaoke_kicir_kicir_lagu_daerah_jakarta.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\kicir_kicir_lagu_daerah_betawi.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\ondel_ondel_lagu_daerah_dki_jakarta_betawi_budaya_indonesia_dongeng_kita.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\ondel_ondel_lagu_daerah_dki_jakarta_ondel_ondel_betawi_remix.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\sejarah_asal_muasal_nama_ondel_ondel.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\lirik_lagu_manuk_dadali.mp3: [WinError 2] The system cannot find the file specified
Error processing songs\tari_manuk_dadalijawa_barat_keragaman_budaya_indonesia_xi



In [102]:
# Tabulate the songs whose audio could be opened (title, path, duration in seconds).
df_list_songs = pd.DataFrame(list_songs)
df_list_songs