### Modules

In [18]:
import pandas as pd
import logging
from youtube_search import YoutubeSearch
from pytubefix import YouTube
from pydub import AudioSegment

import re
import os
import datetime

# Emit timestamped INFO-level log records for the steps below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make sure both output folders are present before anything is written to them.
for output_dir in ('datasets/songs', 'datasets/wav_songs'):
    os.makedirs(output_dir, exist_ok=True)

### Variables
```yaml
This section contains the configuration for the YouTube search and download steps.
You can change the configuration in the cell below.

YOUTUBE_BASE_URL: The base URL that a video ID is appended to in order to build the video's watch URL
YOUTUBE_SEARCH_MAX_RESULTS: The maximum number of results to return from the youtube search
YOUTUBE_CLIENT_TYPE: The type of client to use when downloading a video from YouTube
YOUTUBE_MAX_VIDEO_LENGTH: The maximum length of a video in seconds
YOUTUBE_MIN_VIDEO_LENGTH: The minimum length of a video in seconds

SONG_SIZE_PER_TITLE: The maximum number of songs to download per song title
SEGMENT_LENGTH: The length of each segment in seconds
```

In [19]:

# --- YouTube search / download settings ---
YOUTUBE_BASE_URL = 'https://www.youtube.com/watch?v='  # prefix joined with a video id
YOUTUBE_SEARCH_MAX_RESULTS = 10  # default result cap for a search
YOUTUBE_CLIENT_TYPE = 'IOS'  # client name passed to YouTube(...)
YOUTUBE_MIN_VIDEO_LENGTH = 30  # seconds
YOUTUBE_MAX_VIDEO_LENGTH = 500  # seconds

# --- Dataset settings ---
SONG_SIZE_PER_TITLE = 5  # songs to collect per title
SEGMENT_LENGTH = 30  # seconds per segment

### Load Scraping Result

In [20]:
# Read the song list (comma-separated, which is the pandas default) and preview it.
data = pd.read_csv('data/lagu_daerah.csv')
data.head()

Unnamed: 0,No,Nama Lagu,Asal Daerah
0,1,Bungong Jeumpa,Aceh
1,2,Jambo – Jambo,Aceh
2,3,Lembah Alas,Aceh
3,4,Aceh Lon Sayang,Aceh
4,5,Tawar Sedenge,Aceh


### Search and Download Music Function

In [21]:
def search_yt(query, max_results=YOUTUBE_SEARCH_MAX_RESULTS):
    """Search YouTube for ``query`` and return the hits as a list of dicts.

    Each hit is a dict produced by ``YoutubeSearch(...).to_dict()`` with an
    extra ``'url'`` key built from ``YOUTUBE_BASE_URL`` and the hit's ``'id'``.
    If anything raises, the error is logged and an empty list is returned.
    """
    try:
        hits = YoutubeSearch(query, max_results).to_dict()
        for hit in hits:
            hit.update(url=YOUTUBE_BASE_URL + hit['id'])
        return hits
    except Exception as err:
        logging.error(f"Error searching YouTube for query '{query}': {err}")
        return []

def normalized_yt_title(title):
    """Turn a video title into a lowercase, underscore-separated file stem.

    Spaces become underscores, every character other than ``a-z``, ``0-9``
    and ``_`` is dropped, and runs of underscores are collapsed to one.
    """
    underscored = title.lower().replace(' ', '_')
    allowed_only = re.sub(r'[^a-z0-9_]', '', underscored)
    return re.sub(r'_{2,}', '_', allowed_only)

def dl_video(url):
    """Download the audio track of a YouTube video as an MP3 under ``datasets/songs``.

    The file name is derived from the video title via ``normalized_yt_title``.
    If a file with that name already exists, the download is skipped and the
    existing path is returned.

    Args:
        url: Watch URL of the video.

    Returns:
        Path of the MP3 file (existing or newly downloaded), or ``None`` when
        the video exposes no audio-only stream or any error occurs. Failures
        are logged rather than raised.
    """
    try:
        yt = YouTube(url, client=YOUTUBE_CLIENT_TYPE)
        normalized_title = normalized_yt_title(yt.title)
        filepath = f'datasets/songs/{normalized_title}.mp3'

        # Skip the download when a file with this name was fetched before.
        # NOTE(review): distinct videos whose titles normalize to the same
        # name share one file -- confirm this is acceptable.
        if os.path.exists(filepath):
            logging.info(f'File already exists: {filepath}')
            return filepath

        logging.info(f'Downloading {yt.title}...')

        # get_audio_only() yields None when no matching stream exists; report
        # that explicitly instead of failing with an opaque AttributeError.
        audio_stream = yt.streams.get_audio_only()
        if audio_stream is None:
            logging.error(f"No audio-only stream available for URL '{url}'")
            return None

        audio_stream.download(output_path='datasets/songs', filename=normalized_title, mp3=True)

        return filepath
    except Exception as e:
        logging.error(f"Error downloading video from URL '{url}': {e}")
        return None
    
def parse_duration(duration_str):
    """Convert a duration label such as ``'3:45'`` or ``'1:02:03'`` to seconds.

    Dots are accepted as field separators in place of colons (``'3.45'``).
    The label must have two fields (minutes, seconds) or three fields
    (hours, minutes, seconds). Only the leading field may exceed 59, so
    durations of 24 hours or more are handled.

    Args:
        duration_str: The duration label to parse.

    Returns:
        The duration in whole seconds.

    Raises:
        TypeError: If ``duration_str`` is not a string.
        ValueError: If the label does not match one of the supported formats.
    """
    if not isinstance(duration_str, str):
        raise TypeError(
            f'duration must be a string, got {type(duration_str).__name__}'
        )

    fields = duration_str.strip().replace('.', ':').split(':')
    if len(fields) not in (2, 3) or not all(field.isdigit() for field in fields):
        raise ValueError(f'Unrecognised duration format: {duration_str!r}')

    values = [int(field) for field in fields]
    # Every field after the first is a sub-unit and must stay below 60.
    if any(value > 59 for value in values[1:]):
        raise ValueError(f'Unrecognised duration format: {duration_str!r}')

    total_seconds = 0
    for value in values:
        total_seconds = total_seconds * 60 + value
    return total_seconds

In [8]:
# Accumulator for one metadata record per downloaded song.
dl_results = list()

In [9]:
# For each of the first 30 songs, search YouTube and keep up to
# SONG_SIZE_PER_TITLE successfully downloaded results whose duration lies
# strictly between the configured minimum and maximum.
for _index, row in data[:30].iterrows():
    # Construct the search keyword
    keyword = f"{row['Nama Lagu']} asal {row['Asal Daerah']}"
    searched_songs = search_yt(keyword)

    downloaded_count = 0
    for song in searched_songs:
        if downloaded_count >= SONG_SIZE_PER_TITLE:
            break

        try:
            duration = parse_duration(song['duration'])
            if not (YOUTUBE_MIN_VIDEO_LENGTH < duration < YOUTUBE_MAX_VIDEO_LENGTH):
                logging.warning(
                    f"Duration of {song['title']} is out of range "
                    f"({YOUTUBE_MIN_VIDEO_LENGTH}-{YOUTUBE_MAX_VIDEO_LENGTH} seconds, exclusive): "
                    f"{duration} seconds"
                )
                continue

            path = dl_video(song['url'])
            if path is None:
                # dl_video logs the cause itself; a failed download must not
                # be recorded or counted toward the per-title quota.
                logging.warning(f"Skipping {song['title']}: download failed")
                continue

            # Append the result to the dl_results list
            dl_results.append({
                'title': song['title'],
                'nama_lagu': row['Nama Lagu'],
                'region': row['Asal Daerah'],
                'keyword': keyword,
                'duration': duration,
                'url': song['url'],
                'path': path
            })
            logging.info(f"Downloaded: {song['title']} from {song['url']}")

            downloaded_count += 1
        except Exception as e:
            logging.error(f"Error processing song {song['title']}: {e}")

    # Report the tally for every keyword, including those that ran out of
    # search results before reaching the quota.
    logging.info(f"Downloaded {downloaded_count} songs for '{keyword}'")

2024-11-27 10:56:31,087 - INFO - Downloading Bungong Jeumpa - Putri Ariani Cover (Lagu Daerah Aceh)...
2024-11-27 10:56:33,302 - INFO - Downloaded: Bungong Jeumpa - Putri Ariani Cover (Lagu Daerah Aceh) from https://www.youtube.com/watch?v=RjI7W_XmzLM
2024-11-27 10:56:33,525 - INFO - Downloading Bungong Jeumpa | Lirik dan Terjemahan | Lagu Daerah Aceh | Dongeng Kita...
2024-11-27 10:56:36,097 - INFO - Downloaded: Bungong Jeumpa | Lirik dan Terjemahan | Lagu Daerah Aceh | Dongeng Kita from https://www.youtube.com/watch?v=lAVi2OE2bRY
2024-11-27 10:56:36,318 - INFO - Downloading Tari Bungong Jeumpa Aceh |Rumah Belajar Mawinsya...
2024-11-27 10:56:37,785 - INFO - Downloaded: Tari Bungong Jeumpa Aceh |Rumah Belajar Mawinsya from https://www.youtube.com/watch?v=W3PRdCxocM0
2024-11-27 10:56:38,023 - INFO - Downloading KAKA ALFARISI - BUNGONG JEUMPA (OFFICIAL VIDEO)...
2024-11-27 10:56:43,243 - INFO - Downloaded: KAKA ALFARISI - BUNGONG JEUMPA (OFFICIAL VIDEO) from https://www.youtube.com/watc

### Save list of songs to csv

In [22]:
# Build a table from the collected download records and write it out as CSV.
df = pd.DataFrame(data=dl_results)
df.to_csv(path_or_buf='data/downloaded_songs.csv', index=False)

### Convert to wav format

In [25]:
def convert_to_wav(path):
    """Convert an audio file to WAV and return the path of the WAV file.

    The output keeps the source file's base name with a ``.wav`` extension.
    When the source sits directly inside a directory named ``songs``, the
    output goes to the sibling ``wav_songs`` directory; otherwise it is
    written next to the source. Only that directory component and the file
    extension are rewritten, so a file name that contains ``songs`` or
    ``.mp3`` is left intact. An existing WAV file is reused.

    Args:
        path: Path of the source audio file, or ``None``.

    Returns:
        The WAV path, or ``None`` when ``path`` is ``None``/not path-like or
        the conversion fails. Failures are logged rather than raised.
    """
    if path is None:
        logging.error('File path is None')
        return None

    try:
        # Raises TypeError for non-path values (e.g. NaN), handled below.
        path = os.fspath(path)

        source_dir, source_name = os.path.split(path)
        stem = os.path.splitext(source_name)[0]

        parent_dir, leaf_dir = os.path.split(source_dir)
        if leaf_dir == 'songs':
            target_dir = os.path.join(parent_dir, 'wav_songs')
        else:
            target_dir = source_dir
        wav_path = os.path.join(target_dir, f'{stem}.wav')

        if os.path.exists(wav_path):
            logging.info(f'File already exists: {wav_path}')
            return wav_path

        logging.info(f'Converting {path} to {wav_path}')

        # Do not rely on the output folder having been created elsewhere.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        audio = AudioSegment.from_file(path)
        audio.export(wav_path, format='wav')

        return wav_path
    except Exception as e:
        logging.error(f'Exception occurred: {e}')
        return None

In [26]:
# Convert every downloaded file and store the resulting WAV path per row
# (None where the conversion did not succeed).
df = df.assign(wav_path=df['path'].apply(convert_to_wav))

2024-11-27 11:29:11,751 - INFO - Converting datasets/songs/bungong_jeumpa_putri_ariani_cover_lagu_daerah_aceh.mp3 to datasets/wav_songs/bungong_jeumpa_putri_ariani_cover_lagu_daerah_aceh.wav
2024-11-27 11:29:12,203 - INFO - Converting datasets/songs/bungong_jeumpa_lirik_dan_terjemahan_lagu_daerah_aceh_dongeng_kita.mp3 to datasets/wav_songs/bungong_jeumpa_lirik_dan_terjemahan_lagu_daerah_aceh_dongeng_kita.wav
2024-11-27 11:29:12,708 - INFO - Converting datasets/songs/tari_bungong_jeumpa_aceh_rumah_belajar_mawinsya.mp3 to datasets/wav_songs/tari_bungong_jeumpa_aceh_rumah_belajar_mawinsya.wav
2024-11-27 11:29:13,198 - INFO - Converting datasets/songs/kaka_alfarisi_bungong_jeumpa_official_video.mp3 to datasets/wav_songs/kaka_alfarisi_bungong_jeumpa_official_video.wav
2024-11-27 11:29:13,817 - INFO - Converting datasets/songs/tari_bungong_jeumpa_berasal_dari_aceh.mp3 to datasets/wav_songs/tari_bungong_jeumpa_berasal_dari_aceh.wav
2024-11-27 11:29:14,295 - INFO - Converting datasets/songs/ja

In [27]:
# Save the table first, then preview it: Jupyter only renders the value of a
# cell's last expression, so the preview has to come last to be displayed.
df.to_csv('data/songs_wav.csv', index=False)

df.head()