# Import Modules

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

In [14]:
# Ensure the output folder for the CSV files exists; a pre-existing folder is not an error.
os.makedirs(name='data', exist_ok=True)

# Scraping

In [15]:
# Page holding the table of regional songs; fetched by the scraping cell below.
BASE_URL = 'https://www.zonareferensi.com/lagu-daerah-indonesia/'

In [18]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the song table.
res = requests.get(BASE_URL, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# The songs live in the first <table> of the page.
tables = soup.find_all('table')
if not tables:
    raise ValueError(f'No <table> element found at {BASE_URL}')
table = tables[0]
rows = table.find_all('tr')

data = []

# rows[0] is the header row, so start from the second row.
for row in rows[1:]:
    cells = [ele.text.strip() for ele in row.find_all('td')]
    cells = [text for text in cells if text]
    # Rows without any <td> text (e.g. spacer or <th>-only rows) would
    # otherwise end up as all-NaN records in the DataFrame.
    if not cells:
        continue
    data.append(cells)

# Passing the column names up front also gives a well-formed (empty) frame
# when no rows were scraped, instead of a length-mismatch error.
df = pd.DataFrame(data, columns=['no', 'nama_lagu', 'asal'])

df.to_csv('data/lagu_daerah.csv', index=False)

# Cleaning Data

In [19]:
# Reload the table from the CSV written by the scraping step.
df = pd.read_csv('data/lagu_daerah.csv')
# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=42)

Unnamed: 0,no,nama_lagu,asal
385,386,Ya Saman,Sumatera Selatan
9,10,Meyong-Meyong,Bali
153,154,Bantelan,Kalimantan Barat
379,380,Tak Tong-Tong,Sumatera Barat
22,23,Lalan Belek,Bengkulu


In [20]:
# List every row whose song title appears more than once, grouped by title.
df.loc[df.duplicated(subset=['nama_lagu'], keep=False)].sort_values(by='nama_lagu')

Unnamed: 0,no,nama_lagu,asal
287,288,Bolelebo,Nusa Tenggara Timur
276,277,Bolelebo,Nusa Tenggara Timur
19,20,Dayung Sampan,Banten
52,53,Dayung Sampan,Jakarta
143,144,Gai Bintang,Jawa Timur Madura
117,118,Gai Bintang,Jawa Timur
104,105,Gek Kepriye,Jawa Tengah
97,98,Gek Kepriye,Jawa Tengah
100,101,Jamuran,Jawa Tengah
107,108,Jamuran,Jawa Tengah


In [21]:
# Keep only the first occurrence of each song title (keep='first' is the default).
# Note: titles are compared on their own, so same-title rows listed under
# different regions are collapsed into one as well.
df_clean = df.drop_duplicates(subset=['nama_lagu'])

In [22]:
# Report the row counts before and after de-duplication, one per line.
print(
    f'Before drop duplicated data: {len(df)}',
    f'After drop duplicated data: {len(df_clean)}',
    sep='\n',
)

Before drop duplicated data: 418
After drop duplicated data: 404


In [23]:
# Persist the de-duplicated table. This writes to the same path the scraping
# step used, so the raw scrape on disk is replaced by the cleaned version.
df_clean.to_csv(path_or_buf='data/lagu_daerah.csv', index=False)

# Scraping Lagu Daerah Alternatif

In [24]:
# Alternative source page for the regional-song table.
# NOTE: this rebinds BASE_URL, which the first scraping cell also reads;
# re-running that earlier cell after this one would fetch this page instead.
BASE_URL = 'https://dianisa.com/lagu-daerah-indonesia-beserta-lirik-dan-asalnya/'

In [26]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the song table.
res = requests.get(BASE_URL, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.text, 'html.parser')

# find table with class has-fixed-layout
tables = soup.find_all('table', class_='has-fixed-layout')
if not tables:
    raise ValueError(f'No table with class "has-fixed-layout" found at {BASE_URL}')
table = tables[0]

rows = table.find_all('tr')

data = []

# rows[0] is the header row, so start from the second row.
for row in rows[1:]:
    cells = [ele.text.strip() for ele in row.find_all('td')]
    cells = [text for text in cells if text]
    # Rows without any <td> text (e.g. spacer or <th>-only rows) would
    # otherwise end up as all-NaN records in the DataFrame.
    if not cells:
        continue
    data.append(cells)

# Passing the column names up front also gives a well-formed (empty) frame
# when no rows were scraped, instead of a length-mismatch error.
df = pd.DataFrame(data, columns=['nama_lagu', 'asal'])

In [27]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,nama_lagu,asal
0,Yamko Rambe Yamko,Papua
1,Rasa Sayange,Maluku
2,Manuk Dadali,Jawa Barat
3,Gundul Pacul,Jawa Tengah
4,Jali – Jali,Jakarta


In [30]:
# Order the songs by their 'asal' (origin) column, then preview the result.
df = df.sort_values(by='asal', ascending=True)

df.head(n=5)

Unnamed: 0,nama_lagu,asal
21,Bungong Jeumpa,Aceh
22,Mecepat – Cepetan,Bali
19,Sarinande,Bali
104,Putri,Bali
68,Ratu Anom,Bali


In [31]:
# Write the sorted alternative song list to disk without the index column.
df.to_csv(path_or_buf='data/lagu_daerah_alternatif.csv', index=False)