In [None]:
# db 정보 csv에 저장/불러오기 함수선언
import pandas as pd
import os

# csv read
def make_dataframe(csv_path):
  """Load the song table stored at ``csv_path``, or start an empty one.

  Args:
    csv_path: Location of the CSV file.

  Returns:
    The DataFrame read from ``csv_path`` when that path exists; otherwise a
    new, empty DataFrame with the columns ``id``, ``title`` and ``singer``.
  """
  if not os.path.exists(csv_path):
    empty_columns = {
        'id': [],      # YouTube id
        'title': [],   # song title
        'singer': [],  # artist name(s); several artists are comma-separated
    }
    table = pd.DataFrame(empty_columns)
    print("DataFrame created.")
    return table

  table = pd.read_csv(csv_path)
  print("DataFrame loaded.")
  return table

# csv read 예시
# csv_path = "/content/drive/My Drive/crawling.csv"
# dataframe = make_dataframe(csv_path)

# csv write 예시
# dataframe.to_csv(csv_path, index=False)

In [None]:
# 멜론 연간 리스트 크롤링
import requests
from bs4 import BeautifulSoup

# Request headers and query parameters sent to Melon's chart-search endpoint.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    }
params = {
    'chartType': "YE",
    'year': "2019",  # placeholder; replaced for every request in the loop below
    'classCd': "KPOP",
    'moved': 'Y'}
url = "https://www.melon.com/chart/search/list.htm"

# Results accumulated over all years. The four lists are index-aligned:
# entry i of each list describes the same chart row. Later cells read them.
titles = []
singers = []
albumids = []
songids = []

for year in range(2000, 2020):
  params['year'] = year
  # `timeout` keeps a stalled connection from hanging the notebook; the
  # `with` block closes the response even if reading the body raises.
  with requests.get(url, params=params, headers=headers, timeout=30) as res:
    html = res.text
  print(res.url)
  if not res.ok:
    print(f'request failed with status {res.status_code}; skipping year {year}')
    continue

  soup = BeautifulSoup(html, "html.parser")
  title_list = [tag.text.strip() for tag in soup.select('.ellipsis.rank01 strong')]
  singer_list = [tag.text.strip() for tag in soup.select('.checkEllipsis')]
  # The ids are the digits contained in each link's href.
  album_list = [''.join(filter(str.isdigit, link['href'])) for link in soup.select('.ellipsis.rank03 a')]
  songid_list = [''.join(filter(str.isdigit, link['href'])) for link in soup.select('.t_left .wrap a.btn.btn_icon_detail')]
  print(len(title_list), title_list)
  print(len(singer_list), singer_list)
  print(len(album_list), album_list)
  print(len(songid_list), songid_list)

  # Appending lists of different lengths would shift every later entry and
  # pair titles with the wrong singer / song id, so such a page is dropped.
  if not (len(title_list) == len(singer_list) == len(album_list) == len(songid_list)):
    print(f'parsed columns have different lengths; skipping year {year}')
    continue

  titles += title_list
  singers += singer_list
  albumids += album_list
  songids += songid_list

In [None]:
# 멜론 월간 리스트 크롤링
import requests
from bs4 import BeautifulSoup

# Endpoint, request headers and query parameters for Melon's chart search.
url = "https://www.melon.com/chart/search/list.htm"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    }
params = {
          'chartType': "MO",
          'classCd': "DM0000",
          'mon': "01",  # placeholder; replaced for every request in the loop below
          'year': "2020",
          'moved':'Y'
          }
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09']

# Results accumulated over all months. The four lists are index-aligned:
# entry i of each list describes the same chart row. Later cells read them.
titles = []
singers = []
albumids = []
songids = []

for mon in months:
  params['mon'] = mon
  # `timeout` keeps a stalled connection from hanging the notebook; the
  # `with` block closes the response even if reading the body raises.
  with requests.get(url, params=params, headers=headers, timeout=30) as res:
    html = res.text
  print(res.url)
  if not res.ok:
    print(f'request failed with status {res.status_code}; skipping month {mon}')
    continue

  soup = BeautifulSoup(html, "html.parser")
  title_list = [tag.text.strip() for tag in soup.select('.ellipsis.rank01 strong')]
  singer_list = [tag.text.strip() for tag in soup.select('.checkEllipsis')]
  # The ids are the digits contained in each link's href.
  album_list = [''.join(filter(str.isdigit, link['href'])) for link in soup.select('.ellipsis.rank03 a')]
  songid_list = [''.join(filter(str.isdigit, link['href'])) for link in soup.select('.t_left .wrap a.btn.btn_icon_detail')]
  print(len(title_list), title_list)
  print(len(singer_list), singer_list)
  print(len(album_list), album_list)
  print(len(songid_list), songid_list)

  # Appending lists of different lengths would shift every later entry and
  # pair titles with the wrong singer / song id, so such a page is dropped.
  if not (len(title_list) == len(singer_list) == len(album_list) == len(songid_list)):
    print(f'parsed columns have different lengths; skipping month {mon}')
    continue

  titles += title_list
  singers += singer_list
  albumids += album_list
  songids += songid_list

In [None]:
# Crawl per-song metadata (release, genre) from Melon song detail pages and
# store it in the CSV-backed dataframe.
url_base = 'https://www.melon.com/song/detail.htm?songId='

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ko-KR',
    'Pragma': 'no-cache',
    }
csv_path = "/content/drive/My Drive/crawling.csv"
dataframe = make_dataframe(csv_path)

# A table that has never been through this cell has no 'release'/'genre'
# columns yet. Object dtype lets an all-NaN column accept the scraped strings.
for column in ('release', 'genre'):
  if column not in dataframe.columns:
    dataframe[column] = None
  dataframe[column] = dataframe[column].astype(object)

# `titles`, `singers` and `songids` are the index-aligned lists filled by a
# chart crawl cell; zip stops at the shortest one instead of raising IndexError.
for title, singer, song_id in zip(titles, singers, songids):
  matches = dataframe.index[(dataframe['title'] == title) & (dataframe['singer'] == singer)]
  if matches.empty:
    # The song was crawled but never recorded in the dataframe.
    print(f'not in dataframe: {title} / {singer}')
    continue
  target_index = matches[0]

  # Already filled in by an earlier run.
  if pd.notna(dataframe.at[target_index, 'release']):
    print(f'pass {target_index}')
    continue

  # The `with` block closes the response even if parsing raises.
  with requests.get(url_base + song_id, headers=headers, timeout=30) as res:
    soup = BeautifulSoup(res.text, "html.parser")
  print(res.url, res.status_code, title)

  meta = soup.select('.meta .list dd')
  if len(meta) < 3:
    # Error page or changed layout: meta[1] / meta[2] do not exist.
    print(f'metadata not found for song id {song_id}')
    continue

  # .at writes into the dataframe itself; chained indexing
  # (df[col][row] = ...) may write to a temporary copy instead.
  dataframe.at[target_index, 'release'] = meta[1].text
  dataframe.at[target_index, 'genre'] = meta[2].text

  # Save after every song so an interrupted run keeps its progress.
  dataframe.to_csv(csv_path, index=False)

In [None]:
# Record the Melon crawl results (title / singer pairs) in the dataframe.
csv_path = "/content/drive/My Drive/crawling.csv"

dataframe = make_dataframe(csv_path)

# NOTE: check that `titles` and `singers` were filled by a crawl cell before
# running this. zip() pairs them up and stops at the shorter list.
new_rows = pd.DataFrame(list(zip(titles, singers)), columns=['title', 'singer'])

# One concat instead of DataFrame.append per row: append copies the whole
# table on every call and no longer exists in pandas >= 2.0.
if not new_rows.empty:
  dataframe = pd.concat([dataframe, new_rows], ignore_index=True)

# Drop duplicate (title, singer) pairs -- the first occurrence is kept, so rows
# already present in the CSV win over freshly crawled ones -- and save.
dataframe = dataframe.drop_duplicates(['title', 'singer'])
dataframe.to_csv(csv_path, index=False)

In [None]:
# Install Selenium and chromedriver (needed by the YouTube crawling cell)
!pip install selenium
!apt-get update
!apt install chromium-chromedriver

In [None]:
# 유튜브 크롤링 범위 정의 및 오디오 url 크롤링
import numpy as np
from urllib import parse
from bs4 import BeautifulSoup
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("disable-gpu")

url_base = 'https://www.youtube.com/results?search_query='

csv_path = "/content/drive/My Drive/crawling.csv"
dataframe = make_dataframe(csv_path)
# Kept so that other cells still using chained assignment stay quiet
# (silences SettingWithCopyWarning).
pd.set_option('mode.chained_assignment',  None)
# An 'id' column with no values yet is numeric (all NaN); object dtype lets it
# hold the video id strings written below.
dataframe['id'] = dataframe['id'].astype(object)

# Rows that still lack a YouTube id, determined once up front.
pending_rows = dataframe.index[dataframe['id'].isna()]

# One browser for the whole run; `finally` shuts it down even when a page
# load or parse step raises, so no Chrome process is left behind.
driver = webdriver.Chrome(options=chrome_options)
try:
  for i in pending_rows:
    query = parse.quote(f'{dataframe.at[i, "title"]} {dataframe.at[i, "singer"]}')
    search_url = url_base + query

    driver.get(search_url)
    html = driver.page_source

    soup = BeautifulSoup(html, "html.parser")
    results = soup.select('.yt-simple-endpoint.style-scope.ytd-video-renderer')
    print(search_url)
    if not results:
      # Nothing matched the selector (no hits, or the page layout changed).
      print(f'no search result for row {i}')
      continue

    href = results[0]['href']
    # Take only the `v` query parameter so trailing parameters ("&pp=...")
    # do not end up in the id. Links without `v` fall back to dropping the
    # first 9 characters (the length of "/watch?v=").
    video_id = parse.parse_qs(parse.urlparse(href).query).get('v', [href[9:]])[0]
    print(video_id)

    # .at writes into the dataframe itself, unlike chained df[col][row] = ...
    dataframe.at[i, 'id'] = video_id
    # Save after every row so an interrupted run keeps its progress.
    dataframe.to_csv(csv_path, index=False)
finally:
  driver.quit()

DataFrame loaded.


In [None]:
# Install the libraries for timbre extraction, YouTube download, and vocal separation
!pip install timbral_models pydub pafy youtube_dl spleeter

In [None]:
# 음원 다운로드 function
import pafy
import os

def get_youtube_audio(video_id:str, save_path:str):
  """Download one audio stream of a YouTube video through pafy.

  Args:
    video_id: Value handed to ``pafy.new``; also used as the base name of the
      downloaded file.
    save_path: Directory part of the output path.

  Returns:
    Path of the downloaded file, ``<save_path>/<video_id>.<extension>``,
    where the extension is the one reported by the chosen stream.
  """
  # First entry of pafy's audio stream list (the original author's note
  # calls this the worst-quality stream -- not verified here).
  stream = pafy.new(video_id).audiostreams[0]
  print('title:', stream.title, ', bitrate:' , stream.bitrate)

  target = f"{os.path.join(save_path, video_id)}.{stream.extension}"
  stream.download(target)
  return target

In [None]:
# 보컬 추출 (한 개)
from spleeter.separator import Separator
from spleeter.audio.adapter import AudioAdapter
import os
import librosa

# Module-level Separator built from the 'spleeter:2stems' configuration;
# vocal_extract() below calls it for every file.
separator = Separator('spleeter:2stems')

def vocal_extract(youtube_id, input_path, output_path, codec="mp3", bitrate='40k'):
  """Run the module-level ``separator`` on one audio file.

  Args:
    youtube_id: Name of the sub-folder of ``output_path`` used to build the
      returned path.
    input_path: Audio file to process.
    output_path: Destination directory passed to ``separate_to_file``.
    codec: Output codec; also the extension of the returned file name.
    bitrate: Output bitrate passed to ``separate_to_file``.

  Returns:
    ``<output_path>/<youtube_id>/vocals.<codec>``, or ``None`` (after printing
    a message) when ``input_path`` does not exist.
  """
  if not os.path.exists(input_path):
    print(f'File "{input_path}" is not exists.')
    return None

  print(f"Processing {input_path} ...")
  separator.separate_to_file(input_path, output_path, codec=codec, bitrate=bitrate)
  vocals_file = f'vocals.{codec}'
  return os.path.join(output_path, youtube_id, vocals_file)


# test
# print(vocal_extract('abuTb3qMOsY', '/content/abuTb3qMOsY.webm', './'))

In [None]:
# 유튜브 다운로드 & 음성추출 & 음색 추출 & 다운로드 제거
import timbral_models
from pydub import AudioSegment
import os
import shutil

audio_folder = './'
csv_path = "/content/drive/My Drive/crawling.csv"
dataframe = make_dataframe(csv_path)

# Row to resume from (hard-coded by the original run); earlier rows are skipped.
start_index = 926
new_vocal_format = 'wav'
new_vocal_sample_rate = 22050

for i in range(start_index, dataframe.shape[0]):
  # The 'hardness' column only exists once a first timbre result was stored.
  if 'hardness' in dataframe.columns and pd.notna(dataframe.at[i, 'hardness']):
    print(f'pass id: {i}')
    continue

  video_id = dataframe.at[i, 'id']
  if pd.isna(video_id):
    # No YouTube id crawled for this row; a NaN here would otherwise break
    # the path concatenation below with a TypeError.
    print(f'no youtube id for row {i}')
    continue

  audio_name = audio_folder + video_id
  new_vocal_path = audio_name + '_vocal.' + new_vocal_format

  try:
    audio_file_path = get_youtube_audio(video_id, audio_folder)  # download the audio
  except Exception as ex:
    # Best effort: report the failure and move on to the next song.
    print(f'get_youtube_audio failed : {video_id}', ex)
    continue

  try:
    try:
      vocal_path = vocal_extract(video_id, audio_file_path, './')  # separate the vocals
    except Exception as ex:
      print(video_id, ex)
      continue
    if vocal_path is None:
      # vocal_extract returns None when the downloaded file is missing.
      print(f'vocal_extract produced no output : {video_id}')
      continue

    # Resample and convert to wav before timbre extraction.
    y, sr = librosa.load(vocal_path, sr=new_vocal_sample_rate)
    # NOTE(review): librosa.output was removed in librosa 0.8; this call
    # needs an older librosa -- confirm the installed version.
    librosa.output.write_wav(new_vocal_path, y, sr)

    timbre = timbral_models.timbral_extractor(new_vocal_path)
  finally:
    # Remove the temporary files on every path (success, skip, or error):
    # the download, the separation output folder, and the converted wav.
    if os.path.exists(audio_file_path):
      os.remove(audio_file_path)
    shutil.rmtree(audio_folder + video_id, ignore_errors=True)
    if os.path.exists(new_vocal_path):
      os.remove(new_vocal_path)

  print(i, timbre)
  for attr, val in timbre.items():
    # .at writes into the dataframe itself (creating the column on first
    # use), unlike chained df[col][row] = ... indexing.
    dataframe.at[i, attr] = val

  # Save after every song so an interrupted run keeps its progress.
  dataframe.to_csv(csv_path, index=False)

In [None]:
#db normalization
import numpy as np

csv_path = "/content/drive/My Drive/crawling.csv"
meta_csv_path = "/content/drive/My Drive/crawling_meta.csv"
dataframe = make_dataframe(csv_path)
metadf = pd.read_csv(meta_csv_path)

attrs = ['hardness', 'depth', 'brightness', 'roughness', 'warmth', 'sharpness', 'boominess']

# Step 1: widen the stored per-attribute min/max (row 0 of the meta table)
# so that they cover every value currently in the dataframe.
for a in attrs:
  values = dataframe[a]
  # Series.min()/max() skip NaN rows; nanmin/nanmax additionally ignore a NaN
  # stored bound, so an uninitialised meta value is replaced by the data's.
  metadf.at[0, a + "_min"] = np.nanmin([metadf.at[0, a + "_min"], values.min()])
  metadf.at[0, a + "_max"] = np.nanmax([metadf.at[0, a + "_max"], values.max()])
metadf.to_csv(meta_csv_path, index=False)

# Step 2: min-max normalise each attribute into "<attr>_norm", one whole
# column at a time.
for a in attrs:
  min_value = metadf.at[0, a + "_min"]
  span = metadf.at[0, a + "_max"] - min_value
  if span == 0:
    # min == max means every value equals min_value: the offset is already 0
    # for all rows, and dividing by the zero span would turn it into NaN.
    dataframe[a + "_norm"] = dataframe[a] - min_value
  else:
    dataframe[a + "_norm"] = (dataframe[a] - min_value) / span
dataframe.to_csv(csv_path, index=False)

In [None]:
!pip install timbral_models pydub

In [None]:
import timbral_models
import os
import pandas as pd

# Timbre attribute names compared by timbre_similarity.
attrs = [
    'hardness',
    'depth',
    'brightness',
    'roughness',
    'warmth',
    'sharpness',
    'boominess',
]

def timbre_similarity(input_timbre, db_timbre):
  """Return the sum of squared differences between two timbre records.

  Args:
    input_timbre: Mapping indexed by the plain attribute names in ``attrs``.
    db_timbre: Mapping indexed by the same names with a ``_norm`` suffix.

  Returns:
    The accumulated squared difference over all attributes; ``0`` when every
    attribute matches.
  """
  total = 0
  for name in attrs:
    difference = input_timbre[name] - db_timbre[f"{name}_norm"]
    total += difference ** 2
  return total

# Return the songs in the DB ordered by timbre similarity to an input vocal.
def timbre_similarity_list(input_vocal_path:str, database_path:str, meta_db_path:str):
  """Rank the songs of the database by timbre distance to a vocal recording.

  Args:
    input_vocal_path: Audio file whose timbre is extracted.
    database_path: CSV with the song table, including the ``<attr>_norm``
      columns for every name in ``attrs``.
    meta_db_path: CSV whose first row holds ``<attr>_min`` / ``<attr>_max``,
      used to normalise the input timbre.

  Returns:
    A DataFrame with the columns ``id``, ``title``, ``singer``, ``genre``,
    ``release`` and ``timbre_similarity``, sorted ascending by
    ``timbre_similarity`` (smallest squared difference first). ``None`` is
    returned, after printing the reason, when any of the three paths does
    not exist.
  """
  if not os.path.exists(database_path):
    print(f'Database is not exists at {database_path}.')
    return None
  dataframe = pd.read_csv(database_path)

  if not os.path.exists(meta_db_path):
    print(f'Database is not exists at {meta_db_path}.')
    return None
  meta_df = pd.read_csv(meta_db_path)

  if not os.path.exists(input_vocal_path):
    print(f'Audio file is not exists at {input_vocal_path}.')
    return None

  timbre = timbral_models.timbral_extractor(input_vocal_path)
  for a in attrs:
    # Read the bounds from the meta table loaded above (not from a global
    # left over by another cell), without shadowing the min/max builtins.
    low = meta_df[a + "_min"].iloc[0]
    span = meta_df[a + "_max"].iloc[0] - low
    # A zero span would divide by zero; map the value to 0.0 in that case.
    timbre[a] = (timbre[a] - low) / span if span != 0 else 0.0
  print(timbre)

  norm_attrs = [attr + "_norm" for attr in attrs]
  # timbre_similarity only indexes its second argument by "<attr>_norm" and
  # does arithmetic on the result, so passing the columns themselves computes
  # the value for every row at once.
  dataframe['timbre_similarity'] = timbre_similarity(timbre, dataframe[norm_attrs])

  new_df = dataframe.sort_values(by=['timbre_similarity'])
  return new_df[['id', 'title', 'singer','genre', 'release', 'timbre_similarity']]

In [None]:
# module test
import timbral_models
# Smoke test: rank the DB against one vocal recording and save the result.
vocal_path = "/content/drive/My Drive/vocal.wav"
db_path = "/content/drive/My Drive/crawling.csv"
meta_db_path = "/content/drive/My Drive/crawling_meta.csv"

sim_list = timbre_similarity_list(vocal_path, db_path, meta_db_path)
# timbre_similarity_list returns None (after printing the reason) when one of
# the input files is missing; calling .to_csv on it would raise AttributeError.
if sim_list is not None:
  sim_list.to_csv("/content/drive/My Drive/sim_list.csv")