# Yearly - Title, Artist, Lyrics, Album Name, Release Date, Genre, Likes

In [None]:
import time
import re
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Initialize Selenium WebDriver
def init_driver():
    """Return a new Chrome WebDriver backed by a webdriver_manager-installed chromedriver."""
    chromedriver_path = ChromeDriverManager().install()
    chrome_service = Service(chromedriver_path)
    return webdriver.Chrome(service=chrome_service)

def melon_collector(url, year):
    """Scrape a Melon chart page and save per-song details to ``melon_{year}.csv``.

    Loads ``url``, reads at most the first 100 chart rows, visits each song's
    detail page and records title, artist, lyrics, album name, release date,
    genre and like count.  Rows without a usable song ID and songs whose
    detail page cannot be parsed are reported on stdout and skipped.

    Args:
        url: Chart page to scrape.
        year: Label interpolated into the output file name.

    Returns:
        None.  The result is written to ``melon_{year}.csv`` in the current
        working directory (an empty CSV if no song could be collected).
    """
    driver = init_driver()
    songs = []
    try:
        # Fixed pause before the first navigation.
        time.sleep(5)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Chart rows: rank01 cells hold the title link, rank02 cells the artist.
        song_info = soup.find_all('div', attrs={'class': 'ellipsis rank01'})
        singer_info = soup.find_all('div', attrs={'class': 'ellipsis rank02'})

        # Walk rows by position so title and artist always come from the same
        # row, even when two rows yield the same (or no) song ID.
        for rank, entry in enumerate(song_info[:100]):
            try:
                href = entry.find("a")["href"]
            except (AttributeError, TypeError, KeyError):
                # No anchor, or an anchor without an href attribute.
                href = ''

            # Drop the first 43 characters of the href, then keep digits only.
            song_id = re.sub(r'[^0-9]', '', href[43:])
            if not song_id:
                print(f"Skipping chart row {rank + 1}: no song ID found")
                continue

            try:
                driver.get("https://www.melon.com/song/detail.htm?songId=" + song_id)
                time.sleep(2)
                title = entry.text.strip()
                singer = singer_info[rank].text.strip()
                # Keep only the first half of the artist text
                # (presumably the cell repeats the name twice -- confirm against the page markup).
                singer = singer[:len(singer) // 2]
                lyric = driver.find_element(By.CLASS_NAME, "lyric").text.strip()
                meta_info = driver.find_element(By.CSS_SELECTOR, ".list").text.split('\n')
                like_count = driver.find_element(By.ID, "d_like_count").text

                # The values are read from fixed line positions (1, 3, 5) of
                # the ".list" element's text.
                songs.append({
                    "Title": title,
                    "Artist": singer,
                    "Lyrics": lyric,
                    "Album Name": meta_info[1],
                    "Release Date": meta_info[3],
                    "Genre": meta_info[5],
                    "Likes": like_count
                })
            except Exception as e:
                # Best effort: one broken detail page must not stop the run.
                print(f"Error fetching data for song ID {song_id}: {e}")
                continue
    finally:
        # Always close the browser, even if scraping fails part-way.
        driver.quit()

    df = pd.DataFrame(songs)
    output_csv_path = f'melon_{year}.csv'
    df.to_csv(output_csv_path, index=False)

# Generate URLs and collect data for each year
def collect_year_end_charts():
    """Run ``melon_collector`` once per year from 2019 through the current year.

    Past years are scraped from the year-end chart URL for that year; the
    current year is scraped from the main chart page instead.  A completion
    message is printed after each year.
    """
    year_end_prefix = 'https://www.melon.com/chart/age/index.htm?chartType=YE&chartGenre=KPOP&chartDate='
    main_chart_url = 'https://www.melon.com/chart/index.htm'
    first_year = 2019
    this_year = datetime.now().year

    for year in range(first_year, this_year + 1):
        # Only the final iteration (the current year) uses the main chart page.
        is_past_year = year < this_year
        target_url = f'{year_end_prefix}{year}' if is_past_year else main_chart_url

        melon_collector(target_url, year)
        print(f'Year {year} data collection complete.')

# Run the yearly collection: melon_collector writes one melon_<year>.csv
# per year from 2019 through the current year.
collect_year_end_charts()

# Monthly - Title, Artist, Lyrics, Genre

In [None]:
import time
import re
import pandas as pd
from itertools import repeat
from bs4 import BeautifulSoup
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def init_driver():
    """Return a Chrome WebDriver started with the headless/no-sandbox/dev-shm flags."""
    chrome_options = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    # The chromedriver binary is resolved by webdriver_manager.
    chrome_service = ChromeService(ChromeDriverManager().install())
    return wd.Chrome(service=chrome_service, options=chrome_options)

def get_melon_chart_data(month=3):
    """Collect title, artist, lyrics and genre for songs found via Melon's chart finder.

    For entries 1 and 3 of the chart finder's period list, a fresh browser
    from ``init_driver()`` opens the chart page, chooses the monthly chart,
    the period, the second entry of the year list, the ``month``-th entry of
    the month list and the first genre entry, runs the search, and then
    visits the detail page of every song listed in the result.

    Rows without a usable song ID and songs whose detail page cannot be
    parsed are reported on stdout and skipped.  Any other error stops the
    collection; whatever was gathered up to that point is still returned.

    Args:
        month: 1-based position of the entry to click in the chart finder's
            month list.  Defaults to 3, the value that was previously
            hard-coded.

    Returns:
        pandas.DataFrame with one row per collected song and the columns
        ``Title``, ``Artist``, ``Lyrics`` and ``Genre``; empty if nothing
        was collected.
    """
    url = 'https://www.melon.com/chart/index.htm'
    songs = []

    for period in (1, 3):
        # Reset per iteration so ``finally`` never touches an unbound name
        # (or a driver that an earlier iteration already closed).
        driver = None
        try:
            driver = init_driver()
            driver.maximize_window()

            driver.get(url)
            time.sleep(2)

            # Chart-finder interaction, clicked in order with a pause after each.
            finder_clicks = [
                # Open the chart finder
                '//*[@id="gnb_menu"]/ul[1]/li[1]/div/div/button/span',
                # Monthly chart option
                '//*[@id="d_chart_search"]/div/h4[2]/a',
                # Period
                f'//*[@id="d_chart_search"]/div/div/div[1]/div[1]/ul/li[{period}]/span/label',
                # Year
                '//*[@id="d_chart_search"]/div/div/div[2]/div[1]/ul/li[2]/span/label',
                # Month
                f'//*[@id="d_chart_search"]/div/div/div[3]/div[1]/ul/li[{month}]/span/label',
                # Genre
                '//*[@id="d_chart_search"]/div/div/div[5]/div[1]/ul/li[1]/span/label',
                # Search button
                '//*[@id="d_srch_form"]/div[2]/button/span/span',
            ]
            for xpath in finder_clicks:
                driver.find_element(By.XPATH, xpath).click()
                time.sleep(2)

            # Snapshot the result list before navigating away from it.
            soup = BeautifulSoup(driver.page_source, 'lxml')
            song_info = soup.find_all('div', attrs={'class': 'ellipsis rank01'})
            singer_info = soup.find_all('div', attrs={'class': 'ellipsis rank02'})

            # Walk rows by position so title and artist always come from the
            # same row, even when two rows yield the same song ID.
            for rank, entry in enumerate(song_info):
                try:
                    href = entry.find("a")["href"]
                except (AttributeError, TypeError, KeyError):
                    # No anchor, or an anchor without an href attribute.
                    href = ''

                # Drop the first 43 characters of the href, then keep digits only.
                song_id = re.sub(r'[^0-9]', '', href[43:])
                if not song_id:
                    print(f"Skipping chart row {rank + 1}: no song ID found")
                    continue

                try:
                    driver.get(f"https://www.melon.com/song/detail.htm?songId={song_id}")
                    WebDriverWait(driver, 20).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'lyric'))
                    )
                    title = entry.text.strip()
                    singer = singer_info[rank].text.strip()
                    # Keep only the first half of the artist text
                    # (presumably the cell repeats the name twice -- confirm against the page markup).
                    singer = singer[:len(singer) // 2]
                    lyric = driver.find_element(By.CLASS_NAME, "lyric").text.strip()
                    meta_info = driver.find_element(By.CSS_SELECTOR, ".list").text.split('\n')

                    # Genre is read from line position 5 of the ".list" text.
                    songs.append({
                        "Title": title,
                        "Artist": singer,
                        "Lyrics": lyric,
                        "Genre": meta_info[5]
                    })
                except Exception as e:
                    # Best effort: one broken detail page must not stop the run.
                    print(f"Error fetching data for song ID {song_id}: {e}")
                    continue

        except Exception as e:
            print(f"Error at period {period}: {e}")
            break

        finally:
            if driver is not None:
                driver.quit()

    # Build the frame once from everything gathered across periods.
    return pd.DataFrame(songs)

if __name__ == "__main__":
    # Scrape the chart, then persist the rows as a UTF-8 CSV without the index column.
    result_df = get_melon_chart_data()
    result_df.to_csv('melon_chart_Mar_2024.csv', encoding='utf-8', index=False)

    print("Data collection complete. CSV file saved.")

In [None]:
# Combine the monthly CSV files into one file and remove duplicate rows


import pandas as pd

# Monthly chart exports to merge, in the order they are concatenated
csv_files = [
    'melon_chart_Jan_2024.csv',
    'melon_chart_Feb_2024.csv',
    'melon_chart_Mar_2024.csv',
    'melon_chart_april_2024.csv'
]

# Load every file and stack the rows into one DataFrame
combined_df = pd.concat(list(map(pd.read_csv, csv_files)))

# Discard the 'Likes' column when present (a missing column is not an error)
combined_df = combined_df.drop(columns=['Likes'], errors='ignore')

# Keep only the first occurrence of each fully identical row
combined_df = combined_df.drop_duplicates()

# Write the merged result as UTF-8 CSV without the index column
combined_df.to_csv('melon_chart_combined_2024.csv', encoding='utf-8', index=False)

print("Data combined and saved to 'melon_chart_combined_2024.csv'.")