In [1]:
import requests
import pandas as pd
import re

def extract_album_id(album_link):
    match = re.search(r'/album/.*?/([\w-]+)\.html', album_link or "")
    return match.group(1) if match else "N/A"

def determine_album_type(track_count):
    if track_count == 1:
        return "Single"
    elif 2 <= track_count <= 6:
        return "EP"
    else:
        return "Regular"

def fetch_artist_songs(artist_name):
    url = f"http://localhost:5000/api/artistsongs?name={artist_name}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        songs = data.get("songs", [])
        
        records = []
        album_tracks = {}
        
        for song in songs:
            album_id = extract_album_id(song.get("albumLink", "") or "")
            album_name = song.get("album", song.get("title", "N/A"))
            tracklist = song.get("tracklist", [])
            
            if not tracklist:
                tracklist = [{"title": song.get("title", "N/A"), "link": song.get("link", "N/A")}]  # N·∫øu kh√¥ng c√≥ tracklist, t·∫°o d·ªØ li·ªáu t·ª´ b√†i h√°t ch√≠nh
            
            if album_id not in album_tracks:
                album_tracks[album_id] = len(tracklist)
            
            for track in tracklist:
                records.append({
                    "album_id": album_id,
                    "album_name": album_name,
                    "tracklist": track.get("title", "N/A"),
                    "release_date": song.get("releaseDate", "Unknown"),
                    "provided_by": song.get("providedBy", "Unknown"),
                    "featured_artists": song.get("featuredArtists", "Unknown"),
                    "album_artist": song.get("albumOwner", "Unknown"),
                    "ZingMP3": track.get("link", "N/A")
                })
        
        df = pd.DataFrame(records)
        df = df.drop_duplicates(subset=["album_id", "album_name", "tracklist"])
        df = df.sort_values(by=["album_name", "tracklist"], ascending=[True, True])
        
        # Th√™m c·ªôt ph√¢n lo·∫°i album
        df["album_type"] = df["album_id"].map(lambda x: determine_album_type(album_tracks.get(x, 1)))
        
        return df
    else:
        print("Error fetching data:", response.status_code)
        return None

artist_name = 'Rapper-Ngan'
df = fetch_artist_songs(artist_name)



In [2]:
df.to_excel(f'{artist_name}_songZingMP3.xlsx')
df.head()

Unnamed: 0,album_id,album_name,tracklist,release_date,provided_by,featured_artists,album_artist,ZingMP3,album_type
32,60BODZB8,9X (Single),9X,04/05/2020,MIXUS,"DLblack, Ng·∫Øn, M√™k Team","DLblack, Ng·∫Øn, M√™k Team",https://zingmp3.vn/bai-hat/9X-DLblack-Ngan-Mek...,Single
20,Z6DU0B8O,9X (Single),Tr·ªü V·ªÅ,2019,MIXUS,"Ng·∫Øn, H·∫£i Ph√≤ng Sound","Ng·∫Øn, H·∫£i Ph√≤ng Sound",https://zingmp3.vn/bai-hat/Tro-Ve-Ngan-Hai-Pho...,Single
53,6909U6OC,B√¨nh D√¢n (Remix Version) (Single),B√¨nh D√¢n (Remix Version),10/09/2021,MIXUS,"Ng·∫Øn, Monkieq","Ng·∫Øn, Monkieq",https://zingmp3.vn/bai-hat/Binh-Dan-Remix-Vers...,Single
62,6BUBDBAI,B·ª•i Thi√™n Th·∫ßn (EP),Cu·ªôc ƒê·ªùi N√†y Ng·∫Øn,22/07/2022,LOOPS Music,Ng·∫Øn,Ng·∫Øn,https://zingmp3.vn/bai-hat/Cuoc-Doi-Nay-Ngan-N...,EP
66,6BUBDBAI,B·ª•i Thi√™n Th·∫ßn (EP),Ng∆∞·ªùi Ngh·ªá Sƒ© C√¥ ƒê∆°n,22/07/2022,LOOPS Music,Ng·∫Øn,Ng·∫Øn,https://zingmp3.vn/bai-hat/Nguoi-Nghe-Si-Co-Do...,EP


In [3]:
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials

# H√†m x√°c ƒë·ªãnh lo·∫°i album
def determine_album_type(track_count):
    if track_count <= 3:
        return "Single"
    elif 4 <= track_count <= 6:
        return "EP"
    else:
        return "Regular"

# H√†m l·∫•y t·∫•t c·∫£ b√†i h√°t c·ªßa ngh·ªá sƒ©
def get_artist_tracks_all(artist_name):
    client_id = "c9ef6e4338814489867afc9956093213"
    client_secret = "a449438b21d44525a68a575cf67c8cfd"
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

    # üîé T√¨m ngh·ªá sƒ© theo t√™n
    results = sp.search(q=artist_name, type='artist', limit=1)
    if not results['artists']['items']:
        print("Kh√¥ng t√¨m th·∫•y ngh·ªá sƒ©.")
        return pd.DataFrame()

    artist_id = results['artists']['items'][0]['id']

    track_data = []
    seen_tracks = set()
    album_tracks_count = {}


    offset = 0
    while True:
        albums = sp.artist_albums(artist_id, album_type='album,single', limit=50, offset=offset)
        if not albums['items']:
            break
        for album in albums['items']:
            album_id = album['id']
            album_name = album['name']
            album_release_date = album.get('release_date', 'Unknown')
            album_owner = album['artists'][0]['name']

            album_info = sp.album(album_id)
            label = album_info.get('label', 'Unknown')

           
            tracks = sp.album_tracks(album_id)['items']
            album_tracks_count[album_name] = len(tracks)  # Ghi l·∫°i s·ªë l∆∞·ª£ng b√†i h√°t

            for track in tracks:
                track_id = track['id']
                if track_id in seen_tracks:
                    continue
                seen_tracks.add(track_id)

                track_title = track['name']
                link_spotify = track['external_urls']['spotify']
                featured_artists = [artist['name'] for artist in track['artists'] if artist['id'] != artist_id]
                featured_artists = ", ".join(featured_artists) if featured_artists else "None"

                # X·ª≠ l√Ω ng√†y ph√°t h√†nh
                try:
                    album_release_date = pd.to_datetime(album_release_date, errors='coerce').strftime("%d/%m/%Y")
                except:
                    album_release_date = "Unknown"

                track_data.append([album_name, track_title, album_release_date, featured_artists, album_owner, label, link_spotify])
        offset += 50  

   
    top_tracks = sp.artist_top_tracks(artist_id, country="US")['tracks']
    for track in top_tracks:
        track_id = track['id']
        if track_id in seen_tracks:
            continue
        seen_tracks.add(track_id)

        track_title = track['name']
        link_spotify = track['external_urls']['spotify']
        album = track['album']
        album_name = album['name']
        album_release_date = album.get('release_date', 'Unknown')
        album_owner = album['artists'][0]['name']
        featured_artists = [artist['name'] for artist in track['artists'] if artist['id'] != artist_id]
        featured_artists = ", ".join(featured_artists) if featured_artists else "None"

        # üîπ L·∫•y th√¥ng tin label c·ªßa album
        album_info = sp.album(album['id'])
        label = album_info.get('label', 'Unknown')

        try:
            album_release_date = pd.to_datetime(album_release_date, errors='coerce').strftime("%d/%m/%Y")
        except:
            album_release_date = "Unknown"

        track_data.append([album_name, track_title, album_release_date, featured_artists, album_owner, label, link_spotify])
        album_tracks_count[album_name] = album_tracks_count.get(album_name, 0) + 1  # C·∫≠p nh·∫≠t s·ªë b√†i h√°t


    columns = ["album_name", "tracklist", "release_date", "featured_artists", "album_owner_", "provided_by", "Link_Spotify"]
    df = pd.DataFrame(track_data, columns=columns)

    df['album_type'] = df['album_name'].map(lambda x: determine_album_type(album_tracks_count.get(x, 0)))
    df = df.sort_values(by=["album_name", "tracklist"], ascending=[True, True])
 

    return df

# Ch·∫°y ch∆∞∆°ng tr√¨nh
if __name__ == "__main__":
    artist_name_Spotify = 'Ng·∫Øn'
    df_tracks = get_artist_tracks_all(artist_name_Spotify)



    

  album_release_date = pd.to_datetime(album_release_date, errors='coerce').strftime("%d/%m/%Y")


In [4]:

df_tracks.to_excel(f"{artist_name_Spotify}_songSpotify.xlsx", index=False)
df_tracks.head()

Unnamed: 0,album_name,tracklist,release_date,featured_artists,album_owner_,provided_by,Link_Spotify,album_type
24,120Bpm (ft. Long M·ªông G√†),120Bpm (ft. Long M·ªông G√†),10/03/2023,Antoneus Maximus,Ng·∫Øn,MaiDao Music,https://open.spotify.com/track/3QSw9fOs1Qe14tg...,Single
62,9X,9X,04/05/2020,"DLBlack, M√™k Team",DLBlack,Zing MP3,https://open.spotify.com/track/2Gj8BVWfiqHTykY...,Single
50,B√¨nh D√¢n,B√¨nh D√¢n,03/09/2021,,Ng·∫Øn,Yin Yang Media,https://open.spotify.com/track/3dhDxLqZScUdYDx...,Single
51,B√¨nh D√¢n,B√¨nh D√¢n - Beat,09/03/2021,,Ng·∫Øn,Yin Yang Media,https://open.spotify.com/track/0eof9V6sNRlPtiQ...,Single
30,B·ª•i Thi√™n Th·∫ßn,Cu·ªôc ƒê·ªùi N√†y Ng·∫Øn,27/07/2022,,Ng·∫Øn,LOOPS Music,https://open.spotify.com/track/3UmZvOmW9QM7W1F...,EP


In [5]:
import pandas as pd
import re
from fuzzywuzzy import process

def is_remix(title):
    """Ki·ªÉm tra xem ti√™u ƒë·ªÅ c√≥ ph·∫£i b·∫£n remix hay kh√¥ng"""
    return bool(re.search(r'\b(remix|version|edit)\b', str(title), re.IGNORECASE))

def find_best_match(title, choices, threshold=90):
    """T√¨m album ph√π h·ª£p, ∆∞u ti√™n ƒë√∫ng lo·∫°i Remix/G·ªëc"""
    is_title_remix = is_remix(title)
    
    # L·ªçc danh s√°ch theo b·∫£n remix/g·ªëc t∆∞∆°ng ·ª©ng
    filtered_choices = [c for c in choices if is_remix(c) == is_title_remix]
    match_list = filtered_choices if filtered_choices else choices  # N·∫øu kh√¥ng c√≥, th·ª≠ to√†n b·ªô
    
    match, score = process.extractOne(title, match_list)
    return match if score >= threshold else None

# ƒê·ªçc d·ªØ li·ªáu t·ª´ file (gi·∫£ s·ª≠ df_tracks l√† danh s√°ch b√†i h√°t Spotify, df l√† d·ªØ li·ªáu t·ª´ ZingMP3)


# T·∫°o t·ª´ ƒëi·ªÉn album t·ª´ ZingMP3
album_titles_dict = {title: title for title in df["album_name"]}

# T√¨m album ph√π h·ª£p nh·∫•t gi·ªØa Spotify v√† ZingMP3
df_tracks["best_match_album"] = df_tracks["album_name"].apply(lambda x: find_best_match(x, album_titles_dict.keys()))

# G·ªôp d·ªØ li·ªáu d·ª±a tr√™n album_name kh·ªõp ƒë∆∞·ª£c
df_merged = pd.merge(df_tracks, df, left_on="best_match_album", right_on="album_name", how="outer", suffixes=("_Spotify", "_ZingMP3"))

# G√°n gi√° tr·ªã t·ª´ hai ngu·ªìn d·ªØ li·ªáu
df_merged = df_merged.assign(
    album_name=df_merged["album_name_Spotify"].combine_first(df_merged["album_name_ZingMP3"]),
    track_title=df_merged["tracklist_Spotify"].combine_first(df_merged["tracklist_ZingMP3"]),
    album_type=df_merged["album_type_Spotify"].fillna(df_merged["album_type_ZingMP3"]),
    album_owner_spotify=df_merged["album_owner_Spotify"].combine_first(df_merged["album_artist"]),
    release_date_spotify=df_merged["release_date_Spotify"],
    release_date_zingmp3=df_merged["release_date_ZingMP3"],
    featured_artists_spotify=df_merged["featured_artists_Spotify"],
    featured_artists_zingmp3=df_merged["featured_artists_ZingMP3"],
    label_spotify=df_merged["provided_by_Spotify"],
    label_zingmp3=df_merged["provided_by_ZingMP3"],
    album_id_zingmp3=df_merged["album_id"],
    link_spotify=df_merged["Spotify"],
    link_zingmp3=df_merged["ZingMP3"]
)

# X√≥a c·ªôt kh√¥ng c·∫ßn thi·∫øt
df_final = df_merged.drop(columns=["best_match_album", "album_name_Spotify", "album_name_ZingMP3"])

# ƒê·ªïi t√™n c·ªôt
df_final = df_final.rename(columns={
    "album_name": "T√™n album",
    "album_owner_spotify": "Ngh·ªá sƒ© s·ªü h·ªØu album (Spotify)",
    "album_owner_zingmp3": "Ngh·ªá sƒ© s·ªü h·ªØu album (ZingMP3)",
    "album_type": "Lo·∫°i album",
    "track_title": "Danh s√°ch b√†i h√°t",
    "release_date_spotify": "Ng√†y ph√°t h√†nh tr√™n Spotify",
    "release_date_zingmp3": "Ng√†y ph√°t h√†nh tr√™n ZingMP3",
    "featured_artists_spotify": "Ngh·ªá sƒ© tham gia (Spotify)",
    "featured_artists_zingmp3": "Ngh·ªá sƒ© tham gia (ZingMP3)",
    "label_spotify": "Cung c·∫•p b·ªüi (Spotify)",
    "label_zingmp3": "Cung c·∫•p b·ªüi (ZingMP3)",
    "album_id_zingmp3": "M√£ album ZingMP3",
    "link_spotify": "Link Spotify",
    "link_zingmp3": "Link ZingMP3"
})

# S·∫Øp x·∫øp d·ªØ li·ªáu
df_final = df_final.sort_values(by=["T√™n album", "M√£ album ZingMP3"], ascending=[True, True])

# Lo·∫°i b·ªè tr√πng l·∫∑p d·ª±a tr√™n album_name v√† tracklist
df_final = df_final.drop_duplicates(subset=["T√™n album", "Danh s√°ch b√†i h√°t"], keep="first")

# Xu·∫•t file Excel sau khi l·ªçc tr√πng
df_final.to_excel("Merged_AlbumsZingMp3_Spot.xlsx", index=False)

print("Xu·∫•t file ho√†n t·∫•t!")
df_final.head()




NameError: name 'df_tracks' is not defined

In [110]:
print(df_tracks.columns)

RangeIndex(start=0, stop=0, step=1)


In [5]:
import pandas as pd
import re

# H√†m chu·∫©n h√≥a t√™n album
def normalize_album_name(album_name):
    return re.sub(r"\s*\(.*?\)", "", album_name).strip()

# Ch·ªçn c√°c c·ªôt c·∫ßn thi·∫øt t·ª´ c·∫£ hai ngu·ªìn
df_tracks = df_tracks[
    [
        'album_name', 'tracklist', 'release_date', 'featured_artists',
       'album_owner_', 'provided_by', 'Link_Spotify', 'album_type',
      
    ]
]

df = df[
    [
      'album_id', 'album_name', 'tracklist', 'release_date', 'provided_by',
       'featured_artists', 'album_artist', 'ZingMP3', 'album_type'
    ]
]

# Chu·∫©n h√≥a t√™n album ƒë·ªÉ ƒë·ªìng b·ªô d·ªØ li·ªáu
df_tracks["album_name"] = df_tracks["album_name"].apply(normalize_album_name)
df["album_name"] = df["album_name"].apply(normalize_album_name)

# G·ªôp d·ªØ li·ªáu d·ª±a tr√™n album_name, tracklist, album_type
df_merged = pd.merge(df_tracks, df, on=["album_name", "tracklist", "album_type"], how="outer", suffixes=("_Spotify", "_ZingMP3"))


# ƒê·ªïi t√™n c·ªôt
df_ = df_merged.rename(columns={
    "album_name_Spotify": "album_name",
    "tracklist": "tracklist(danh s√°ch b√†i h√°t)",
    "featured_artists_Spotify": "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify)",
    "album_owner_Spotify": "Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify)",
    "release_date_Spotify": "Ng√†y ph√°t h√†nh tr√™n Spotify",
    "provided_by": "Cung c·∫•p b·ªüi(ZingMP3)",
    "label": "Cung c·∫•p b·ªüi(Spotify)",
    "featured_artists_ZingMP3": "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3)",
    "album_owner_ZingMP3": "Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3)",
    "release_date_ZingMP3": "Ng√†y ph√°t h√†nh tr√™n ZingMP3",
    "linkzingmp3": "ZingMP3",
    "Link_Spotify": "Spotify",
    "album_id": "M√£ ƒë·ªãnh danh album zingmp3"
})


desired_columns = [
    "album_name",
    "album_owner_",  # X√°c nh·∫≠n l·∫°i n·∫øu ƒë√¢y l√† 'Album artist'
   
    "album_type",
    "tracklist(danh s√°ch b√†i h√°t)",
    "Ng√†y ph√°t h√†nh tr√™n Spotify",
    "Ng√†y ph√°t h√†nh tr√™n ZingMP3",
    "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify)",
    "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3)",
    "provided_by_ZingMP3", 
    "provided_by_Spotify", 
    "M√£ ƒë·ªãnh danh album zingmp3",
    "ZingMP3",
    "Spotify"
]

df_ = df_[desired_columns]




df_.to_excel("song_ZingMP3+Spotify.xlsx", index=False)

# Hi·ªÉn th·ªã k·∫øt qu·∫£

In [6]:
df_

Unnamed: 0,album_name,album_owner_,album_type,tracklist(danh s√°ch b√†i h√°t),Ng√†y ph√°t h√†nh tr√™n Spotify,Ng√†y ph√°t h√†nh tr√™n ZingMP3,Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify),Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3),provided_by_ZingMP3,provided_by_Spotify,M√£ ƒë·ªãnh danh album zingmp3,ZingMP3,Spotify
0,120Bpm,Ng·∫Øn,Single,120Bpm (ft. Long M·ªông G√†),10/03/2023,,Antoneus Maximus,,,MaiDao Music,,,https://open.spotify.com/track/3QSw9fOs1Qe14tg...
1,9X,DLBlack,Single,9X,04/05/2020,04/05/2020,"DLBlack, M√™k Team","DLblack, Ng·∫Øn, M√™k Team",MIXUS,Zing MP3,60BODZB8,https://zingmp3.vn/bai-hat/9X-DLblack-Ngan-Mek...,https://open.spotify.com/track/2Gj8BVWfiqHTykY...
2,9X,,Single,Tr·ªü V·ªÅ,,2019,,"Ng·∫Øn, H·∫£i Ph√≤ng Sound",MIXUS,,Z6DU0B8O,https://zingmp3.vn/bai-hat/Tro-Ve-Ngan-Hai-Pho...,
3,B√¨nh D√¢n,Ng·∫Øn,Single,B√¨nh D√¢n,03/09/2021,,,,,Yin Yang Media,,,https://open.spotify.com/track/3dhDxLqZScUdYDx...
4,B√¨nh D√¢n,,Single,B√¨nh D√¢n (Remix Version),,10/09/2021,,"Ng·∫Øn, Monkieq",MIXUS,,6909U6OC,https://zingmp3.vn/bai-hat/Binh-Dan-Remix-Vers...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,ƒê∆°n Ph∆∞∆°ng Y√™u M·ªôt Ng∆∞·ªùi,,Single,ƒê∆°n Ph∆∞∆°ng Y√™u M·ªôt Ng∆∞·ªùi (Acoustic),,06/12/2023,,"Longg, Ng·∫Øn",LOOPS Music,,6BEIUO69,https://zingmp3.vn/bai-hat/Don-Phuong-Yeu-Mot-...,
101,ƒê∆°n Ph∆∞∆°ng Y√™u M·ªôt Ng∆∞·ªùi,Longg,Single,ƒê∆°n Ph∆∞∆°ng Y√™u M·ªôt Ng∆∞·ªùi - Acoustic,06/12/2023,,Longg,,,LOOPS Music,,,https://open.spotify.com/track/06qRaDxPsCUnZzM...
102,ƒê∆∞·ªùng V·ªÅ,DLBlack,Single,ƒê∆∞·ªùng V·ªÅ,01/01/2020,2019,"DLBlack, JoyBlue","DLblack, Ng·∫Øn, JoyBlue",MIXUS,Zing MP3,ZUBZ707F,https://zingmp3.vn/bai-hat/Duong-Ve-DLblack-Ng...,https://open.spotify.com/track/3Ua7AqRWM54CTyq...
103,ƒê·∫¨M S√ÇU,Lor,Single,ƒê·∫¨M S√ÇU,07/10/2022,07/10/2022,Lor,"Lor, Ng·∫Øn",Ingrooves Music Group,M Music Records,6B6OECUF,https://zingmp3.vn/bai-hat/DAM-SAU-Lor-Ngan/Z6...,https://open.spotify.com/track/0frxC5D0uIjToqh...


In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

def search_artist_nhaccuatui(driver, artist_name):
    """T√¨m ngh·ªá sƒ© tr√™n NhacCuaTui v√† l·∫•y link trang ngh·ªá sƒ©."""
    driver.get("https://www.nhaccuatui.com/")
    time.sleep(3)

    try:
        search_box = driver.find_element(By.CSS_SELECTOR, "input#txtSearch")
        search_box.click()
        time.sleep(1)

        search_box.send_keys(artist_name)
        time.sleep(3)

        suggestions = driver.find_elements(By.CSS_SELECTOR, ".info-search .qsItem a")
        for suggestion in suggestions:
            if "nghe-si" in suggestion.get_attribute("href"):
                return suggestion.get_attribute("href")
    except Exception as e:
        print(f"L·ªói khi t√¨m ngh·ªá sƒ© {artist_name}: {e}")
    return None

def get_artist_songs(driver, artist_song_url):
    """L·∫•y danh s√°ch b√†i h√°t c·ªßa ngh·ªá sƒ© t·ª´ t·∫•t c·∫£ c√°c trang."""
    driver.get(artist_song_url)
    time.sleep(3)
    song_list = []
    while True:
        song_blocks = driver.find_elements(By.CSS_SELECTOR, ".box-content-music-list .info_song")
        for song_block in song_blocks:
            try:
                song_name = song_block.find_element(By.CSS_SELECTOR, "h3 a").text
                song_link = song_block.find_element(By.CSS_SELECTOR, "h3 a").get_attribute("href")
                song_list.append(("N/A", song_name, song_link))
            except Exception as e:
                print(f"L·ªói khi l·∫•y b√†i h√°t: {e}")
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.number[rel='next']")
            next_link = next_button.get_attribute("href")
            if next_link:
                driver.get(next_link)
                time.sleep(3)
            else:
                break
        except:
            break
    return song_list

def get_artist_albums(driver, artist_album_url):
    """L·∫•y danh s√°ch Album c·ªßa ngh·ªá sƒ©."""
    driver.get(artist_album_url)
    time.sleep(3)

    album_list = []
    while True:
        album_blocks = driver.find_elements(By.CSS_SELECTOR, ".box-left-album a.box_absolute")
        for album in album_blocks:
            try:
                album_link = album.get_attribute("href")
                album_list.append(album_link)
            except Exception as e:
                print(f"L·ªói khi l·∫•y album: {e}")
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.number[rel='next']")
            next_link = next_button.get_attribute("href")
            if next_link:
                driver.get(next_link)
                time.sleep(3)
            else:
                break
        except:
            break
    return album_list

def get_album_songs(driver, album_url):
    """L·∫•y danh s√°ch b√†i h√°t trong Album."""
    driver.get(album_url)
    time.sleep(3)
    try:
        album_name = driver.find_element(By.CSS_SELECTOR, ".name_title").text
        song_elements = driver.find_elements(By.CSS_SELECTOR, "li[id^='itemSong_']")
        album_songs = [(album_name,
                        s.find_element(By.CSS_SELECTOR, "meta[itemprop='name']").get_attribute("content"),
                        s.find_element(By.CSS_SELECTOR, "meta[itemprop='url']").get_attribute("content"))
                        for s in song_elements]
        return album_songs
    except Exception as e:
        print(f"L·ªói khi l·∫•y b√†i h√°t t·ª´ album {album_url}: {e}")
    return []

driver = webdriver.Chrome()
artist = "Ng·∫Øn"

artist_page = search_artist_nhaccuatui(driver, artist)
if artist_page:
    print(f"Trang ngh·ªá sƒ©: {artist_page}")

    artist_song_page = artist_page.replace(".html", ".bai-hat.html")
    artist_album_page = artist_page.replace(".html", ".playlist.html")

    artist_songs = get_artist_songs(driver, artist_song_page)
    print(pd.DataFrame(artist_songs, columns=["Album", "T√™n b√†i h√°t", "Link b√†i h√°t"]))
    
    albums = get_artist_albums(driver, artist_album_page)
    print(f"T√¨m th·∫•y {len(albums)} album.")
    artist_songs = [(song_name, song_name, song_link) for _, song_name, song_link in artist_songs]
    all_songs = artist_songs[:]
    for album_link in albums:
        album_songs = get_album_songs(driver, album_link)
        all_songs.extend(album_songs)

    df_nct = pd.DataFrame(all_songs, columns=["album_name", "tracklist(danh s√°ch b√†i h√°t)", "Link b√†i h√°t"])
    df_nct.to_excel(f"{artist}_NhacCuaTui_Songs.xlsx", index=False)
    print(f"L·∫•y ƒë∆∞·ª£c {len(all_songs)} b√†i h√°t c·ªßa {artist}, l∆∞u v√†o file Excel!")
else:
    print("Kh√¥ng t√¨m th·∫•y ngh·ªá sƒ©.")
driver.quit()


ReadTimeoutError: HTTPConnectionPool(host='localhost', port=50730): Read timed out. (read timeout=120)

In [26]:
df_nct

Unnamed: 0,album_name,tracklist(danh s√°ch b√†i h√°t),Link b√†i h√°t
0,NG·∫ÆN | TrƒÉng Tr√≤n - phi√™n b·∫£n qu·∫©y tung n√≥c ƒë√™...,NG·∫ÆN | TrƒÉng Tr√≤n - phi√™n b·∫£n qu·∫©y tung n√≥c ƒë√™...,https://www.nhaccuatui.com/bai-hat/ngan-trang-...
1,T√åM H√ÄNH TINH KH√ÅC | V≈® C√ÅT T∆Ø·ªúNG x NG·∫ÆN,T√åM H√ÄNH TINH KH√ÅC | V≈® C√ÅT T∆Ø·ªúNG x NG·∫ÆN,https://www.nhaccuatui.com/bai-hat/tim-hanh-ti...
2,NG·∫ÆN | B√åNH D√ÇN | MV Official,NG·∫ÆN | B√åNH D√ÇN | MV Official,https://www.nhaccuatui.com/bai-hat/ngan-binh-d...
3,Em ·ªû N∆°i ƒê√¢u - Ng·∫Øn aka Lil Shadow H·∫£i Ph√≤ng S...,Em ·ªû N∆°i ƒê√¢u - Ng·∫Øn aka Lil Shadow H·∫£i Ph√≤ng S...,https://www.nhaccuatui.com/bai-hat/em-o-noi-da...
4,NG·∫ÆN | N∆†I TAO SINH RA | MV OFFICIAL | Prod - ...,NG·∫ÆN | N∆†I TAO SINH RA | MV OFFICIAL | Prod - ...,https://www.nhaccuatui.com/bai-hat/ngan-noi-ta...
...,...,...,...
89,"em l√† em b√© iu (SS x DJ AM Remix) - Olew, Ng·∫Øn",em l√† em b√© iu (SS x DJ AM Remix),https://www.nhaccuatui.com/bai-hat/em-la-em-be...
90,"em l√† em b√© iu (Single) - Olew, Ng·∫Øn",em l√† em b√© iu,https://www.nhaccuatui.com/bai-hat/em-la-em-be...
91,"V∆∞·ªùn Hoa Con C√° (Lofi) - Olew, Ng·∫Øn, meChill",V∆∞·ªùn Hoa Con C√° (Lofi),https://www.nhaccuatui.com/bai-hat/vuon-hoa-co...
92,"C·∫°m B·∫´y (Single) - Dick, Ng·∫Øn, Ziog",C·∫°m B·∫´y,https://www.nhaccuatui.com/bai-hat/cam-bay-dic...


In [7]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# import time
# import pandas as pd

# def search_nhaccuatui(artist_name):
#     driver = webdriver.Chrome()
    
#     search_url_songs = f"https://www.nhaccuatui.com/tim-kiem/bai-hat?q={artist_name}&b=keyword&l=tat-ca&s=default"
#     search_url_playlists = f"https://www.nhaccuatui.com/tim-kiem/playlist?q={artist_name}&b=keyword&l=tat-ca&s=default"
    
#     driver.get(search_url_songs)
#     time.sleep(3)
    
#     song_list = []
    
#     while True:
#         songs = driver.find_elements(By.CSS_SELECTOR, ".box_info h3.title_song a")
#         artists = driver.find_elements(By.CSS_SELECTOR, ".box_info h4.singer_song")
        
#         for song, artist in zip(songs, artists):
#             artist_names = ", ".join([a.text for a in artist.find_elements(By.TAG_NAME, "a")])
#             song_list.append((song.text, song.get_attribute("href"), artist_names))
        
#         try:
#             next_button = driver.find_element(By.CSS_SELECTOR, "a.number[rel='next']")
#             driver.execute_script("arguments[0].click();", next_button)
#             time.sleep(3)
#         except:
#             break
    
#     driver.get(search_url_playlists)
#     time.sleep(3)
    
#     checked_playlists = set()
#     playlist_links = []
    
#     while True:
#         try:
#             playlists = driver.find_elements(By.CSS_SELECTOR, ".box_info h3.title_song a")
#             for p in playlists:
#                 link = p.get_attribute("href")
#                 if link and link not in checked_playlists:
#                     playlist_links.append(link)
#                     checked_playlists.add(link)
#         except Exception as e:
#             print(f"L·ªói khi l·∫•y danh s√°ch playlist: {e}")
        
#         try:
#             next_button = driver.find_element(By.CSS_SELECTOR, "a.number[rel='next']")
#             driver.execute_script("arguments[0].click();", next_button)
#             time.sleep(3)
#         except:
#             break
    
#     playlist_list = []
    
#     for link in playlist_links:
#         retry = 3
#         while retry > 0:
#             try:
#                 driver.get(link)
#                 time.sleep(3)
                
#                 album_name = driver.find_element(By.CSS_SELECTOR, ".name_title").text
                
#                 song_elements = driver.find_elements(By.CSS_SELECTOR, "li[id^='itemSong_']")
#                 playlist_songs = [(s.find_element(By.CSS_SELECTOR, "meta[itemprop='name']").get_attribute("content"),
#                                    s.find_element(By.CSS_SELECTOR, "meta[itemprop='url']").get_attribute("content"))
#                                    for s in song_elements]
                
#                 playlist_list.append((album_name, playlist_songs, link))
#                 break
#             except Exception as e:
#                 print(f"L·ªói khi l·∫•y d·ªØ li·ªáu playlist {link}: {e}")
#                 driver.refresh()
#                 time.sleep(3)
#                 retry -= 1
    
#     driver.quit()
#     return song_list, playlist_list

# if __name__ == "__main__":
#     artist = "MCK"
#     songs, playlists = search_nhaccuatui(artist)
    
#     data = []
    
#     for song_title, song_link, _ in songs:
#         data.append([song_title, song_title, song_link])
    
#     for album_name, playlist_songs, link in playlists:
#         for title, song_link in playlist_songs:
#             data.append([album_name, title, song_link])
    
#     df_nct = pd.DataFrame(data, columns=["album_name", "tracklist(danh s√°ch b√†i h√°t)", "Nhaccuatui"])
    
#     # L∆∞u v√†o file CSV
#     df_nct.to_excel(f"{artist}_NhacCuaTui.xlsx", index=False)
    
                


In [None]:
import pandas as pd
from fuzzywuzzy import process

def normalize_text(text):
    """Chu·∫©n h√≥a vƒÉn b·∫£n: lo·∫°i b·ªè kho·∫£ng tr·∫Øng th·ª´a, ch·ªØ th∆∞·ªùng, b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát."""
    return text.strip().lower().title() if isinstance(text, str) else text  

# üóÇ Chu·∫©n h√≥a d·ªØ li·ªáu ƒë·∫ßu v√†o
df_final['norm_album_name'] = df_final['album_name'].apply(normalize_text)
df_nct['norm_album_name'] = df_nct['album_name'].apply(normalize_text)
df_final['norm_tracklist'] = df_final['tracklist(danh s√°ch b√†i h√°t)'].apply(normalize_text)
df_nct['norm_tracklist'] = df_nct['tracklist(danh s√°ch b√†i h√°t)'].apply(normalize_text)

def find_best_match(value, choices, threshold=85):  
    """Kh·ªõp d·ªØ li·ªáu m·ªù ƒë·ªÉ t√¨m t√™n gi·ªëng nh·∫•t"""
    if pd.isna(value) or not value:  
        return None, 0  
    match, score = process.extractOne(value, choices) if choices else (None, 0)
    return (match if score >= threshold else None), score  

# üõ† T·∫°o danh s√°ch ƒë·ªÉ fuzzy match
album_choices = df_nct['norm_album_name'].dropna().unique().tolist()
tracklist_choices = df_nct['norm_tracklist'].dropna().unique().tolist()

# üîç √Åp d·ª•ng fuzzy matching
df_final[['matched_album', 'album_score']] = df_final['norm_album_name'].apply(
    lambda x: pd.Series(find_best_match(x, album_choices))
)
df_final[['matched_tracklist', 'tracklist_score']] = df_final['norm_tracklist'].apply(
    lambda x: pd.Series(find_best_match(x, tracklist_choices))
)

# üîó G·ªôp d·ªØ li·ªáu d·ª±a tr√™n k·∫øt qu·∫£ matching
df_merged = df_final.merge(
    df_nct, left_on='matched_album', right_on='norm_album_name', how='left', suffixes=('_final', '_nct')
)

# üõ† ƒê·ªïi t√™n c·ªôt h·ª£p l√Ω
df_merged.rename(columns={
    'album_name_final': 'album_name',
    'tracklist(danh s√°ch b√†i h√°t)_final': 'tracklist(danh s√°ch b√†i h√°t)',
    'Link b√†i h√°t': 'Nhaccuatui'
}, inplace=True)

# üìå Ch·ªçn v√† s·∫Øp x·∫øp l·∫°i th·ª© t·ª± c·ªôt
column_order = [
    "album_name", "tracklist(danh s√°ch b√†i h√°t)",
    "Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify)", "Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3)",
    "album_type", "Ng√†y ph√°t h√†nh tr√™n Spotify", "Ng√†y ph√°t h√†nh tr√™n ZingMP3",
    "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify)", "Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3)",
    "Cung c·∫•p b·ªüi(ZingMP3)", "Cung c·∫•p b·ªüi(Spotify)",
    "M√£ ƒë·ªãnh danh album zingmp3", "ZingMP3", "Spotify", "Nhaccuatui"
]

existing_columns = [col for col in column_order if col in df_merged.columns]
df_merged = df_merged[existing_columns]
df_merged.drop_duplicates(subset=["album_name", "tracklist(danh s√°ch b√†i h√°t)"], inplace=True)

df_merged.head()




Unnamed: 0,album_name,tracklist(danh s√°ch b√†i h√°t),Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify),Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3),album_type,Ng√†y ph√°t h√†nh tr√™n Spotify,Ng√†y ph√°t h√†nh tr√™n ZingMP3,Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify),Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3),Cung c·∫•p b·ªüi(ZingMP3),Cung c·∫•p b·ªüi(Spotify),M√£ ƒë·ªãnh danh album zingmp3,ZingMP3,Spotify,Nhaccuatui
0,120bpm (ft. long m·ªông g√†),120Bpm (ft. Long M·ªông G√†),Ng·∫Øn,,Single,10/03/2023,,Antoneus Maximus,,,MaiDao Music,,,https://open.spotify.com/track/3QSw9fOs1Qe14tg...,https://www.nhaccuatui.com/bai-hat/120bpm-ngan...
1,9x,9X,DLBlack,"DLblack, Ng·∫Øn, M√™k Team",Single,04/05/2020,04/05/2020,"DLBlack, M√™k Team","DLblack, Ng·∫Øn, M√™k Team",MIXUS,Zing MP3,60BODZB8,https://zingmp3.vn/bai-hat/9X-DLblack-Ngan-Mek...,https://open.spotify.com/track/2Gj8BVWfiqHTykY...,
2,anh mu·ªën m√¨nh nh∆∞ con thuy·ªÅn kia lao v√†o em r·ªì...,anh mu·ªën m√¨nh nh∆∞ con thuy·ªÅn kia lao v√†o em r·ªì...,Ng·∫Øn,"Ng·∫Øn, MHee",Single,26/02/2023,26/02/2023,Mhee,"Ng·∫Øn, MHee",LOOPS Music,LOOPS Music,6B9UOFI7,https://zingmp3.vn/bai-hat/anh-muon-minh-nhu-c...,https://open.spotify.com/track/4doZRO6lNutfus7...,https://www.nhaccuatui.com/bai-hat/anh-muon-mi...
3,b√¨nh d√¢n,B√¨nh D√¢n,Ng·∫Øn,,Single,03/09/2021,,,,,Yin Yang Media,,,https://open.spotify.com/track/3dhDxLqZScUdYDx...,https://www.nhaccuatui.com/bai-hat/binh-dan-ng...
4,b√¨nh d√¢n,B√¨nh D√¢n - Beat,Ng·∫Øn,,Single,09/03/2021,,,,,Yin Yang Media,,,https://open.spotify.com/track/0eof9V6sNRlPtiQ...,https://www.nhaccuatui.com/bai-hat/binh-dan-ng...


In [28]:
df_merged.to_excel("_songZingMP3+Spotify+NCT.xlsx",index=False)

In [None]:
# import pandas as pd
# from fuzzywuzzy import process

# # H√†m fuzzy matching ƒë·ªÉ chu·∫©n h√≥a ti√™u ƒë·ªÅ m√† kh√¥ng b·ªè d·∫•u ti·∫øng Vi·ªát
# def fuzzy_standardize_title(title, reference_titles, threshold=90):
#     if not title or pd.isna(title):
#         return ""
#     best_match = process.extractOne(title, reference_titles, score_cutoff=threshold)  # So kh·ªõp gi·ªØ nguy√™n d·∫•u
#     return best_match[0] if best_match else title  # N·∫øu c√≥ ti√™u ƒë·ªÅ gi·ªëng, tr·∫£ v·ªÅ ti√™u ƒë·ªÅ chu·∫©n

# # Gi·∫£ s·ª≠ df_ v√† df_nct l√† hai DataFrame ch·ª©a danh s√°ch b√†i h√°t t·ª´ c√°c ngu·ªìn kh√°c nhau
# # Kh√¥ng thay ƒë·ªïi d·∫•u, k√Ω t·ª± ƒë·∫∑c bi·ªát ho·∫∑c vi·∫øt hoa

# # L·∫•y danh s√°ch ti√™u ƒë·ªÅ duy nh·∫•t t·ª´ df_nct
# unique_titles_nct = df_nct["tracklist(danh s√°ch b√†i h√°t)"].dropna().unique()

# # √Åp d·ª•ng fuzzy matching ƒë·ªÉ chu·∫©n h√≥a ti√™u ƒë·ªÅ tr√™n df_
# df_final["final_title"] = df_final["tracklist(danh s√°ch b√†i h√°t)"].apply(lambda x: fuzzy_standardize_title(x, unique_titles_nct))

# # G·ªôp d·ªØ li·ªáu t·ª´ NCT v√†o df_
# df_merged_nct = pd.merge(df_final, df_nct, left_on="final_title", right_on="tracklist(danh s√°ch b√†i h√°t)", how="outer", suffixes=("", "_NCT"))

# # N·∫øu thi·∫øu album_name, thay b·∫±ng NCT
# df_merged_nct["album_name"] = df_merged_nct["album_name"].fillna(df_merged_nct["album_name_NCT"])

# # N·∫øu thi·∫øu track title, thay b·∫±ng NCT
# df_merged_nct["tracklist(danh s√°ch b√†i h√°t)"] = df_merged_nct["tracklist(danh s√°ch b√†i h√°t)"].fillna(df_merged_nct["tracklist(danh s√°ch b√†i h√°t)_NCT"])
# # df_merged_nct["Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3)"] = df_merged_nct["Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3)"].fillna(artist_name)
# # df_merged_nct["Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify)"] = df_merged_nct["Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify)"].fillna(artist_name)
# # X√≥a c√°c c·ªôt kh√¥ng c·∫ßn thi·∫øt
# df_final = df_merged_nct.drop(columns=["album_name_NCT","final_title","tracklist(danh s√°ch b√†i h√°t)_NCT"])

# # Lo·∫°i b·ªè c√°c b·∫£n ghi tr√πng l·∫∑p
# df_final = df_final.drop_duplicates(subset=["tracklist(danh s√°ch b√†i h√°t)", "album_name"])

# # S·∫Øp x·∫øp theo t√™n album
# df_final = df_final.sort_values(by="album_name", ascending=True)

# # Xu·∫•t ra file Excel
# df_final.to_excel("songSpot+ZingMP3+NCT.xlsx", index=False)

# # Hi·ªÉn th·ªã 5 d√≤ng ƒë·∫ßu ti√™n
# df_final.head()

Unnamed: 0,album_name,Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(Spotify),Album artist (ngh·ªá sƒ© s·ªü h·ªØu album)*(ZingMP3),album_type,tracklist(danh s√°ch b√†i h√°t),Ng√†y ph√°t h√†nh tr√™n Spotify,Ng√†y ph√°t h√†nh tr√™n ZingMP3,Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(Spotify),Song artist(ngh·ªá sƒ© tham gia b√†i h√°t)(ZingMP3),Cung c·∫•p b·ªüi(ZingMP3),...,album_score,matched_tracklist,tracklist_score,combined,matched_combined,match_score,Link b√†i h√°t,norm_album_name_NCT,norm_tracklist_NCT,combined_NCT
0,120Bpm (ft. Long M·ªông G√†),Ng·∫Øn,,Single,120Bpm (ft. Long M·ªông G√†),10/03/2023,,Antoneus Maximus,,,...,90.0,120Bpm,90.0,120Bpm (Ft. Long M·ªông G√†) - 120Bpm (Ft. Long M...,120Bpm - 120Bpm,86.0,https://www.nhaccuatui.com/bai-hat/120bpm-ngan...,120Bpm,120Bpm,120Bpm - 120Bpm
1,9X,DLBlack,"DLblack, Ng·∫Øn, M√™k Team",Single,9X,04/05/2020,04/05/2020,"DLBlack, M√™k Team","DLblack, Ng·∫Øn, M√™k Team",MIXUS,...,45.0,,45.0,9X - 9X,,51.0,,,,
4,B√¨nh D√¢n,Ng·∫Øn,"Ng·∫Øn, Monkieq",Single,B√¨nh D√¢n,03/09/2021,10/09/2021,,"Ng·∫Øn, Monkieq",MIXUS,...,100.0,B√¨nh D√¢n,100.0,B√¨nh D√¢n - B√¨nh D√¢n,B√¨nh D√¢n - B√¨nh D√¢n,100.0,https://www.nhaccuatui.com/bai-hat/binh-dan-ng...,B√¨nh D√¢n,B√¨nh D√¢n,B√¨nh D√¢n - B√¨nh D√¢n
5,B√¨nh D√¢n,Ng·∫Øn,"Ng·∫Øn, Monkieq",Single,B√¨nh D√¢n - Beat,09/03/2021,10/09/2021,,"Ng·∫Øn, Monkieq",MIXUS,...,100.0,B√¨nh D√¢n,90.0,B√¨nh D√¢n - B√¨nh D√¢n - Beat,B√¨nh D√¢n - B√¨nh D√¢n,95.0,https://www.nhaccuatui.com/bai-hat/binh-dan-ng...,B√¨nh D√¢n,B√¨nh D√¢n,B√¨nh D√¢n - B√¨nh D√¢n
176,B·ª•i Thi√™n Th·∫ßn,Ng·∫Øn,Ng·∫Øn,EP,Xa Ngh√¨n Tr√πng M√¢y,27/07/2022,22/07/2022,,Ng·∫Øn,LOOPS Music,...,90.0,Xa Ngh√¨n Tr√πng M√¢y,100.0,B·ª•i Thi√™n Th·∫ßn - Xa Ngh√¨n Tr√πng M√¢y,Xa Ngh√¨n Tr√πng M√¢y - Xa Ngh√¨n Tr√πng M√¢y,95.0,https://www.nhaccuatui.com/bai-hat/xa-nghin-tr...,Xa Ngh√¨n Tr√πng M√¢y,Xa Ngh√¨n Tr√πng M√¢y,Xa Ngh√¨n Tr√πng M√¢y - Xa Ngh√¨n Tr√πng M√¢y
