In [None]:
import kagglehub
import shutil
import os

# Fetch the latest version of the dataset; kagglehub returns the local path
path = kagglehub.dataset_download(
    "asaniczka/top-spotify-songs-in-73-countries-daily-updated"
)
print("Path to dataset files:", path)

# Make sure the local ./data folder exists
target_dir = "./data"
os.makedirs(target_dir, exist_ok=True)

# Mirror every entry (files and sub-folders) from the download location into ./data
for entry in os.listdir(path):
    src = os.path.join(path, entry)
    dst = os.path.join(target_dir, entry)
    if not os.path.isdir(src):
        # Plain file: copy it together with its metadata
        shutil.copy2(src, dst)
    else:
        # Directory: copy recursively, tolerating an already-existing target
        shutil.copytree(src, dst, dirs_exist_ok=True)

print(f"Dataset copied to {target_dir}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sketch 
from dataprep.eda import plot, plot_correlation

In [None]:
# Load the CSV from ./data, parsing both date columns while reading
spotify_data = pd.read_csv(
    './data/universal_top_spotify_songs.csv',
    parse_dates=['snapshot_date', 'album_release_date'],
)
# Preview the first five rows (head() defaults to n=5)
spotify_data.head()

In [None]:
# Print a concise structural summary of the loaded frame
spotify_data.info()

In [None]:
# Summary statistics restricted to object-dtype columns, transposed so that
# each source column becomes one row
spotify_data.describe(include='O').transpose()

In [None]:
def describe_datetime_columns(df: pd.DataFrame, datetime_cols=None, top_n=10):
    """Summarise datetime columns of ``df``, one row per column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that holds the columns to describe.
    datetime_cols : str or iterable of str, optional
        Column name(s) to describe. When ``None``, every naive or
        timezone-aware datetime column of ``df`` is used.
    top_n : int, default 10
        Number of most frequent values stored under ``"top_n_values"``.

    Returns
    -------
    pd.DataFrame
        One row per described column with the statistics ``count``, ``min``,
        ``max``, ``mean``, ``median``, ``mode`` (list, as several values can
        tie), ``unique``, ``top``, ``freq``, ``top_n_values`` (value -> count
        dict) and ``range`` (``max - min``). A column without any non-null
        value only gets ``count = 0``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if datetime_cols is None:
        # Pick up timezone-aware columns as well as naive ones.
        datetime_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns
    elif isinstance(datetime_cols, str):
        # A bare column name would otherwise be iterated character by character.
        datetime_cols = [datetime_cols]

    summary = {}

    for col in datetime_cols:
        series = df[col].dropna()  # ignore missing values (NaT)

        if series.empty:
            summary[col] = {"count": 0}
            continue

        # Computed once and reused for both the extremes and the range.
        earliest = series.min()
        latest = series.max()

        # Frequency table, most frequent value first.
        vc = series.value_counts()

        summary[col] = {
            "count": series.count(),
            "min": earliest,
            "max": latest,
            "mean": series.mean(),
            "median": series.median(),
            "mode": list(series.mode()),  # several values may tie for the mode
            "unique": series.nunique(),
            # Most frequent value and how often it occurs
            "top": vc.index[0],
            "freq": vc.iloc[0],
            # The top_n most frequent values with their counts
            "top_n_values": vc.head(top_n).to_dict(),
            # Time span covered by the column
            "range": latest - earliest,
        }

    # Outer keys become columns in the constructor; transpose so that each
    # described column is a row.
    return pd.DataFrame(summary).T

In [None]:
# Leave the summary as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
describe_datetime_columns(spotify_data, ["snapshot_date", "album_release_date"])

In [None]:
# Count the missing values in each column
missing_per_column = spotify_data.isna().sum()
print(missing_per_column)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
spotify_data = spotify_data.drop_duplicates(keep="first")

In [None]:
# Drop rows that lack a value in any of these columns
required_columns = ['name', 'artists', 'country']
spotify_data = spotify_data.dropna(how="any", subset=required_columns)

In [None]:
# Replace missing album names with a fixed placeholder
album_names = spotify_data['album_name']
spotify_data['album_name'] = album_names.fillna("Unknown Album")