# Utils

### Loading Files

In [None]:
import os
import pandas as pd

def load_and_concatenate_parquet_files(folder_path):
    """Load every ``.parquet`` file in a folder into a single DataFrame.

    Files are read in natural (number-aware) name order, so a file named
    ``x_part_2.parquet`` is read before ``x_part_10.parquet``. A plain
    lexicographic sort would put part 10 first and scramble the row order
    of data that was saved in ten or more parts.

    Args:
        folder_path: Directory that directly contains the ``.parquet`` files
            (sub-directories are not searched).

    Returns:
        One DataFrame with the rows of all files stacked in file order and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.parquet`` files.
    """
    import re  # local import: only needed for the sort key below

    def natural_key(name):
        # re.split with a capture group yields alternating text / digit runs;
        # the odd positions are the digit runs and are compared as integers.
        tokens = re.split(r"([0-9]+)", name)
        numeric_aware = [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]
        # The raw name breaks ties such as "a01" vs "a1" deterministically.
        return numeric_aware, name

    files = sorted(
        (f for f in os.listdir(folder_path) if f.endswith('.parquet')),
        key=natural_key,
    )
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .parquet files found in '{folder_path}'")

    frames = [pd.read_parquet(os.path.join(folder_path, f)) for f in files]
    return pd.concat(frames, ignore_index=True)


### Saving Files

In [None]:
import os
import math
import joblib
import pandas as pd

def save_dataframe_as_parquet(df:pd.DataFrame, folder_path="data", folder_name=None, always_overwrite=None, model_object=None):
    """Save a DataFrame as one or more Parquet part files inside a folder.

    The frame is first written to a temporary file to measure its on-disk
    size. If that exceeds 50 MB the frame is split by rows into roughly equal
    parts named ``<folder_name>_part_<i>.parquet``; otherwise a single part
    is written.

    Args:
        df: DataFrame to save.
        folder_path: Parent directory of the target folder.
        folder_name: Name of the target folder; asked for via ``input`` when
            empty or ``None``.
        always_overwrite: ``True`` writes into an existing folder, ``False``
            uses the first free ``<folder_name>_<n>`` folder instead, and
            ``None`` asks the user interactively when the folder exists.
        model_object: Optional mapping of name -> object; each value is
            dumped with joblib to ``<name>_model.pkl`` in the target folder.

    Returns:
        Path of the folder the files were written to.
    """
    max_part_size_mb = 50

    if not folder_name:
        folder_name = input("Enter the name of the folder to save the files: ")

    full_path = os.path.join(folder_path, folder_name)

    # Check if the folder already exists
    if os.path.exists(full_path) and always_overwrite is not True:
        if always_overwrite is None:
            overwrite = input(f"The folder '{folder_name}' already exists. Do you want to overwrite it? (yes/no): ")
            # Answering "yes" means overwrite (the previous comparison was inverted).
            always_overwrite = overwrite.strip().lower() == 'yes'
        if not always_overwrite:
            # Find the first unused "<folder_name>_<n>" sibling folder.
            suffix = 1
            new_folder_name = f"{folder_name}_{suffix}"
            while os.path.exists(os.path.join(folder_path, new_folder_name)):
                suffix += 1
                new_folder_name = f"{folder_name}_{suffix}"
            folder_name = new_folder_name
            full_path = os.path.join(folder_path, folder_name)

    os.makedirs(full_path, exist_ok=True)

    # Write once to a scratch file to learn the serialized size, and always
    # clean it up so a failed write does not leave a stray parquet file.
    temp_file = os.path.join(full_path, "temp.parquet")
    try:
        df.to_parquet(temp_file)
        file_size = os.path.getsize(temp_file) / (1024 * 1024)  # Size in MB
    finally:
        if os.path.exists(temp_file):
            os.remove(temp_file)

    if file_size > max_part_size_mb and len(df) > 0:
        row_split = math.ceil(len(df) / math.ceil(file_size / max_part_size_mb))
        # Recompute the part count from the chunk size so that no empty
        # trailing parts are written when there are fewer rows than splits.
        num_splits = math.ceil(len(df) / row_split)
    else:
        num_splits = 1
        row_split = len(df)

    for i in range(num_splits):
        start_row = i * row_split
        end_row = min((i + 1) * row_split, len(df))
        split_file_name = os.path.join(full_path, f"{folder_name}_part_{i + 1}.parquet")
        df.iloc[start_row:end_row].to_parquet(split_file_name)

    # When writing into an existing folder, drop higher-numbered parts left
    # over from an earlier, larger save; otherwise a later load of the folder
    # would mix old and new rows.
    stale_index = num_splits + 1
    while True:
        stale_file = os.path.join(full_path, f"{folder_name}_part_{stale_index}.parquet")
        if not os.path.exists(stale_file):
            break
        os.remove(stale_file)
        stale_index += 1

    if model_object:
        for key, value in model_object.items():
            joblib.dump(value, os.path.join(full_path, f"{key}_model.pkl"))

    print(f"Dataframe saved in {num_splits} files under the folder: {full_path}")

    return full_path

# Example usage:
# save_dataframe_as_parquet(df=kmeans_cluster, folder_path="data", folder_name="kmeans_clustered_subset", always_overwrite=False)


# Code for Clustering

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import hdbscan
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

def reduce_data(data, dimensions, metric_columns):
    """Run PCA on the selected columns of ``data``.

    Args:
        data: Frame holding at least the columns in ``metric_columns``.
        dimensions: Passed to ``PCA`` as ``n_components``.
        metric_columns: Columns of ``data`` to feed into the PCA.

    Returns:
        The output of ``PCA.fit_transform`` on ``data[metric_columns]``.
    """
    features = data[metric_columns]
    # A new PCA is fitted on every call; the fitted model is not returned.
    return PCA(n_components=dimensions).fit_transform(features)

def normalize_data(data:pd.DataFrame, metric_columns):
    """Min-max scale the metric columns of ``data``.

    Only the scaled metric columns are returned; the other columns of
    ``data`` are not part of the result. (An earlier version also merged the
    remaining columns back in but discarded that merge, so the wasted work
    was removed without changing the returned value.)

    Args:
        data: Frame holding at least the columns in ``metric_columns``.
        metric_columns: List of column names to scale.

    Returns:
        A new DataFrame with the scaled metric columns and a fresh default
        index, so row ``i`` corresponds to the ``i``-th row of ``data`` by
        position rather than by index label.
    """
    numeric_columns = data[metric_columns]
    scaled_values = MinMaxScaler().fit_transform(numeric_columns)
    return pd.DataFrame(scaled_values, columns=numeric_columns.columns)

def cluster_with_kmeans(data_for_clustering:pd.DataFrame, n_clusters=5, random_state=42):
    """Fit KMeans on the given features and label every row.

    Args:
        data_for_clustering: Feature matrix to cluster.
        n_clusters: Number of clusters to form (defaults to the previously
            hard-coded 5).
        random_state: Seed passed to ``KMeans`` (defaults to the previously
            hard-coded 42).

    Returns:
        Tuple ``(labels, model)``: the result of ``fit_predict`` and the
        fitted ``KMeans`` instance.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans_labels = kmeans.fit_predict(data_for_clustering)
    return kmeans_labels, kmeans

def cluster_with_hdbscan(data_for_clustering:pd.DataFrame, min_cluster_size=10):
    """Fit HDBSCAN on the given features and label every row.

    Args:
        data_for_clustering: Feature matrix to cluster.
        min_cluster_size: Passed to ``hdbscan.HDBSCAN`` (defaults to the
            previously hard-coded 10).

    Returns:
        Tuple ``(labels, clusterer)``: the result of ``fit_predict`` and the
        fitted clusterer. The clusterer is created with
        ``prediction_data=True``.
    """
    hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    hdbscan_labels = hdbscan_clusterer.fit_predict(data_for_clustering)
    return hdbscan_labels, hdbscan_clusterer

def run_kmeans_and_hdbscan(original_data:pd.DataFrame, cluster_data:pd.DataFrame):
    """Cluster with KMeans, then run HDBSCAN separately inside each KMeans cluster.

    Rows of ``original_data`` and ``cluster_data`` are matched by position,
    not by index label, so both must list the same rows in the same order.

    Args:
        original_data: Frame whose rows receive the cluster labels. It is
            not modified; the labels are added to a copy.
        cluster_data: Feature matrix used for clustering. A DataFrame or a
            plain 2-D array (such as PCA output) is accepted.

    Returns:
        Tuple ``(all_cluster_results, kmeans, hdbscan_models)``:
        ``original_data`` rows grouped by KMeans cluster with added
        ``kmeans_cluster`` and ``hdbscan_cluster`` columns and a fresh index,
        the fitted KMeans model, and a dict mapping each KMeans cluster label
        to its fitted HDBSCAN clusterer.

    Raises:
        ValueError: If the two inputs do not have the same number of rows.
    """
    if isinstance(cluster_data, pd.DataFrame):
        data_for_clustering = cluster_data.copy()
    else:
        # A bare array cannot take the string-named label column below.
        data_for_clustering = pd.DataFrame(cluster_data)
        data_for_clustering.columns = data_for_clustering.columns.astype(str)

    if len(data_for_clustering) != len(original_data):
        raise ValueError(
            f"original_data has {len(original_data)} rows but cluster_data has {len(data_for_clustering)}"
        )

    kmeans_labels, kmeans = cluster_with_kmeans(data_for_clustering)

    data_for_clustering["kmeans_cluster"] = kmeans_labels
    # reset_index returns a new frame, so the caller's DataFrame does not
    # silently gain a "kmeans_cluster" column.
    labelled_original = original_data.reset_index(drop=True)
    labelled_original["kmeans_cluster"] = kmeans_labels
    print(f"KMeans_cluster distro: {data_for_clustering['kmeans_cluster'].value_counts()}")

    hdbscan_cluster_df = []
    hdbscan_models = {}
    for cluster, data in data_for_clustering.groupby("kmeans_cluster"):
        hdbscan_cluster_labels, hdbscan_clusterer = cluster_with_hdbscan(data.drop(columns=["kmeans_cluster"]))
        # Boolean selection keeps the original row order, which matches the
        # within-group order of the groupby above.
        current_original_data = labelled_original[labelled_original["kmeans_cluster"] == cluster].reset_index(drop=True)
        current_original_data["hdbscan_cluster"] = hdbscan_cluster_labels
        hdbscan_cluster_df.append(current_original_data)
        hdbscan_models[cluster] = hdbscan_clusterer
        print(f"HDBSCAN_cluster distro for kmeans {cluster}: {current_original_data['hdbscan_cluster'].value_counts()}")

    all_cluster_results = pd.concat(hdbscan_cluster_df).reset_index(drop=True)

    return all_cluster_results, kmeans, hdbscan_models

def cluster_data(original_data:pd.DataFrame, subset_fraction=None, kmeans=True, use_hdbscan=True, should_normalize_data=True, pca_dimensions=None, columns_to_use=None):
    """Cluster a (sub)sample of the data and save labelled rows plus models to disk.

    Depending on the flags this runs KMeans followed by HDBSCAN inside each
    KMeans cluster, KMeans only, or HDBSCAN only. The labelled frame and the
    fitted model(s) are written under ``data/`` via
    ``save_dataframe_as_parquet`` and joblib.

    Args:
        original_data: Input frame; it is never modified.
        subset_fraction: If truthy, cluster a random sample of this fraction
            of the rows instead of all rows.
        kmeans: Whether to run KMeans.
        use_hdbscan: Whether to run HDBSCAN.
        should_normalize_data: Min-max scale the metric columns first.
        pca_dimensions: If truthy, reduce the metric columns with PCA to this
            many components before clustering.
        columns_to_use: Metric columns to cluster on; defaults to the audio
            feature columns listed below.

    Returns:
        The clustered rows with the added label column(s).

    Raises:
        ValueError: If both ``kmeans`` and ``use_hdbscan`` are false.
    """
    if not (kmeans or use_hdbscan):
        # Previously this ran the whole preprocessing and then failed with an
        # UnboundLocalError on ``result``.
        raise ValueError("At least one of 'kmeans' or 'use_hdbscan' must be enabled")

    if subset_fraction:
        original_data = original_data.sample(frac=subset_fraction)
    else:
        # Work on a private copy so label columns are never added to the
        # caller's frame.
        original_data = original_data.copy()

    data_for_clustering = original_data.reset_index(drop=True)

    metric_columns = columns_to_use if columns_to_use else ["danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "time_signature"]

    if should_normalize_data:
        data_for_clustering = normalize_data(data_for_clustering, metric_columns=metric_columns)

    if pca_dimensions:
        # reduce_data requires the metric columns; the call used to omit them
        # and raised TypeError whenever PCA was requested.
        reduced = reduce_data(data_for_clustering, pca_dimensions, metric_columns)
        # Wrap the array so downstream code can add named label columns.
        data_for_clustering = pd.DataFrame(reduced, columns=[f"pc_{i + 1}" for i in range(reduced.shape[1])])
    else:
        data_for_clustering = data_for_clustering[metric_columns]

    if kmeans and use_hdbscan:
        result, kmeans_model, hdbscan_models = run_kmeans_and_hdbscan(original_data, data_for_clustering)
        folder_path = save_dataframe_as_parquet(result, folder_path="data", folder_name="kmeans_hdbscan_clustered_subset", always_overwrite=False, model_object={"kmeans": kmeans_model})
        for key, value in hdbscan_models.items():
            joblib.dump(value, f'{folder_path}/hdbscan_model_{key}.pkl')

    elif kmeans:
        kmeans_labels, kmeans_model = cluster_with_kmeans(data_for_clustering)
        # Labels are positional, matching the row order of original_data.
        original_data["kmeans_cluster"] = kmeans_labels
        save_dataframe_as_parquet(original_data, folder_path="data", folder_name="kmeans_clustered_subset", always_overwrite=False, model_object={"kmeans": kmeans_model})
        print(f"KMeans_cluster distro: {original_data['kmeans_cluster'].value_counts()}")
        result = original_data

    else:
        hdbscan_labels, hdbscan_clusterer = cluster_with_hdbscan(data_for_clustering)
        original_data["hdbscan_cluster"] = hdbscan_labels
        save_dataframe_as_parquet(original_data, folder_path="data", folder_name="hdbscan_clustered_subset", always_overwrite=False, model_object={"hdbscan": hdbscan_clusterer})
        print(f"HDBSCAN_cluster distro: {original_data['hdbscan_cluster'].value_counts()}")
        result = original_data

    return result

# Load the preprocessed parquet parts, cluster a 2% random sample with
# HDBSCAN only (no KMeans, no PCA), then show the labelled rows.
preprocessed_data = load_and_concatenate_parquet_files(
    "data/preprocessed_spotify_data"
)
clustered_data = cluster_data(
    preprocessed_data,
    subset_fraction=0.02,
    kmeans=False,
    use_hdbscan=True,
    should_normalize_data=True,
    pca_dimensions=None,
    columns_to_use=None,
)
# NOTE(review): `display` is presumably the IPython/Jupyter helper; it is not
# imported in the visible cells.
display(clustered_data)