In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the embedding matrix from disk (presumably already PCA-reduced, going
# by the file name -- confirm against the notebook that produced it).
pca_embeddings = np.load('embeddings/pca_embeddings.npy')
# NOTE(review): this bare expression is not the last line of the cell, so the
# notebook does not display the shape.
pca_embeddings.shape

# Standardise every column with scikit-learn's StandardScaler (zero mean, unit
# variance by default). `scaled_embeddings` is the array the plotting and
# silhouette code in this notebook reads.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(pca_embeddings)

def plot_clusters(labels, title):
    """
    Scatter-plot the first two columns of ``scaled_embeddings``, coloured by label.

    Parameters:
    - labels: array-like, one value per point; passed to matplotlib as the
      colour array and mapped through the 'viridis' colormap.
    - title: str, title drawn above the plot.

    Reads the notebook-level ``scaled_embeddings`` array, draws on the current
    pyplot figure and shows it. Returns nothing.
    """
    first_component = scaled_embeddings[:, 0]
    second_component = scaled_embeddings[:, 1]

    plt.scatter(first_component, second_component, c=labels, cmap='viridis')

    # Axis labels and title are independent of each other; set them after drawing.
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title(title)

    plt.show()

In [None]:
import pandas as pd
# Load the abstracts table; later cells attach a 'Cluster' column to this DataFrame.
abstracts = pd.read_csv('data/arxiv_abstracts.csv')

In [None]:
from sklearn.metrics import silhouette_score
# Cluster assignments produced elsewhere and saved as .npy arrays.
cluster_labels_db = np.load('cluster_labels_db.npy')
cluster_labels_km = np.load('cluster_labels_km.npy')
cluster_labels_hdb = np.load('cluster_labels_hdb.npy')


def _silhouette_ignoring_noise(embeddings, labels, noise_label=-1):
    """
    Silhouette score computed only over points that belong to a real cluster.

    Density-based clusterers mark unassigned points with ``noise_label`` (-1).
    Passing those labels straight to ``silhouette_score`` treats the noise
    points as one more cluster, which distorts the score.

    Parameters:
    - embeddings: 2-D numpy array, one row per point.
    - labels: array-like, one cluster label per row of ``embeddings``.
    - noise_label: label value that marks noise points (default -1).

    Returns:
    - float silhouette score over the non-noise points, or NaN when the score
      is undefined for them.
    """
    labels = np.asarray(labels)
    assigned = labels != noise_label
    kept_labels = labels[assigned]
    n_clusters = np.unique(kept_labels).size

    # silhouette_score raises ValueError unless 2 <= n_clusters <= n_samples - 1.
    if n_clusters < 2 or n_clusters >= kept_labels.size:
        return float('nan')
    return silhouette_score(embeddings[assigned], kept_labels)


# Labels without any -1 entries are scored exactly as before; only label sets
# that contain noise points get a different (noise-free) score.
silhouette_km = _silhouette_ignoring_noise(scaled_embeddings, cluster_labels_km)
silhouette_db = _silhouette_ignoring_noise(scaled_embeddings, cluster_labels_db)
silhouette_hdb = _silhouette_ignoring_noise(scaled_embeddings, cluster_labels_hdb)
print(f"K-means Silhouette Score: {silhouette_km}")
print(f"DBSCAN Silhouette Score: {silhouette_db}")
print(f"HDBSCAN Silhouette Score: {silhouette_hdb}")

In [None]:
import pandas as pd

def save_clustered_abstracts(abstracts_path, cluster_labels, output_dir='data'):
    """
    Save abstracts to separate CSV files based on cluster labels.

    One ``cluster_<id>_abstracts.csv`` file is written per cluster, skipping
    rows labelled -1 (noise). A combined ``all_clusters_abstracts.csv`` holding
    every row, noise included, is written as well. Every file carries the
    label in a ``Cluster`` column.

    Parameters:
    - abstracts_path: str, path to the CSV file containing abstracts.
    - cluster_labels: array-like, one cluster label per abstract, in row order.
    - output_dir: str, directory to save the output CSV files; created if it
      does not exist yet.

    Raises:
    - ValueError: if the number of labels differs from the number of abstracts.
    """
    import os  # local import: this cell only imports pandas

    # Load abstracts
    abstracts = pd.read_csv(abstracts_path)

    if len(cluster_labels) != len(abstracts):
        raise ValueError(
            f"Got {len(cluster_labels)} cluster labels for {len(abstracts)} abstracts"
        )

    # Assign by position. A pandas Series would otherwise be aligned on its
    # index, which silently yields NaN or misplaced labels when the index is
    # not the default 0..n-1.
    abstracts['Cluster'] = pd.Series(cluster_labels).to_numpy()

    # DataFrame.to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Filter out noise points (-1) and group by cluster
    clusters = abstracts[abstracts['Cluster'] != -1].groupby('Cluster')

    # Iterate through each cluster and save abstracts
    for cluster_id, cluster_data in clusters:
        # Save abstracts belonging to this cluster
        cluster_filename = f'{output_dir}/cluster_{cluster_id}_abstracts.csv'
        cluster_data.to_csv(cluster_filename, index=False)
        print(f"Saved Cluster {cluster_id} abstracts to {cluster_filename}")

    # Also save a single file with every abstract and its label (noise rows included)
    all_clusters_filename = f'{output_dir}/all_clusters_abstracts.csv'
    abstracts.to_csv(all_clusters_filename, index=False)
    print(f"Saved all cluster abstracts to {all_clusters_filename}")

# Requires cluster_labels_hdb, which an earlier cell loads from 'cluster_labels_hdb.npy'.
abstracts_path = 'data/arxiv_abstracts.csv'
cluster_labels = cluster_labels_hdb  # swap in cluster_labels_db or cluster_labels_km to export another clustering

# Write one CSV per non-noise cluster plus a combined CSV into the default output_dir ('data').
save_clustered_abstracts(abstracts_path, cluster_labels)

In [None]:
import pandas as pd

# Tag the shared abstracts DataFrame with the labels from cluster_labels_hdb.
# NOTE(review): clustered_abstracts() overwrites this column on every call, so
# after this cell has run the column holds whichever labels were passed last.
abstracts['Cluster'] = cluster_labels_hdb

def clustered_abstracts(abstracts, cluster_labels, title, output_dir="data"):
    """
    Attach cluster labels to ``abstracts``, save the result, and return cluster sizes.

    Parameters:
    - abstracts: pandas DataFrame with one row per abstract. It is modified in
      place: a ``Cluster`` column is added or overwritten.
    - cluster_labels: array-like, one cluster label per row of ``abstracts``.
    - title: str, suffix of the output file name (``abstracts_<title>.csv``).
    - output_dir: str, directory for the CSV file; created if it does not exist.

    Returns:
    - pandas Series with the number of abstracts per cluster label
      (``value_counts`` of the ``Cluster`` column).
    """
    import os  # local import: this cell only imports pandas

    # Add cluster indices to the existing abstracts DataFrame
    abstracts["Cluster"] = cluster_labels

    # DataFrame.to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Save the updated DataFrame for later use
    abstracts.to_csv(f"{output_dir}/abstracts_{title}.csv", index=False)

    # Cluster distribution
    return abstracts["Cluster"].value_counts()

# Export each clustering to data/abstracts_<title>.csv and print its cluster-size distribution.
# Every call overwrites abstracts['Cluster'], so the DataFrame ends up holding cluster_labels_km.
print(clustered_abstracts(abstracts, cluster_labels_hdb, "cluster_hdb"))
print(clustered_abstracts(abstracts, cluster_labels_db, "cluster_db"))
print(clustered_abstracts(abstracts, cluster_labels_km, "cluster_km"))