# Clustering

## Imports and Data Loading

In [None]:
import pandas as pd
import numpy as np
import asyncio
import nest_asyncio
import time
import os
import re
import random
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import umap
import scipy.sparse
from typing import List, Dict, Tuple, Any, Set, Optional
import inflect
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
import matplotlib.cm as cm
import pickle
import hdbscan
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from pandas.tseries.offsets import DateOffset
import textwrap

In [None]:
# --- File Paths ---
# Checkpoint inputs: the feature DataFrame and the pickled column-name lists.
DATA_FILE = "checkpoint_with_keywords.parquet"
VARIABLES_FILE = "checkpoint_variables.pkl"

# --- Pipeline Parameters ---
# Seed passed to PCA and to both UMAP reducers below.
RANDOM_STATE = 42

# PCA Configuration: number of principal components kept.
PCA_N_COMPONENTS = 150

# UMAP Configuration (for clustering)
UMAP_CLUSTERING_PARAMS = dict(
    n_neighbors=15,
    n_components=15,
    min_dist=0.0,
    metric='euclidean',
    random_state=RANDOM_STATE,
    verbose=True,
)

# UMAP Configuration (for 2D visualization)
UMAP_VISUALIZATION_PARAMS = dict(
    n_neighbors=30,
    n_components=2,
    min_dist=0.1,
    metric='euclidean',
    random_state=RANDOM_STATE,
    verbose=True,
)

# HDBSCAN Configuration
HDBSCAN_PARAMS = dict(
    min_cluster_size=300,
    min_samples=50,
    metric='euclidean',
    cluster_selection_method='eom',
    gen_min_span_tree=True,
)

In [None]:
# Read the checkpointed DataFrame from Parquet.
print(f"Loading data from '{DATA_FILE}'...")
df = pd.read_parquet(DATA_FILE, engine="pyarrow")
print(f"DataFrame loaded. Shape: {df.shape}")

# Read the pickled helper variables (lists of column names).
# NOTE(review): pickle can execute arbitrary code on load; only open checkpoint files you trust.
print(f"Loading feature lists from '{VARIABLES_FILE}'...")
with open(VARIABLES_FILE, "rb") as f:
    loaded_variables = pickle.load(f)

metadata_features = loaded_variables["metadata_features"]
# The domain names are kept as a NumPy array; the other two lists stay as loaded.
unique_domains = np.array(loaded_variables["unique_domains"])
keyword_cols_created = loaded_variables["keyword_cols_created"]

print("Helper variables loaded.")


In [None]:
# Build the list of feature columns used for clustering
print("Assembling the final feature set for clustering...")

# Drop any listed column that is not actually present in the DataFrame
available_columns = set(df.columns)
domain_features = [col for col in unique_domains if col in available_columns]
meta_features = [col for col in metadata_features if col in available_columns]
keyword_features = [col for col in keyword_cols_created if col in available_columns]

# Union of the three groups, de-duplicated and sorted
all_features = sorted({*meta_features, *domain_features, *keyword_features})

print(f"Total number of features: {len(all_features)}")
for group_label, group_columns in (
    ("Metadata & Area features", meta_features),
    ("Domain features", domain_features),
    ("Keyword features", keyword_features),
):
    print(f" - {group_label}: {len(group_columns)}")

In [None]:
# Select the feature columns into the matrix X
X = df[all_features]

# Missing values: replace each NaN with the median of its column.
if not X.isnull().to_numpy().any():
    print("No NaNs found in the feature matrix.")
else:
    print("NaNs found. Imputing with column medians...")
    column_medians = X.median()
    X = X.fillna(column_medians)

# scikit-learn works on arrays, so hand it a float32 NumPy matrix
print("Converting feature matrix to NumPy array...")
X_np = X.to_numpy(dtype=np.float32)
print(f"Final feature matrix 'X_np' created with shape: {X_np.shape}")

## Dimensionality Reduction and Clustering Pipeline


1. Scale (StandardScaler): Standardizes all features to have a mean of 0 and a standard deviation of 1. This is crucial because PCA and UMAP are sensitive to the scale of the data, and this step ensures that all features (e.g., number of authors, binary keywords) contribute equally.

2. PCA (Principal Component Analysis): Performs an initial, fast dimensionality reduction. It takes the thousands of scaled features and reduces them to a smaller, more manageable set by capturing the main linear patterns, which helps reduce noise and speeds up the next step significantly.

3. UMAP (Uniform Manifold Approximation and Projection): Takes the PCA-reduced data and performs a second, more sophisticated dimensionality reduction. UMAP is excellent at finding the complex, non-linear structure in the data, creating a low-dimensional representation (e.g., 15 components) that best preserves the true relationships between data points.

4. HDBSCAN (Hierarchical Density-Based Clustering): Performs the final clustering on the low-dimensional, structure-rich UMAP output. It identifies clusters based on density, allowing it to find groups of varying shapes and sizes and automatically identify noise points, without requiring the number of clusters to be specified in advance.

### Step 1: Scaling

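Scaling is a prerequisite for the PCA step that follows. Advantages of applying PCA to the scaled data before UMAP:
- Denoising: Removes a massive amount of noise, giving UMAP a cleaner signal to work with.
- Speed: Running UMAP on 150 components is dramatically faster than running it on thousands.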


In [None]:
print("--- Step 1: Scaling data with StandardScaler ---")
start_time = time.time()

# Fit the scaler on the full feature matrix and transform it in the same pass
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_np)

end_time = time.time()
scaling_seconds = end_time - start_time
print(f"Scaling completed in {scaling_seconds:.2f} seconds.")

### Step 2: PCA

In [None]:
print(f"\n--- Step 2: Applying PCA to reduce to {PCA_N_COMPONENTS} components ---")
start_time = time.time()

# Linear reduction of the scaled features to PCA_N_COMPONENTS dimensions
pca = PCA(n_components=PCA_N_COMPONENTS, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

end_time = time.time()
pca_seconds = end_time - start_time
print(f"PCA completed in {pca_seconds:.2f} seconds.")
print(f"Shape after PCA: {X_pca.shape}")

# Fraction of the total variance retained by the kept components
retained_variance = pca.explained_variance_ratio_.sum()
print(f"Total explained variance by {pca.n_components_} components: {retained_variance:.4f}")

### Step 3: UMAP

In [None]:
umap_target_dims = UMAP_CLUSTERING_PARAMS['n_components']
print(f"\n--- Step 3: Applying UMAP to reduce to {umap_target_dims} dimensions for clustering ---")
start_time = time.time()

# Non-linear reduction of the PCA output; this embedding is what HDBSCAN clusters
umap_reducer_clustering = umap.UMAP(**UMAP_CLUSTERING_PARAMS)
X_umap_clustering = umap_reducer_clustering.fit_transform(X_pca)

end_time = time.time()
umap_seconds = end_time - start_time
print(f"UMAP for clustering completed in {umap_seconds:.2f} seconds.")
print(f"Shape after UMAP: {X_umap_clustering.shape}")

### Step 4: HDBSCAN Clustering

In [None]:
print("\n--- Step 4: Applying HDBSCAN to find clusters ---")
start_time = time.time()

# Density-based clustering on the UMAP embedding; one label per row
clusterer = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)
cluster_labels = clusterer.fit_predict(X_umap_clustering)

end_time = time.time()
hdbscan_seconds = end_time - start_time
print(f"HDBSCAN completed in {hdbscan_seconds:.2f} seconds.")

### Add cluster labels to DataFrame

In [None]:
# Store the label of every row next to its data
df['cluster_label'] = cluster_labels

# Label -1 is treated as noise here: it is excluded from the cluster count
# and counted separately.
distinct_labels = set(cluster_labels)
n_clusters = len(distinct_labels - {-1})
n_noise = (cluster_labels == -1).sum()
noise_share_pct = n_noise / len(df) * 100

print("\n--- Clustering Results ---")
print(f"Number of clusters found: {n_clusters}")
print(f"Number of noise points: {n_noise} ({noise_share_pct:.2f}%)")

### Saving Results for Later Use

In [None]:
# --- Define file paths for saving ---
RESULTS_DF_FILE = "results_df_with_clusters.parquet"
RESULTS_VARS_FILE = "results_variables.pkl"

# Persist the DataFrame (now carrying 'cluster_label'); failures are reported, not raised
try:
    df.to_parquet(RESULTS_DF_FILE, engine="pyarrow")
except Exception as e:
    print(f"Error saving DataFrame: {e}")
else:
    print(f"Successfully saved DataFrame to '{RESULTS_DF_FILE}'")

# Summary values and the raw label array, pickled alongside the DataFrame
results_variables = dict(
    n_clusters=n_clusters,
    n_noise=n_noise,
    cluster_labels=cluster_labels,
    RANDOM_STATE=RANDOM_STATE,
)

try:
    with open(RESULTS_VARS_FILE, "wb") as f:
        pickle.dump(results_variables, f)
except Exception as e:
    print(f"Error saving variables: {e}")
else:
    print(f"Successfully saved variables to '{RESULTS_VARS_FILE}'")

### Reloading Results

In [None]:
import pandas as pd
import numpy as np
import asyncio
import nest_asyncio
import time
import os
import re
import random
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import umap
import scipy.sparse
from typing import List, Dict, Tuple, Any, Set, Optional
import inflect
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
import matplotlib.cm as cm
import pickle
import hdbscan
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from pandas.tseries.offsets import DateOffset
import textwrap

In [None]:
RESULTS_DF_FILE = "results_df_with_clusters.parquet"
RESULTS_VARS_FILE = "results_variables.pkl"


print("--- Loading pre-computed results from disk ---")


# 1. Load the DataFrame
try:
    df = pd.read_parquet(RESULTS_DF_FILE, engine="pyarrow")
except FileNotFoundError:
    print(f"ERROR: DataFrame file not found at '{RESULTS_DF_FILE}'. Please run the full pipeline first.")
except Exception as e:
    print(f"Error loading DataFrame: {e}")
else:
    print(f"Successfully loaded DataFrame from '{RESULTS_DF_FILE}'. Shape: {df.shape}")
    # Preview only the columns that exist. This file is written right after
    # clustering, before the 2D UMAP coordinates are added to the DataFrame, so
    # 'umap_x' / 'umap_y' are normally absent; selecting them unconditionally
    # raised a KeyError that was then misreported as a load failure.
    preview_columns = [
        col for col in ('id', 'title', 'cluster_label', 'umap_x', 'umap_y')
        if col in df.columns
    ]
    display(df[preview_columns].head(3))


# 2. Load the variables
# NOTE(review): pickle can execute arbitrary code on load; only open result files you trust.
try:
    with open(RESULTS_VARS_FILE, "rb") as f:
        loaded_results = pickle.load(f)

    # Re-assign to global variables so subsequent cells work correctly
    n_clusters = loaded_results['n_clusters']
    n_noise = loaded_results['n_noise']
    cluster_labels = loaded_results['cluster_labels']

    print(f"Successfully loaded variables from '{RESULTS_VARS_FILE}'")
    print(f"  - n_clusters: {n_clusters}")
    print(f"  - n_noise: {n_noise}")
    print(f"  - cluster_labels array shape: {cluster_labels.shape}")

except FileNotFoundError:
    print(f"ERROR: Variables file not found at '{RESULTS_VARS_FILE}'. Please run the full pipeline first.")
except Exception as e:
    print(f"Error loading variables: {e}")

## Visualization of Clusters

In [None]:
# --- Create 2D UMAP embedding for visualization ---
print("\n--- Creating 2D UMAP embedding for visualization ---")
start_time = time.time()
umap_reducer_viz = umap.UMAP(**UMAP_VISUALIZATION_PARAMS)
X_umap_viz = umap_reducer_viz.fit_transform(X_pca)
end_time = time.time()
viz_seconds = end_time - start_time
print(f"2D UMAP for visualization completed in {viz_seconds:.2f} seconds.")

# Keep both embedding axes on the DataFrame so they travel with each record
df['umap_x'], df['umap_y'] = X_umap_viz[:, 0], X_umap_viz[:, 1]

# %%
print("--- Generating interactive cluster plot ---")

# Plotting copy: string labels, with the noise label (-1) shown as 'Noise'
plot_df = df.copy()
plot_df['cluster_label_str'] = (
    plot_df['cluster_label'].astype(str).where(plot_df['cluster_label'] != -1, 'Noise')
)

# Per-label record counts, joined back onto every row for the legend text
cluster_sizes = (
    plot_df['cluster_label_str']
    .value_counts()
    .reset_index()
    .set_axis(['cluster_label_str', 'count'], axis=1)
)
plot_df = plot_df.merge(cluster_sizes, on='cluster_label_str')
plot_df['legend_entry'] = (
    plot_df['cluster_label_str'].str.cat(plot_df['count'].astype(str), sep=' (') + ')'
)

# Empty figure; traces are added in a later cell
fig = go.Figure()

### Saving UMAP Visualization

In [None]:

VIZ_DF_FILE = "df_with_2D_viz_coords.parquet"

print(f"--- Saving DataFrame with visualization coordinates to '{VIZ_DF_FILE}' ---")

# At this point 'df' holds the original data, the cluster labels and the 2D coordinates.
try:
    df.to_parquet(VIZ_DF_FILE, engine="pyarrow")
except Exception as e:
    print(f"An error occurred while saving the DataFrame: {e}")
else:
    print("Successfully saved the DataFrame.")
    print("You can now use the 'Load Visualization Data' block in future sessions to skip the UMAP steps.")


In [None]:
# Write every column name of 'df' to a text file, one name per line
with open("df_columns.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{col}\n" for col in df.columns)

### Loading UMAP Visualization

In [None]:
VIZ_DF_FILE = "df_with_2D_viz_coords.parquet"

# Only the columns used by the interactive plot are read from the file
columns_to_load = ['id', 'title', 'cluster_label', 'umap_x', 'umap_y']

print(f"--- Loading pre-computed visualization data from '{VIZ_DF_FILE}' ---")
print(f"Loading only the following columns: {columns_to_load}")

try:
    # Column-restricted read of the Parquet file
    df = pd.read_parquet(VIZ_DF_FILE, columns=columns_to_load, engine="pyarrow")
    print(f"Successfully loaded DataFrame. Shape: {df.shape}")

    # --- Recreate the 'plot_df' needed for the interactive plot ---
    print("\n--- Preparing data for Plotly ---")
    plot_df = df.copy()
    # String labels, with the noise label (-1) shown as 'Noise'
    plot_df['cluster_label_str'] = (
        plot_df['cluster_label'].astype(str).where(plot_df['cluster_label'] != -1, 'Noise')
    )

    # Per-label record counts, joined back onto every row for the legend text
    cluster_sizes = (
        plot_df['cluster_label_str']
        .value_counts()
        .reset_index()
        .set_axis(['cluster_label_str', 'count'], axis=1)
    )
    plot_df = plot_df.merge(cluster_sizes, on='cluster_label_str')
    plot_df['legend_entry'] = (
        plot_df['cluster_label_str'].str.cat(plot_df['count'].astype(str), sep=' (') + ')'
    )

    print("Plotting data is ready.")

    # --- Initialize the figure object ---
    # Traces are added and the figure is shown in the subsequent cells.
    fig = go.Figure()

except FileNotFoundError:
    print(f"ERROR: File not found at '{VIZ_DF_FILE}'.")
    print("Please ensure you have run the full pipeline and saved the results at least once.")
except Exception as e:
    print(f"An error occurred while loading or preparing the data: {e}")

In [None]:
# Plot clustered points first
clustered_data = plot_df[plot_df['cluster_label'] != -1].sort_values('cluster_label')

# Count the clusters from the data being plotted. The title previously read the
# global `n_clusters`, which does not exist when only the saved visualization
# file was loaded in a fresh session, so the cell failed with a NameError.
n_clusters_plotted = clustered_data['cluster_label'].nunique()

fig.add_trace(go.Scattergl(
    x=clustered_data['umap_x'],
    y=clustered_data['umap_y'],
    mode='markers',
    marker=dict(
        color=clustered_data['cluster_label'],  # numeric label drives the colorscale
        colorscale='Viridis',
        showscale=False,
        size=3,
        opacity=0.7
    ),
    # customdata columns are referenced by position in the hover template
    customdata=clustered_data[['id', 'title', 'cluster_label']],
    hovertemplate='<b>Title:</b> %{customdata[1]}<br>' +
                  '<b>ID:</b> %{customdata[0]}<br>' +
                  '<b>Cluster:</b> %{customdata[2]}<br>' +
                  'UMAP-X: %{x:.3f}<br>UMAP-Y: %{y:.3f}<extra></extra>',
    name='Clusters'
))

# Plot noise points on top, in grey
noise_data = plot_df[plot_df['cluster_label'] == -1]
fig.add_trace(go.Scattergl(
    x=noise_data['umap_x'],
    y=noise_data['umap_y'],
    mode='markers',
    marker=dict(
        color='lightgrey',
        size=2,
        opacity=0.4
    ),
    customdata=noise_data[['id', 'title']],
    hovertemplate='<b>Title:</b> %{customdata[1]}<br>' +
                  '<b>ID:</b> %{customdata[0]}<br>' +
                  '<b>Cluster:</b> Noise<br>' +
                  'UMAP-X: %{x:.3f}<br>UMAP-Y: %{y:.3f}<extra></extra>',
    name=f"Noise ({len(noise_data)})"
))

# Update layout
fig.update_layout(
    title=f'Global Clustering Results: {n_clusters_plotted} Clusters Found',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    height=800,
    legend_title_text='Cluster Labels',
    showlegend=True
)
fig.show()


# --- Bar chart: number of records per cluster (noise excluded) ---
plt.style.use('seaborn-v0_8-whitegrid')

# Sort by cluster ID so the bars appear in label order
cluster_counts = df[df['cluster_label'] != -1]['cluster_label'].value_counts().sort_index()

plt.figure(figsize=(16, 8))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, palette='viridis')
plt.title('Number of Records per Cluster (excluding noise)', fontsize=16)
plt.xlabel('Cluster ID', fontsize=12)
plt.ylabel('Number of Records', fontsize=12)

if len(cluster_counts) > 50:
    plt.xticks(rotation=90, fontsize=8)
    # Show every 5th tick label to avoid clutter
    for index, label in enumerate(plt.gca().get_xticklabels()):
        if index % 5 != 0:
            label.set_visible(False)
else:
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Set the plotting style
plt.style.use('seaborn-v0_8-whitegrid')

# Get the counts of records for each cluster, excluding noise points (-1)
cluster_sizes = df[df['cluster_label'] != -1]['cluster_label'].value_counts()

# --- Report the largest cluster ---
if not cluster_sizes.empty:
    # Find the ID and size of the largest cluster
    largest_cluster_id = cluster_sizes.idxmax()
    largest_cluster_size = cluster_sizes.max()

    # The largest cluster is NOT dropped: every cluster goes into the histogram.
    sizes_for_plotting = cluster_sizes

    # The previous message claimed the cluster had been excluded, which the code never did.
    print(f"Largest cluster (ID: {largest_cluster_id}) has {largest_cluster_size} records; it is kept in the histogram.")
else:
    print("No clusters to plot.")
    # Explicit dtype: an untyped empty Series triggers a warning on some pandas versions
    sizes_for_plotting = pd.Series(dtype='int64')

# --- Create the histogram of cluster sizes ---
if not sizes_for_plotting.empty:
    plt.figure(figsize=(14, 7))

    # Use stat='percent' to get percentage on the y-axis
    sns.histplot(data=sizes_for_plotting, binwidth=500, kde=False, color='royalblue', stat='percent')

    plt.title('Distribution of Cluster Sizes', fontsize=16)
    plt.xlabel('Cluster Size (Number of Records)', fontsize=12)
    plt.ylabel('Percentage of Clusters (%)', fontsize=12)
    plt.grid(axis='y', alpha=0.75)

    plt.tight_layout()
    plt.show()



In [None]:
# Set the plotting style
plt.style.use('seaborn-v0_8-whitegrid')

# Size band (inclusive) used for the share calculation, the shaded region and
# the annotation. Previously the filter and plot used 300-800 while the comment
# and printed messages said 300-500; one pair of constants keeps them in sync.
SIZE_RANGE_MIN = 300
SIZE_RANGE_MAX = 800

# Get the counts of records for each cluster, excluding noise points (-1)
cluster_sizes = df[df['cluster_label'] != -1]['cluster_label'].value_counts()

# --- Report the largest cluster and the share of clusters inside the band ---
if not cluster_sizes.empty:
    # Find the ID and size of the largest cluster
    largest_cluster_id = cluster_sizes.idxmax()
    largest_cluster_size = cluster_sizes.max()

    # The largest cluster is NOT dropped: every cluster goes into the histogram.
    sizes_for_plotting = cluster_sizes

    # The previous message claimed the cluster had been excluded, which the code never did.
    print(f"Largest cluster (ID: {largest_cluster_id}) has {largest_cluster_size} records; it is kept in the histogram.")

    # Share of clusters whose size lies inside the band (both ends inclusive)
    target_range_clusters = sizes_for_plotting[
        sizes_for_plotting.between(SIZE_RANGE_MIN, SIZE_RANGE_MAX)
    ]
    percentage_in_range = (len(target_range_clusters) / len(sizes_for_plotting)) * 100
    print(f"Percentage of clusters with between {SIZE_RANGE_MIN} and {SIZE_RANGE_MAX} records: {percentage_in_range:.2f}%")
    print(f"Count of clusters with between {SIZE_RANGE_MIN} and {SIZE_RANGE_MAX} records: {len(target_range_clusters)} out of {len(sizes_for_plotting)} total clusters")
else:
    print("No clusters to plot.")
    # Explicit dtype: an untyped empty Series triggers a warning on some pandas versions
    sizes_for_plotting = pd.Series(dtype='int64')

# --- Create the histogram of cluster sizes ---
if not sizes_for_plotting.empty:
    plt.figure(figsize=(14, 7))

    # Use stat='percent' to get percentage on the y-axis
    sns.histplot(data=sizes_for_plotting, binwidth=500, kde=False, color='royalblue', stat='percent')

    plt.title('Distribution of Cluster Sizes', fontsize=16)
    plt.xlabel('Cluster Size (Number of Records)', fontsize=12)
    plt.ylabel('Percentage of Clusters (%)', fontsize=12)
    plt.grid(axis='y', alpha=0.75)

    # Shade the band and mark both of its edges
    plt.axvspan(SIZE_RANGE_MIN, SIZE_RANGE_MAX, alpha=0.2, color='red')
    plt.axvline(x=SIZE_RANGE_MIN, linestyle='--', color='red', alpha=0.7)
    plt.axvline(x=SIZE_RANGE_MAX, linestyle='--', color='red', alpha=0.7)

    # Add annotation about percentage in this range
    plt.annotate(f'{percentage_in_range:.2f}% of clusters\nhave between {SIZE_RANGE_MIN}-{SIZE_RANGE_MAX} records',
                xy=(400, plt.ylim()[1]*0.9),
                xytext=(400, plt.ylim()[1]*0.9),
                ha='center',
                bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="red", alpha=0.7))

    plt.tight_layout()
    plt.show()


## Cluster Analysis and Characterization

1.  **High-Level Summary**: Summary table that provides a quick overview of every cluster, including its size, top keywords, most representative domains, and median publication date. This helps in getting a first impression of the landscape.
2.  **Identifying Emerging Topics**: Temporal trends of each cluster to identify which topics are gaining popularity over time ("emerging topics").
3.  **Deep Dive Analysis**: For specific clusters of interest, a more detailed analysis of their internal keyword trends over time.

### High-Level Cluster Summary

In [None]:
VARIABLES_FILE = "checkpoint_variables.pkl"

# Re-read the pickled column-name lists saved with the checkpoint.
# NOTE(review): pickle can execute arbitrary code on load; only open checkpoint files you trust.
with open(VARIABLES_FILE, "rb") as f:
    loaded_vars = pickle.load(f)

# Each list falls back to an empty list when its key is absent from the pickle
keyword_features, domain_features, area_features = (
    loaded_vars.get(key, [])
    for key in ('keyword_cols_created', 'unique_domains', 'unique_areas')
)

In [None]:
def identify_emerging_cluster_ids(
    df: pd.DataFrame,
    cluster_column: str = 'cluster_label',  # Default column for cluster labels
    date_column: str = 'first_date',
    recent_months_window: int = 12,
    min_papers_recent_period: int = 5,
    emerging_ratio_threshold: float = 1.5,
    emerging_diff_threshold: float = 0.001,
    newly_active_min_recent_prop: float = 0.0005
) -> List[int]:
    """
    Identify clusters whose share of publications grew in the recent period.

    Rows labelled -1 (noise) and rows with a missing cluster label or date are
    ignored. The remaining rows are split at ``max(date) - recent_months_window``
    months into a recent and a baseline period. For every cluster, its share of
    all papers in each period is compared. The function operates globally,
    without filtering for a specific domain.

    Args:
        df: Input records; must contain ``cluster_column`` and ``date_column``.
        cluster_column: Column holding the cluster label of each record.
        date_column: Column holding the record date; converted with
            ``pd.to_datetime`` when it is not already a datetime dtype.
        recent_months_window: Length of the recent period, in months.
        min_papers_recent_period: Clusters with fewer recent papers are skipped.
        emerging_ratio_threshold: Minimum recent/baseline share ratio.
        emerging_diff_threshold: Minimum recent-minus-baseline share difference.
        newly_active_min_recent_prop: Minimum recent share for a cluster that
            has no baseline papers at all ("newly active").

    Returns:
        Sorted list of emerging / newly active cluster IDs. Empty when there is
        no usable data or one of the two periods has no records.
    """
    print("\n--- Identifying Emerging Cluster IDs (Global) ---")
    emerging_cluster_ids: List[int] = []

    # --- 1. Input Validation and Data Preparation ---
    # Only the two relevant columns are copied; noise points are removed here.
    df_analysis = df.loc[df[cluster_column] != -1, [cluster_column, date_column]].copy()

    if not pd.api.types.is_datetime64_any_dtype(df_analysis[date_column]):
        df_analysis[date_column] = pd.to_datetime(df_analysis[date_column])

    # Actually drop NA rows, as the message below promises. Without this, a
    # column of missing dates made max() return NaT and crashed on strftime.
    df_analysis = df_analysis.dropna(subset=[cluster_column, date_column])

    if df_analysis.empty:
        print("  No valid clustered data found after filtering noise/NA.")
        return emerging_cluster_ids

    # --- 2. Define Time Windows ---
    max_date = df_analysis[date_column].max()
    recent_period_start_date = max_date - DateOffset(months=recent_months_window)

    print(f"  Recent period starts: {recent_period_start_date.strftime('%Y-%m-%d')}")

    is_recent = df_analysis[date_column] >= recent_period_start_date
    # Totals are taken across the whole dataset for each period
    total_papers_recent = int(is_recent.sum())
    total_papers_baseline = len(df_analysis) - total_papers_recent

    if total_papers_recent == 0 or total_papers_baseline == 0:
        print("  Not enough data in both recent and baseline periods to assess emergence.")
        return emerging_cluster_ids

    # --- 3. Per-cluster counts, computed once per period ---
    # One value_counts per period replaces a full column scan per cluster.
    recent_counts = df_analysis.loc[is_recent, cluster_column].value_counts()
    baseline_counts = df_analysis.loc[~is_recent, cluster_column].value_counts()

    # .tolist() yields native Python values, matching the List[int] annotation
    unique_clusters = sorted(df_analysis[cluster_column].unique().tolist())

    # --- 4. Compare recent vs. baseline share for every cluster ---
    for cluster_id in unique_clusters:
        papers_cluster_recent = int(recent_counts.get(cluster_id, 0))
        if papers_cluster_recent < min_papers_recent_period:
            continue

        papers_cluster_baseline = int(baseline_counts.get(cluster_id, 0))

        prop_recent = papers_cluster_recent / total_papers_recent
        prop_baseline = papers_cluster_baseline / total_papers_baseline

        is_emerging = False
        if prop_baseline == 0:
            # No baseline presence: a ratio is undefined, so use the absolute recent share
            if prop_recent >= newly_active_min_recent_prop:
                print(f"    Cluster {cluster_id}: Newly Active (Recent Prop: {prop_recent:.4f})")
                is_emerging = True
        else:
            emergence_ratio = prop_recent / prop_baseline
            emergence_difference = prop_recent - prop_baseline
            # Both the relative and the absolute growth must clear their thresholds
            if emergence_ratio >= emerging_ratio_threshold and emergence_difference >= emerging_diff_threshold:
                print(f"    Cluster {cluster_id}: Emerging (Ratio: {emergence_ratio:.2f}, Diff: {emergence_difference:.4f})")
                is_emerging = True

        if is_emerging:
            emerging_cluster_ids.append(cluster_id)

    print(f"\n  Identified {len(emerging_cluster_ids)} emerging/newly active cluster IDs: {emerging_cluster_ids}")
    # IDs were visited in sorted order and each at most once, so the list is already sorted and unique
    return emerging_cluster_ids


def plot_combined_trends(
    proportions_df: pd.DataFrame,
    emerging_ids: List[int],
    top_keywords_map: Dict[int, List[str]],
    title: str,
    default_color: str = 'grey'
):
    """
    Draw the proportion trend of every cluster on one Matplotlib figure.

    Columns of ``proportions_df`` that are neither -1 nor in ``emerging_ids``
    are drawn as thin lines in ``default_color``. Clusters in ``emerging_ids``
    are drawn on top as thicker lines, each with its own color sampled from
    the 'jet' colormap, and listed in a legend. Values are multiplied by 100
    and shown as percentages. ``top_keywords_map`` is accepted but not used by
    the current labelling.
    """
    print(f"\n--- Plotting Combined Temporal Trends with Distinct Colors ---")
    fig, ax = plt.subplots(figsize=(16, 9))
    time_axis = proportions_df.index

    # Background layer: every non-noise, non-emerging cluster
    background_ids = [
        column for column in proportions_df.columns
        if column != -1 and column not in emerging_ids
    ]
    for background_id in background_ids:
        ax.plot(time_axis, proportions_df[background_id] * 100, color=default_color, lw=1.0, alpha=0.5)

    # One evenly spaced colormap sample per emerging cluster
    n_emerging = len(emerging_ids)
    palette = cm.jet(np.linspace(0, 1, n_emerging))

    # Foreground layer: emerging clusters that exist in the table
    for line_color, cluster_id in zip(palette, emerging_ids):
        if cluster_id not in proportions_df.columns:
            continue
        ax.plot(
            time_axis,
            proportions_df[cluster_id] * 100,
            lw=2.5,
            label=f"Cluster {cluster_id}",
            color=line_color
        )

    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Time')
    ax.set_ylabel('Share of Monthly Publications (%)')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_ylim(bottom=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100.0, decimals=1))

    if emerging_ids:
        # Shrink the legend text once it holds more than 20 entries
        if n_emerging > 20:
            legend_fontsize = 'small'
        else:
            legend_fontsize = 'medium'
        ax.legend(
            title='Emerging Clusters',
            bbox_to_anchor=(1.02, 1),
            loc='upper left',
            fontsize=legend_fontsize
        )

    # Leave the right 15% of the figure free for the legend
    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.show()

def plot_emerging_trends_only(
    proportions_df: pd.DataFrame,
    emerging_ids: List[int],
    top_keywords_map: Dict[int, List[str]],
    title: str
):
    """
    Draw the proportion trends of the emerging clusters only.

    Each cluster in ``emerging_ids`` that is a column of ``proportions_df`` is
    drawn with its own color sampled from the 'jet' colormap; no other cluster
    is drawn. When ``emerging_ids`` is empty, a figure with a placeholder
    message is shown instead. ``top_keywords_map`` is accepted but not used by
    the current labelling.
    """
    print(f"\n--- Plotting Trends for Emerging Clusters Only ---")
    fig, ax = plt.subplots(figsize=(16, 9))

    n_emerging = len(emerging_ids)
    if n_emerging == 0:
        # Nothing to draw: show a placeholder message and stop
        print("  No emerging clusters to plot.")
        ax.text(0.5, 0.5, 'No emerging clusters found.', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
        plt.show()
        return

    # One evenly spaced colormap sample per emerging cluster
    palette = cm.jet(np.linspace(0, 1, n_emerging))
    time_axis = proportions_df.index

    for line_color, cluster_id in zip(palette, emerging_ids):
        if cluster_id not in proportions_df.columns:
            continue
        ax.plot(
            time_axis,
            proportions_df[cluster_id] * 100,
            lw=2.5,
            label=f"Cluster {cluster_id}",
            color=line_color
        )

    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Time')
    ax.set_ylabel('Share of Monthly Publications (%)')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    ax.set_ylim(bottom=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=100.0, decimals=1))

    # Shrink the legend text once it holds more than 20 entries
    if n_emerging > 20:
        legend_fontsize = 'small'
    else:
        legend_fontsize = 'medium'
    ax.legend(
        title='Emerging Clusters',
        bbox_to_anchor=(1.02, 1),
        loc='upper left',
        fontsize=legend_fontsize
    )

    # Leave the right 15% of the figure free for the legend
    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.show()

def plot_cluster_domain_trends_plotly(
    df: pd.DataFrame,
    cluster_id: int,
    domain_features: List[str],
    num_domains_to_plot: int = 5,
    date_column: str = 'first_date',
    cluster_column: str = 'cluster_label',
    rolling_window: int = 12
):
    """
    Show, with Plotly, how the top domains of one cluster evolve over time.

    The rows of ``cluster_id`` are selected and the ``num_domains_to_plot``
    columns of ``domain_features`` with the highest mean inside the cluster
    are picked. For each of them the monthly mean is computed, months without
    rows are filled with 0, and the series is smoothed with a centered rolling
    mean over ``rolling_window`` months before plotting as a percentage.
    Returns ``None``; prints a message and returns early when the cluster has
    no rows or no domain can be plotted.
    """
    print(f"\n--- Plotting Domain Trends for Cluster: {cluster_id} ---")

    # Rows belonging to the requested cluster (copied so a column can be added)
    cluster_rows = df[df[cluster_column] == cluster_id].copy()
    if cluster_rows.empty:
        print(f"  No data found for cluster '{cluster_id}'.")
        return

    # --- Identify the top N domains for THIS specific cluster ---
    domain_means = cluster_rows[domain_features].mean()
    domains_to_plot = (
        domain_means.sort_values(ascending=False).head(num_domains_to_plot).index.tolist()
    )
    print(f"  Top {len(domains_to_plot)} domains: {domains_to_plot}")

    # Monthly period of every row, used as the grouping key
    cluster_rows['YearMonth'] = pd.to_datetime(cluster_rows[date_column]).dt.to_period('M')
    by_month = cluster_rows.groupby('YearMonth')

    # Monthly mean of each selected domain column
    monthly_by_domain = {domain: by_month[domain].mean() for domain in domains_to_plot}
    if not monthly_by_domain:
        print("  No domain data could be processed for plotting.")
        return

    proportions_df = pd.DataFrame(monthly_by_domain)

    # Insert the months that have no rows, with a value of 0
    if not proportions_df.empty:
        first_month = proportions_df.index.min().to_timestamp()
        last_month = proportions_df.index.max().to_timestamp()
        full_date_range = pd.period_range(start=first_month, end=last_month, freq='M')
        proportions_df = proportions_df.reindex(full_date_range, fill_value=0)

    # Centered rolling mean; min_periods=1 keeps values at both ends of the series
    smoothed_proportions_df = proportions_df.rolling(
        window=rolling_window, center=True, min_periods=1
    ).mean()
    # Convert the period index to timestamps for the plot's x-axis
    smoothed_proportions_df.index = smoothed_proportions_df.index.to_timestamp()

    # One line per domain
    fig = go.Figure()
    for domain, smoothed_series in smoothed_proportions_df.items():
        fig.add_trace(go.Scatter(
            x=smoothed_proportions_df.index,
            y=smoothed_series * 100,
            mode='lines',
            name=domain,
            line=dict(width=2)
        ))

    fig.update_layout(
        title=f'Internal Domain/Area Trends for Cluster {cluster_id}',
        title_x=0.5,
        xaxis_title='Time',
        yaxis_title='Share of Papers in Cluster (%)',
        yaxis_tickformat='.1f',
        yaxis_ticksuffix='%',
        legend_title_text='Domains / Areas',
        height=600,
        hovermode='x unified'
    )
    fig.show()

def plot_cluster_area_trends_plotly(
    df: pd.DataFrame,
    cluster_id: int,
    area_features: List[str],
    num_areas_to_plot: int = 5,
    date_column: str = 'first_date',
    cluster_column: str = 'cluster_label',
    rolling_window: int = 12
):
    """
    Plot the temporal trend of the top N areas for a specific cluster using Plotly.

    The top areas are the `area_features` columns with the highest mean inside
    the selected cluster. For each of them the monthly mean is computed,
    months without any record are filled with 0, and the series is smoothed
    with a centered rolling mean before plotting.

    Args:
        df: DataFrame holding `cluster_column`, `date_column` and the area columns.
        cluster_id: Label of the cluster to plot.
        area_features: Candidate area column names; names that are not columns
            of `df` are skipped with a warning instead of raising KeyError.
        num_areas_to_plot: Number of top areas to draw.
        date_column: Column holding the record date (anything `pd.to_datetime` accepts).
        cluster_column: Column holding the cluster labels.
        rolling_window: Size, in months, of the centered rolling mean.

    Returns:
        None. The figure is shown via `fig.show()`; nothing is drawn when there
        is no data for the cluster or no usable area column.
    """
    print(f"\n--- Plotting Area Trends (Plotly) for Cluster: {cluster_id} ---")

    # Boolean indexing already returns a new object and nothing below writes
    # to it, so no explicit copy of the (wide) cluster slice is needed.
    df_filtered = df[df[cluster_column] == cluster_id]
    if df_filtered.empty:
        print(f"  No data found for cluster '{cluster_id}'.")
        return

    # Only keep the area columns that actually exist, mirroring how the other
    # helpers in this notebook treat missing feature columns.
    valid_areas = [area for area in area_features if area in df_filtered.columns]
    skipped = len(area_features) - len(valid_areas)
    if skipped:
        print(f"  Warning: {skipped} area feature(s) not found in the DataFrame and skipped.")
    if not valid_areas:
        print("  No area data could be processed for plotting.")
        return

    # --- Identify the top N areas for THIS specific cluster ---
    top_areas = df_filtered[valid_areas].mean().sort_values(ascending=False).head(num_areas_to_plot)
    areas_to_plot = top_areas.index.tolist()
    print(f"  Top {len(areas_to_plot)} areas for this cluster: {areas_to_plot}")
    if not areas_to_plot:
        print("  No area data could be processed for plotting.")
        return

    # Monthly mean of every selected area, grouped by calendar month.
    year_month = pd.to_datetime(df_filtered[date_column]).dt.to_period('M').rename('YearMonth')
    proportions_df = df_filtered[areas_to_plot].groupby(year_month).mean()
    if proportions_df.empty:
        # Every date was missing, so there is no month to plot.
        print("  No area data could be processed for plotting.")
        return

    # Reindex to a gap-free monthly range; months without records count as 0.
    full_date_range = pd.period_range(
        start=proportions_df.index.min(),
        end=proportions_df.index.max(),
        freq='M'
    )
    proportions_df = proportions_df.reindex(full_date_range, fill_value=0)

    # Smooth the proportions using a centered rolling window
    smoothed_proportions_df = proportions_df.rolling(window=rolling_window, center=True, min_periods=1).mean()
    smoothed_proportions_df.index = smoothed_proportions_df.index.to_timestamp()

    # Plot the trends using Plotly, one line per area
    fig = go.Figure()
    for area in smoothed_proportions_df.columns:
        fig.add_trace(go.Scatter(
            x=smoothed_proportions_df.index,
            y=smoothed_proportions_df[area] * 100,
            mode='lines',
            name=area,
            line=dict(width=2)
        ))

    fig.update_layout(
        title=f'Internal Area Trends for Cluster {cluster_id}<br>({rolling_window}-Month Rolling Average)',
        title_x=0.5,
        xaxis_title='Time',
        yaxis_title='Share of Papers in Cluster (%)',
        yaxis_tickformat='.1f',
        yaxis_ticksuffix='%',
        legend_title_text='Areas',
        height=600,
        hovermode='x unified'
    )
    fig.show()



In [None]:
# %% [markdown]
# ## (Corrected) Load Data for Full Analysis (Memory Optimized)
# 
# **This is the definitive block for loading data after the pipeline has been run.**
# 
# It robustly combines the computed results (clusters, coordinates) with the original features needed for analysis, without ever loading the entire combined dataset into memory at once.

# %%
# --- Define file paths ---
# Parquet file holding the final UMAP coordinates and cluster labels
RESULTS_FILE = "df_with_2D_viz_coords.parquet"
# Original data file with all metadata and keyword features
ORIGINAL_DATA_FILE = "checkpoint_with_keywords.parquet"

print("--- Loading results and features separately and merging them ---")
# NOTE: a bare `all_features` expression used to sit here, outside the try
# block. In a fresh kernel it raised NameError before any data was loaded, so
# it has been removed; this cell does not need the feature lists.
try:
    # --- Step 1: Load ONLY the computed results ---
    # Restricting `columns` keeps the read limited to the key and the results.
    results_cols = ['id', 'cluster_label', 'umap_x', 'umap_y']
    print(f"Loading computed results from '{RESULTS_FILE}'...")
    df_results = pd.read_parquet(RESULTS_FILE, columns=results_cols)
    print(f"Loaded results. Shape: {df_results.shape}")

    # --- Step 2: Load the original data features needed for analysis ---
    print(f"Loading original features from '{ORIGINAL_DATA_FILE}'...")
    df_features = pd.read_parquet(ORIGINAL_DATA_FILE)

    # Keep everything EXCEPT result columns that might also exist in the
    # original file. The whole file has already been read at this point, so
    # this is about avoiding suffixed duplicate columns in the merge below,
    # not about reducing the size of the read.
    cols_to_keep = [col for col in df_features.columns if col not in ['cluster_label', 'umap_x', 'umap_y']]
    df_features = df_features[cols_to_keep]
    print(f"Loaded features. Shape: {df_features.shape}")

    # --- Step 3: Merge the two DataFrames on the 'id' column ---
    # Left join: every row of the results is kept, features are attached by id.
    print("\nMerging results and features...")
    df = pd.merge(df_results, df_features, on='id', how='left')
    print(f"Successfully created final merged DataFrame. Shape: {df.shape}")

    # Drop the intermediate frames so only the merged `df` stays referenced
    del df_results
    del df_features

    # --- Step 4: Prepare the 'plot_df' for visualization ---
    print("\n--- Preparing data for Plotly ---")
    plot_df = df.copy()
    plot_df['cluster_label_str'] = plot_df['cluster_label'].astype(str)
    # HDBSCAN marks unclustered points with -1; show them as 'Noise'
    plot_df.loc[plot_df['cluster_label'] == -1, 'cluster_label_str'] = 'Noise'

    # Calculate cluster sizes for the legend
    cluster_sizes = plot_df['cluster_label_str'].value_counts().reset_index()
    cluster_sizes.columns = ['cluster_label_str', 'count']
    plot_df = pd.merge(plot_df, cluster_sizes, on='cluster_label_str')
    plot_df['legend_entry'] = plot_df['cluster_label_str'] + ' (' + plot_df['count'].astype(str) + ')'

    print("Plotting data is ready.")

    # --- Initialize the figure object ---
    fig = go.Figure()
    print("Plotly figure object initialized. You can now proceed with plotting and analysis.")

except FileNotFoundError as e:
    print(f"ERROR: A required file was not found: {e}")
    print("Please ensure both 'df_with_2D_viz_coords.parquet' and 'checkpoint_with_keywords.parquet' are present.")
except Exception as e:
    # Best-effort notebook cell: report the problem instead of raising.
    print(f"An error occurred while loading or preparing the data: {e}")



In [None]:
# Make sure the feature lists built by the preparation cells are in scope.
# Evaluating the names raises NameError on the first one that is missing.
try:
    all_features, meta_features, domain_features, keyword_features
except NameError:
    print("Error: Feature lists (all_features, etc.) not found. Please re-run the feature preparation cells.")
    # Fallback: rebuild the lists from the helper variables, keeping only
    # names that are actual columns of df.
    domain_features = [name for name in unique_domains if name in df.columns]
    meta_features = [name for name in metadata_features if name in df.columns]
    keyword_features = [name for name in keyword_cols_created if name in df.columns]

# Noise points carry the label -1; only real clusters are analysed below.
df_clustered = df.loc[df['cluster_label'].ne(-1)].copy()

if not df_clustered.empty:
    # One group per cluster label
    grouped = df_clustered.groupby('cluster_label')
else:
    print("No clustered data available to analyze (all points might be noise).")
# --- Define a helper function to get top features ---
def get_top_n_features(group, feature_list, n=5):
    """Return the names of the `n` features with the highest mean in `group`.

    Features that are not columns of `group` are ignored; an empty list is
    returned when none of them is present.
    """
    present = [name for name in feature_list if name in group.columns]
    if len(present) == 0:
        return []

    # Rank the per-feature means from highest to lowest and keep the first n names
    feature_means = group[present].mean()
    ranked = feature_means.sort_values(ascending=False)
    return ranked.head(n).index.tolist()

# --- Calculate summary statistics for each cluster ---
print("Calculating summary statistics for each cluster...")
summary_list = []

# Fixed column layout, so the summary frame has its columns even when there is
# no cluster at all (otherwise set_index / sort_values below raise KeyError).
summary_columns = [
    'Cluster ID',
    'Size',
    'Median Date',
    'Avg Authors',
    'Top 5 Keywords',
    'Top Domains',
    '_full_keyword_list',
]

# Get a list of all cluster labels to iterate over
all_cluster_ids = sorted(df_clustered['cluster_label'].unique())

# Group here rather than relying on `grouped`, which the previous cell only
# creates when clustered rows exist.
clusters_by_label = df_clustered.groupby('cluster_label')

for cluster_id in all_cluster_ids:
    group = clusters_by_label.get_group(cluster_id)

    # Basic stats
    size = len(group)
    median_ts = group['first_date'].median()
    # The median is NaT when the cluster has no valid date; NaT cannot be formatted.
    median_date = median_ts.strftime('%Y-%m') if pd.notna(median_ts) else 'N/A'
    avg_authors = group['number_of_authors'].mean()

    # Top keywords (remove 'kw_' prefix and use underscores for readability)
    top_keywords_raw = get_top_n_features(group, keyword_features, n=10)
    top_keywords = [kw.replace('kw_', '').replace(' ', '_') for kw in top_keywords_raw]

    # Top domains
    top_domains = get_top_n_features(group, domain_features, n=3)

    summary_list.append({
        'Cluster ID': cluster_id,
        'Size': size,
        'Median Date': median_date,
        'Avg Authors': f"{avg_authors:.2f}",
        'Top 5 Keywords': ', '.join(top_keywords[:5]),
        'Top Domains': ', '.join(top_domains),
        '_full_keyword_list': top_keywords  # Store for later use
    })

# Create the summary DataFrame
cluster_summary_df = pd.DataFrame(summary_list, columns=summary_columns)

# Create a dictionary mapping cluster ID to its full list of top keywords for later use
top_keywords_per_cluster_map = cluster_summary_df.set_index('Cluster ID')['_full_keyword_list'].to_dict()

# Drop the helper column before displaying
cluster_summary_df = cluster_summary_df.drop(columns=['_full_keyword_list'])

# Sort by size for a more organized view
cluster_summary_df = cluster_summary_df.sort_values('Size', ascending=False).reset_index(drop=True)

print("\n--- Cluster Summary Table ---")
if cluster_summary_df.empty:
    print("No clusters to summarize.")

# Show the full table. option_context restores the previous display settings
# afterwards, even if display() raises, instead of resetting them to defaults.
with pd.option_context(
    'display.max_rows', len(cluster_summary_df) + 5,
    'display.max_colwidth', 120,
):
    display(cluster_summary_df)

### Identifying Emerging Topics

In [None]:
def calculate_temporal_proportions(df, cluster_column='cluster_label', date_column='first_date', rolling_window=6):
    """Calculate the smoothed monthly proportion of all papers belonging to each cluster.

    Args:
        df: DataFrame with at least `cluster_column` and `date_column`. It is
            not modified: date parsing happens on a separate Series.
        cluster_column: Column holding the cluster labels. Every label present
            is counted, including the noise label.
        date_column: Column holding the record dates (datetime, or anything
            `pd.to_datetime` can parse).
        rolling_window: Number of rows (months with data) in the centered
            rolling mean.

    Returns:
        DataFrame indexed by month-start timestamps with one column per
        cluster label, holding the smoothed share of that month's records.
    """
    print("\n--- Calculating Temporal Proportions for All Clusters ---")

    # Parse dates on a local Series so the caller's DataFrame is left untouched.
    dates = df[date_column]
    if not pd.api.types.is_datetime64_any_dtype(dates):
        dates = pd.to_datetime(dates)

    # Only the label and the month are needed; avoid copying every feature column.
    month_and_cluster = df[[cluster_column]].assign(YearMonth=dates.dt.to_period('M'))

    # Count papers per YearMonth and cluster (including noise)
    monthly_counts = (
        month_and_cluster
        .groupby(['YearMonth', cluster_column])
        .size()
        .unstack(fill_value=0)
    )

    # Total papers per YearMonth (overall)
    total_monthly_counts = monthly_counts.sum(axis=1)

    # Share of each cluster within its month; fillna covers any 0/0 result
    proportions_df = monthly_counts.divide(total_monthly_counts, axis=0).fillna(0)

    # Smooth the proportions
    smoothed_proportions_df = proportions_df.rolling(window=rolling_window, center=True, min_periods=1).mean()

    # Convert index to timestamp for plotting
    smoothed_proportions_df.index = smoothed_proportions_df.index.to_timestamp()

    print(f"Temporal proportions calculated. Shape: {smoothed_proportions_df.shape}")
    return smoothed_proportions_df

# --- Execute the Emergence Analysis ---
# 1. Calculate smoothed monthly proportions for all clusters over time.
#    rolling_window=12 overrides the function's default of 6.
temporal_proportions = calculate_temporal_proportions(df, rolling_window=12)

# 2. Identify emerging clusters.
#    identify_emerging_cluster_ids is defined elsewhere in the notebook; the
#    exact meaning of the window and threshold arguments below should be
#    checked against its definition.
emerging_cluster_ids = identify_emerging_cluster_ids(
    df,
    recent_months_window=12,
    min_papers_recent_period=100,
    emerging_ratio_threshold=2.0,
    emerging_diff_threshold=0.001
)


In [None]:
# Draw the trend figure for all clusters from the smoothed proportions,
# passing the emerging cluster IDs and the per-cluster top keywords.
# plot_combined_trends is defined elsewhere in the notebook.
plot_combined_trends(
    proportions_df=temporal_proportions,
    emerging_ids=emerging_cluster_ids,
    top_keywords_map=top_keywords_per_cluster_map,
    title='Temporal Trends of All Clusters'
)

In [None]:
# Same inputs as the combined plot, under the emerging-topics title.
# plot_emerging_trends_only is defined elsewhere in the notebook.
plot_emerging_trends_only(
    proportions_df=temporal_proportions,
    emerging_ids=emerging_cluster_ids,
    top_keywords_map=top_keywords_per_cluster_map,
    title='Temporal Trends of Emerging Topics'
)

In [None]:
print(f"--- Generating UMAP plot for {len(emerging_cluster_ids)} emerging clusters ---")
print("Using colors consistent with the time series plots.")

# 1. Keep only the rows that belong to an emerging cluster
emerging_mask = df['cluster_label'].isin(emerging_cluster_ids)
emerging_df = df.loc[emerging_mask].copy()

# 2. One color per emerging cluster, sampled evenly from matplotlib's jet colormap
n_emerging = len(emerging_cluster_ids)
colors_rgba = cm.jet(np.linspace(0, 1, n_emerging))

# Matplotlib returns RGBA on a 0-1 scale; Plotly expects 'rgb(r, g, b)' on 0-255.
# The alpha channel is dropped and the scaled channels are truncated to integers.
rgb_channels = (colors_rgba[:, :3] * 255).astype(int)
colors_rgb_str = [f'rgb({red}, {green}, {blue})' for red, green, blue in rgb_channels]

# 3. Map each cluster ID to its color, following the order of emerging_cluster_ids
color_map = {
    label: rgb_string
    for label, rgb_string in zip(emerging_cluster_ids, colors_rgb_str)
}

# 4. Build the figure with one trace per emerging cluster, so each cluster gets
#    its own color and its own legend entry
hover_template = (
    '<b>Title:</b> %{customdata[1]}<br>'
    '<b>ID:</b> %{customdata[0]}<br>'
    '<b>Cluster:</b> %{customdata[2]}<br>'
    '<extra></extra>'
)

fig_emerging_colored = go.Figure()

for cluster_id in emerging_cluster_ids:
    cluster_data = emerging_df.loc[emerging_df['cluster_label'] == cluster_id]

    cluster_trace = go.Scattergl(
        x=cluster_data['umap_x'],
        y=cluster_data['umap_y'],
        mode='markers',
        marker=dict(color=color_map[cluster_id], size=5, opacity=0.9),
        # customdata feeds the hover box: [0] id, [1] title, [2] cluster label
        customdata=cluster_data[['id', 'title', 'cluster_label']],
        hovertemplate=hover_template,
        name=f'Cluster {cluster_id}'
    )
    fig_emerging_colored.add_trace(cluster_trace)

# 5. Titles, axis labels and size
fig_emerging_colored.update_layout(
    title='UMAP Visualization of Emerging Clusters (with Time Series Colors)',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    height=800,
    legend_title_text='Emerging Cluster ID',
)

fig_emerging_colored.show()

In [None]:
def display_overall_emerging_summary_table(
    df: pd.DataFrame,
    emerging_ids: List[int],
    keyword_features: List[str],
    domain_features: List[str],
    area_features: List[str]
):
    """
    Show one Plotly table summarizing all emerging clusters pooled together.

    The table lists the top 10 keywords, top 5 domains and top 10 areas by
    mean value over the pooled rows, with each mean expressed as a percentage.
    Nothing is drawn when `emerging_ids` is empty or matches no row.
    """
    print("\n--- Generating Overall Summary Table with Percentages for All Emerging Clusters ---")

    if not emerging_ids:
        print("No emerging clusters to summarize.")
        return

    # --- 1. Pool the rows of every emerging cluster ---
    pooled = df[df['cluster_label'].isin(emerging_ids)]
    if pooled.empty:
        print("No documents found for the given emerging cluster IDs.")
        return

    def rank_features(frame, candidates, limit=5):
        """Highest `limit` column means among the candidates present in `frame`."""
        present = [name for name in candidates if name in frame.columns]
        if not present:
            return pd.Series(dtype=float)
        return frame[present].mean().sort_values(ascending=False).head(limit)

    def to_cell_text(ranked, strip=''):
        """Join names and percentage scores with <br> so each fits one table cell."""
        if ranked.empty:
            return "", ""
        name_lines = [str(label).replace(strip, '') for label in ranked.index]
        score_lines = [f"{value * 100:.1f}" for value in ranked.values]
        return '<br>'.join(name_lines), '<br>'.join(score_lines)

    # --- 2. Rank and format each section: (row label, features, top-N, prefix to strip) ---
    sections = [
        ('<b>Top 10 Keywords</b>', keyword_features, 10, 'kw_'),
        ('<b>Top 5 Domains</b>', domain_features, 5, ''),
        ('<b>Top 10 Areas</b>', area_features, 10, ''),
    ]
    row_labels, item_cells, score_cells = [], [], []
    for row_label, features, limit, prefix in sections:
        items_text, scores_text = to_cell_text(rank_features(pooled, features, limit), prefix)
        row_labels.append(row_label)
        item_cells.append(items_text)
        score_cells.append(scores_text)

    # --- 3. Build the Plotly table ---
    table_header = dict(
        values=['<b>Characteristic</b>', '<b>Top Items</b>', '<b>Prevalence [%]</b>'],
        line_color='darkslategray',
        fill_color='#9E9E9E',
        align=['left', 'left', 'center'],
        font=dict(color='black', size=14)
    )
    table_cells = dict(
        values=[row_labels, item_cells, score_cells],
        line_color='darkslategray',
        fill_color='white',
        align=['left', 'left', 'center'],
        font=dict(color='black', size=12),
        height=40
    )
    # columnwidth values are ratios; the middle column gets the most room
    summary_table = go.Table(
        columnwidth=[30, 50, 20],
        header=table_header,
        cells=table_cells
    )
    fig = go.Figure(data=[summary_table])

    fig.update_layout(
        title_text="<b>Overall Prevalence of Features in Emerging Research</b>",
        title_x=0.5,
        height=600,
        width=900,
        margin=dict(l=20, r=20, t=50, b=20),
        plot_bgcolor="#FFFFFF",
        paper_bgcolor="#FFFFFF",
        font_color='black'
    )

    fig.show()

# Render the pooled summary table for all emerging clusters.
# NOTE(review): area_features is not checked or rebuilt by the feature-list
# guard earlier in this notebook section — confirm it is defined before
# running this cell.
display_overall_emerging_summary_table(
    df=df,
    emerging_ids=emerging_cluster_ids,
    keyword_features=keyword_features,
    domain_features=domain_features,
    area_features=area_features
)

In [None]:
def display_emerging_cluster_summary_tables(
    df: pd.DataFrame,
    emerging_ids: List[int],
    keyword_features: List[str],
    domain_features: List[str],
    area_features: List[str]
):
    """
    Show one Plotly summary table per emerging cluster.

    Each table lists the record count plus the cluster's top 5 keywords,
    top 3 domains and top 5 areas by mean value, expressed as percentages.
    The figure height is derived from the number of listed lines. Clusters
    without any row are skipped with a message.
    """
    print(f"\n--- Generating Individual Summary Tables for {len(emerging_ids)} Emerging Clusters ---")

    if not emerging_ids:
        print("No emerging clusters to display.")
        return

    def rank_features(frame, candidates, limit=5):
        """Highest `limit` column means among the candidates present in `frame`."""
        present = [name for name in candidates if name in frame.columns]
        if not present:
            return pd.Series(dtype=float)
        return frame[present].mean().sort_values(ascending=False).head(limit)

    def to_cell_text(ranked, strip=''):
        """Join names and percentage scores with <br> so each fits one table cell."""
        if ranked.empty:
            return "", ""
        name_lines = [str(label).replace(strip, '') for label in ranked.index]
        score_lines = [f"{value * 100:.1f}" for value in ranked.values]
        return '<br>'.join(name_lines), '<br>'.join(score_lines)

    # Layout constants shared by every table
    header_height = 40
    row_height = 28
    margin_height = 80  # title plus bottom margin

    for cluster_id in emerging_ids:
        members = df[df['cluster_label'] == cluster_id]
        if members.empty:
            print(f"Skipping Cluster {cluster_id}: No documents found.")
            continue

        # Rank each feature family inside this cluster
        keyword_ranks = rank_features(members, keyword_features, limit=5)
        domain_ranks = rank_features(members, domain_features, limit=3)
        area_ranks = rank_features(members, area_features, limit=5)

        record_count = len(members)
        keyword_names, keyword_scores = to_cell_text(keyword_ranks, strip='kw_')
        domain_names, domain_scores = to_cell_text(domain_ranks)
        area_names, area_scores = to_cell_text(area_ranks)

        # One line for the record count plus one line per listed item
        line_count = 1 + sum(len(ranks) for ranks in (keyword_ranks, domain_ranks, area_ranks))
        figure_height = line_count * row_height + header_height + margin_height

        table_header = dict(
            values=['<b>Characteristic</b>', '<b>Top Items</b>', '<b>Prevalence [%]</b>'],
            line_color='darkslategray',
            fill_color="#9E9E9E",
            align=['left', 'left', 'center'],
            font=dict(color='black', size=14),
            height=header_height
        )
        table_cells = dict(
            values=[
                ['Number of Records', 'Top 5 Keywords', 'Top 3 Domains', 'Top 5 Areas'],
                [record_count, keyword_names, domain_names, area_names],
                # The record-count row has no prevalence value, hence the dash
                ['-', keyword_scores, domain_scores, area_scores]
            ],
            line_color='darkslategray',
            fill_color='white',
            align=['left', 'left', 'center'],
            font=dict(color='black', size=12),
            height=row_height
        )
        cluster_table = go.Table(
            columnwidth=[30, 50, 25],
            header=table_header,
            cells=table_cells
        )
        fig = go.Figure(data=[cluster_table])

        fig.update_layout(
            title_text=f"<b>Summary for Emerging Cluster {cluster_id}</b>",
            title_x=0.5,
            height=figure_height,
            width=800,
            margin=dict(l=20, r=20, t=50, b=20),
            plot_bgcolor="#FFFFFF",
            paper_bgcolor="#FFFFFF",
            font_color='black'
        )

        fig.show()





# Render one summary table per emerging cluster.
# NOTE(review): area_features is not checked or rebuilt by the feature-list
# guard earlier in this notebook section — confirm it is defined before
# running this cell.
display_emerging_cluster_summary_tables(
    df=df,
    emerging_ids=emerging_cluster_ids,
    keyword_features=keyword_features,
    domain_features=domain_features,
    area_features=area_features
)

### Deep Dive into Specific Clusters

In [None]:
def plot_cluster_keyword_trends_plotly(
    df: pd.DataFrame,
    cluster_id: int,
    all_cluster_keywords: List[str],
    num_keywords_to_plot: int = 10,
    date_column: str = 'first_date',
    text_column: str = 'abstract',
    cluster_column: str = 'cluster_label',
    rolling_window: int = 12
):
    """Plot the temporal trend of the top N keywords for a specific cluster using Plotly.

    For each keyword the monthly mean of its `kw_` feature column is computed
    inside the cluster, months without records are filled with 0, and the
    series is smoothed with a centered rolling mean before plotting.

    Args:
        df: DataFrame holding `cluster_column`, `date_column` and the `kw_*` columns.
        cluster_id: Label of the cluster to plot.
        all_cluster_keywords: Display names of the cluster's keywords (no
            `kw_` prefix, spaces written as underscores), best first.
        num_keywords_to_plot: How many of the leading keywords to draw.
        date_column: Column holding the record date (anything `pd.to_datetime` accepts).
        text_column: Unused; kept so existing callers keep working.
        cluster_column: Column holding the cluster labels.
        rolling_window: Size, in months, of the centered rolling mean.

    Returns:
        None. The figure is shown via `fig.show()`; nothing is drawn when the
        cluster has no rows or no keyword column can be resolved.
    """
    print(f"\n--- Plotting Keyword Trends (Plotly) for Cluster: {cluster_id} ---")

    # Nothing below writes to the slice, so no copy of the wide frame is needed.
    df_filtered = df[df[cluster_column] == cluster_id]
    if df_filtered.empty:
        print(f"  No data found for cluster '{cluster_id}'.")
        return

    # Select the top keywords for the cluster
    keywords_to_plot = all_cluster_keywords[:num_keywords_to_plot]

    # Map each display keyword back to its feature column. The display name
    # had spaces replaced by underscores, which cannot be undone reliably when
    # the original column itself contains underscores, so both spellings are
    # tried: spaces restored first, then the name as given.
    keyword_columns = {}
    for keyword in keywords_to_plot:
        candidates = ('kw_' + keyword.replace('_', ' '), 'kw_' + keyword)
        column = next((name for name in candidates if name in df.columns), None)
        if column is None:
            print(f"  Warning: Keyword column '{candidates[0]}' not found. Skipping keyword '{keyword}'.")
            continue
        keyword_columns[keyword] = column

    if not keyword_columns:
        print("  No keyword data could be processed for plotting.")
        return

    # Monthly mean of every resolved keyword column, keyed by display keyword
    year_month = pd.to_datetime(df_filtered[date_column]).dt.to_period('M').rename('YearMonth')
    grouped_by_month = df_filtered.groupby(year_month)
    proportions_df = pd.DataFrame({
        keyword: grouped_by_month[column].mean()
        for keyword, column in keyword_columns.items()
    })
    if proportions_df.empty:
        # Every date was missing, so there is no month to plot.
        print("  No keyword data could be processed for plotting.")
        return

    # Reindex to a gap-free monthly range; months without records count as 0.
    full_date_range = pd.period_range(
        start=proportions_df.index.min(),
        end=proportions_df.index.max(),
        freq='M'
    )
    proportions_df = proportions_df.reindex(full_date_range, fill_value=0)

    # Smooth the proportions using a centered rolling window
    smoothed_proportions_df = proportions_df.rolling(window=rolling_window, center=True, min_periods=1).mean()
    smoothed_proportions_df.index = smoothed_proportions_df.index.to_timestamp()

    # Plot the trends using Plotly, one line per keyword
    fig = go.Figure()
    for keyword in smoothed_proportions_df.columns:
        fig.add_trace(go.Scatter(
            x=smoothed_proportions_df.index,
            y=smoothed_proportions_df[keyword] * 100,
            mode='lines',
            name=keyword,
            line=dict(width=2)
        ))

    fig.update_layout(
        title=f'Internal Keyword Trends for Cluster {cluster_id}<br>({rolling_window}-Month Rolling Average)',
        title_x=0.5,
        xaxis_title='Time',
        yaxis_title='Share of Papers in Cluster (%)',
        yaxis_tickformat='.1f',
        yaxis_ticksuffix='%',
        legend_title_text='Keywords',
        height=600,
        hovermode='x unified'
    )
    fig.show()

# %%
# --- Execute the Deep Dive ---
# Cluster selection: the first emerging cluster when there is one, otherwise
# the first row of the summary table (sorted by size, largest first),
# otherwise nothing.
if emerging_cluster_ids:
    cluster_to_analyze = emerging_cluster_ids[0]
    print(f"Analyzing the first emerging cluster: {cluster_to_analyze}")
elif cluster_summary_df.empty:
    cluster_to_analyze = None
    print("No clusters available for deep dive analysis.")
else:
    cluster_to_analyze = cluster_summary_df.iloc[0]['Cluster ID']
    print(f"No emerging clusters found. Analyzing the largest cluster: {cluster_to_analyze}")

if cluster_to_analyze is not None:
    # Look up the stored top keywords of the chosen cluster
    keywords_for_cluster = top_keywords_per_cluster_map.get(cluster_to_analyze, [])

    if not keywords_for_cluster:
        print(f"Could not find top keywords for cluster {cluster_to_analyze}.")
    else:
        plot_cluster_keyword_trends_plotly(
            df=df,
            cluster_id=cluster_to_analyze,
            all_cluster_keywords=keywords_for_cluster,
            num_keywords_to_plot=10,
            rolling_window=12
        )


In [None]:
# Pick the cluster with the same fallback chain as the keyword deep dive:
# first emerging cluster, else the first row of the summary table, else none.
# Indexing emerging_cluster_ids[0] unconditionally raised IndexError when no
# cluster was flagged as emerging and discarded the fallback choice.
if emerging_cluster_ids:
    cluster_to_analyze = emerging_cluster_ids[0]
elif not cluster_summary_df.empty:
    cluster_to_analyze = cluster_summary_df.iloc[0]['Cluster ID']
else:
    cluster_to_analyze = None

if cluster_to_analyze is not None:
    plot_cluster_domain_trends_plotly(
        df=df,
        cluster_id=cluster_to_analyze,
        domain_features=domain_features,
        num_domains_to_plot=5,  # Plotting top 5 is usually clear
        rolling_window=12
    )
else:
    print("No clusters available for domain trend analysis.")

In [None]:
# Area trends for the cluster chosen in the preceding deep-dive cells.
# cluster_to_analyze is not set in this cell, so those cells must run first.
plot_cluster_area_trends_plotly(
                df=df,
                cluster_id=cluster_to_analyze,
                area_features=area_features,
                num_areas_to_plot=5,
                rolling_window=12
            )