In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import plotly.graph_objects as go
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources requested by this notebook (each call fetches one package by name).
# 'stopwords' backs the stopwords.words('english') call in a later cell; 'punkt' and 'wordnet'
# correspond to the word_tokenize / WordNetLemmatizer imports above.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Load the course table, keep one row per course name, and order rows alphabetically by course name.
alldf = (
    pd.read_csv('processed_data/computation_courses.csv')
    .drop_duplicates('course name')
    .sort_values('course name')
)
documents = alldf['cleaned_text'].values

# Collapse every department outside the core list into a single 'OTHERS' bucket.
core_departments = ['MATH', 'STAT', 'APMTH', 'BIOSTAT', 'APCOMP']
is_core_department = alldf['course department'].isin(core_departments)
alldf['course department2'] = alldf['course department'].where(is_core_department, 'OTHERS')
departments = alldf['course department2'].values
alldf

In [None]:
# NLTK's English stop-word list; passed to CountVectorizer(stop_words=...) by the helper defined below.
stop_words = stopwords.words('english')
def get_top_words_with_department_dataframe(documents, departments, stop_words, top_n=10):
    """Rank vocabulary terms by corpus frequency with a per-department document count.

    Every statistic is derived from the same CountVectorizer term matrix, so the
    per-department columns always add up to 'Document Frequency'. (Counting
    whitespace-split tokens instead, as was done previously, disagrees with the
    vectorizer whenever a token carries punctuation or is split by its token pattern.)

    Parameters:
        documents: sequence of str
            One text per document.
        departments: sequence
            Department label of each document, parallel to `documents`.
        stop_words: list of str or str
            Forwarded to `CountVectorizer(stop_words=...)`.
        top_n: int
            Number of rows to keep after sorting.

    Returns:
        pandas.DataFrame: columns 'Word', 'Total Count', 'Document Frequency' and one
        column per distinct department (number of that department's documents that
        contain the word), sorted by 'Total Count' then 'Document Frequency',
        descending, truncated to `top_n` rows.

    Raises:
        ValueError: if `documents` and `departments` differ in length.
    """
    if len(documents) != len(departments):
        raise ValueError("Documents and departments must have the same length.")

    vectorizer = CountVectorizer(stop_words=stop_words)
    term_matrix = vectorizer.fit_transform(documents)

    # vocabulary_ maps word -> column index. Keep its iteration order so rows are
    # assembled in the same order as before (this only matters for sort ties).
    words = list(vectorizer.vocabulary_)
    columns = np.fromiter(vectorizer.vocabulary_.values(), dtype=int, count=len(words))

    # np.asarray(...).ravel() flattens the 1 x n_terms result of the sparse sums.
    total_counts = np.asarray(term_matrix.sum(axis=0)).ravel()
    presence = term_matrix > 0  # True where a document contains the term at least once
    doc_freqs = np.asarray(presence.sum(axis=0)).ravel()

    table = {
        'Word': words,
        'Total Count': total_counts[columns],
        'Document Frequency': doc_freqs[columns],
    }

    # Per-department document frequency: restrict the presence matrix to that
    # department's rows and count documents per term.
    department_labels = np.asarray(departments, dtype=object)
    for dept in sorted(set(departments)):
        in_dept = department_labels == dept
        table[dept] = np.asarray(presence[in_dept].sum(axis=0)).ravel()[columns]

    df = pd.DataFrame(table)
    return df.sort_values(by=['Total Count', 'Document Frequency'], ascending=False).head(top_n)

# Build the 100 most frequent words with their department breakdown, then renumber rows from 0.
top_words_df = get_top_words_with_department_dataframe(
    documents,
    departments,
    stop_words,
    top_n=100,
)
top_words_df = top_words_df.reset_index(drop=True)

top_words_df

In [None]:
# Number of top words per topic that are fed to the coherence score
N = 10

# Metrics collected by the topic-count sweep (one entry per candidate topic count)
coherence_scores, perplexities = [], []

# Bag-of-words features: alphabetic tokens only, English stop words removed,
# dropping terms in fewer than 10 documents or in more than 90% of them.
vectorizer = CountVectorizer(
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    min_df=10,
    max_df=0.9,
)
doc_term_matrix = vectorizer.fit_transform(documents)

# Define function to calculate coherence score
def coherence_score(topic_words, doc_term_matrix):
    """Average pairwise cosine similarity between the document-occurrence vectors of each topic's words.

    Parameters:
        topic_words: iterable of iterables of str
            The words of each topic.
        doc_term_matrix: sparse matrix
            Document-term counts whose columns are indexed by the module-level
            `vectorizer.vocabulary_`.

    Returns:
        The mean over topics of each topic's mean word-pair similarity. Topics with
        fewer than two in-vocabulary words are skipped, because they have no word
        pair to score; 0 is returned when no topic can be scored. This matches the
        `coherence_score` defined later in this notebook.
    """
    scores = []
    for words in topic_words:
        word_indices = [vectorizer.vocabulary_[w] for w in words if w in vectorizer.vocabulary_]
        if len(word_indices) < 2:
            # No pair to compare: an empty selection makes cosine_similarity fail and a
            # single word yields an empty upper triangle (NaN mean).
            continue
        sub_matrix = doc_term_matrix[:, word_indices].toarray()
        similarities = cosine_similarity(sub_matrix.T)  # word-by-word similarity
        upper_tri_indices = np.triu_indices_from(similarities, k=1)  # each unordered pair once
        scores.append(similarities[upper_tri_indices].mean())
    return np.mean(scores) if scores else 0

# Fit one LDA model per candidate topic count (3..9) and record coherence and perplexity
num_topics_range = np.arange(3, 10)
vocabulary = np.array(vectorizer.get_feature_names_out())  # column index -> term
for num_topics in num_topics_range:
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)

    # Row-normalise the topic-word weights, then take each topic's N highest-weight terms
    row_totals = lda.components_.sum(axis=1)[:, np.newaxis]
    topic_word_distributions = lda.components_ / row_totals
    top_words = np.argsort(topic_word_distributions, axis=1)[:, -N:]
    topic_top_words = vocabulary[top_words]

    # Both metrics are computed on the same matrix the model was fitted on
    coherence = coherence_score(topic_top_words, doc_term_matrix)
    perplexity = lda.perplexity(doc_term_matrix)
    coherence_scores.append(coherence)
    perplexities.append(perplexity)

    print(f"Topics: {num_topics}, Coherence: {np.round(coherence, 3)}, Perplexity: {np.round(perplexity, 1)}")

# Locate the best setting for each metric (lowest perplexity, highest coherence)
min_perplexity_idx = np.argmin(perplexities)
max_coherence_idx = np.argmax(coherence_scores)

# One panel per metric, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, values, index of best value, metric name, line colour, label of the highlighted point)
panels = [
    (axes[0], coherence_scores, max_coherence_idx, 'Coherence Score', 'b', 'Max Coherence'),
    (axes[1], perplexities, min_perplexity_idx, 'Perplexity', 'r', 'Min Perplexity'),
]
for ax, values, best_idx, metric, line_color, best_label in panels:
    ax.plot(num_topics_range, values, label=metric, marker='o', color=line_color)
    # Highlight the best point on top of the line
    ax.scatter(num_topics_range[best_idx], values[best_idx], color='green', s=100, label=best_label, zorder=5)
    ax.set_title(f"{metric} vs Number of Topics")
    ax.set_xlabel("Number of Topics")
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid()

# Adjust layout and show plot
plt.tight_layout()
plt.show()

In [None]:
# Final LDA model: 5 topics, fitted on the document-term matrix with a fixed seed
lda = LatentDirichletAllocation(n_components=5, random_state=42).fit(doc_term_matrix)

# Print each topic's ten highest-weight terms (listed from lowest to highest weight)
feature_names = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    top_term_ids = topic.argsort()[-10:]
    print(f"Topic {idx + 1}: ", list(feature_names[top_term_ids]))

In [None]:
# Get the topic distribution for each document from the current `lda` model
# (the 5-topic fit when the cells are run in order). Rows follow `doc_term_matrix`.
doc_topic_distribution = lda.transform(doc_term_matrix)  # Shape: [n_docs, n_topics]

# Find the most representative document for each topic
def find_representative_documents(doc_topic_distribution, top_n=1):
    """
    Find the most representative documents for each topic.

    Parameters:
        doc_topic_distribution: ndarray
            The topic distribution for each document (output of LDA's `transform` method),
            shaped [n_docs, n_topics].
        top_n: int
            Number of top documents to find for each topic. 0 yields empty lists.

    Returns:
        dict: A dictionary where keys are topic IDs and values are lists of document indices,
        ordered from the highest to the lowest weight for that topic.

    Raises:
        ValueError: if `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative.")

    doc_topic_distribution = np.asarray(doc_topic_distribution)
    n_topics = doc_topic_distribution.shape[1]
    representative_docs = {}

    for topic_id in range(n_topics):
        # Rank documents by descending weight first, then truncate. Slicing with
        # [-top_n:] before reversing would return every document when top_n == 0.
        ranked_docs = np.argsort(doc_topic_distribution[:, topic_id])[::-1]
        representative_docs[topic_id] = ranked_docs[:top_n].tolist()

    return representative_docs

# Pick the strongest documents per topic and preview the first 100 characters of each
top_n = 3  # Change to 2, 3, etc., for more representative documents
representative_docs = find_representative_documents(doc_topic_distribution, top_n)

for topic_id, doc_indices in representative_docs.items():
    print(f"Topic {topic_id+1}: Document indices {doc_indices}")
    for doc_index in doc_indices:
        print(documents[doc_index][:100], '----', sep='\n')

In [None]:
# Column labels for the five LDA topic weights
topic_columns = [f"topic {i}" for i in range(1, 6)]

# Attach the rounded per-document topic weights to the course table, side by side by position
rounded_doc_topic_distribution = pd.DataFrame(doc_topic_distribution).round(2)
ldf2 = pd.concat(
    [alldf.reset_index(drop=True), rounded_doc_topic_distribution],
    axis=1,
    ignore_index=True,
)
ldf2.columns = [*alldf.columns, *topic_columns]

# Substring filter on the course name (the empty pattern keeps every row), then rank by topic 5
name_filter = ldf2['course name'].str.contains('')
df = (
    ldf2[name_filter][['course name', 'Name', *topic_columns]]
    .sort_values('topic 5', ascending=False)
)
df[:20]

In [None]:
# Rows of the combined table whose original department is STAT
ldf2.loc[ldf2['course department'].eq('STAT')]

In [None]:
# Anchor words: high-probability words whose probability mass sits mostly in one topic
def get_anchor_words_exclusive(topic_word_matrix, feature_names, top_n=10, exclusivity_threshold=0.8):
    """Select, for every topic, its most probable words that are nearly exclusive to it.

    Parameters:
        topic_word_matrix: ndarray, shape [n_topics, n_words]
            Non-normalised topic-word weights.
        feature_names: sequence
            Word for each column of `topic_word_matrix`.
        top_n: int
            Maximum number of words returned per topic.
        exclusivity_threshold: float
            Minimum share of a word's summed per-topic probability that must
            belong to the topic for the word to qualify.

    Returns:
        dict: "Topic k" (1-based) -> list of qualifying words, most probable first.
    """
    # Turn each topic's weights into a probability distribution over words
    word_given_topic = topic_word_matrix / topic_word_matrix.sum(axis=1, keepdims=True)
    # Share of each word's total (over topics) probability held by each topic
    exclusivity = word_given_topic / word_given_topic.sum(axis=0)

    anchor_words = {}
    for topic_idx, (probs, scores) in enumerate(zip(word_given_topic, exclusivity)):
        ranked = probs.argsort()[::-1]  # word indices, most probable first
        qualifying = ranked[scores[ranked] >= exclusivity_threshold]
        anchor_words[f"Topic {topic_idx+1}"] = [feature_names[i] for i in qualifying[:top_n]]

    return anchor_words

# Inputs: topic-word weights of the fitted `lda` model and the terms of `vectorizer`
topic_word_matrix = lda.components_  # Shape: [n_topics, n_words]
feature_names = vectorizer.get_feature_names_out()

# Parameters
top_n = 50
exclusivity_threshold = 0.9  # minimum exclusivity score (0.9 = 90%) a word needs to qualify for a topic

anchor_words = get_anchor_words_exclusive(
    topic_word_matrix,
    feature_names,
    top_n=top_n,
    exclusivity_threshold=exclusivity_threshold,
)

# Display anchor words, one line per topic
for topic, words in anchor_words.items():
    print(topic, ', '.join(words), sep=': ')


In [None]:
# Step 2: Determine the dominant topic for each document
dominant_topics = np.argmax(doc_topic_distribution, axis=1)  # one topic id per document, in document order

# Step 3: Add dominant topic to DataFrame for reference.
# `df` was sorted by 'topic 5' when it was built, so assigning the raw array would
# attach each value to the wrong course. Align on ldf2's index labels instead
# (df's index is a reordering of those labels).
df['Dominant Topic'] = pd.Series(dominant_topics, index=ldf2.index)

# Hover labels in document order. `alldf` rows are in the same order as `documents`,
# hence as the rows of doc_topic_distribution; the sorted `df` is not.
course_names = alldf['Name'].to_numpy()

# Step 4: Reduce topic distribution to 2D using PCA for visualization
pca = PCA(n_components=2)
topic_distribution_2d = pca.fit_transform(doc_topic_distribution)  # Shape: [n_docs, 2]

# Step 5: Define a color palette for topics
num_topics = lda.n_components
colors = [f"hsl({i * 360 / num_topics}, 70%, 50%)" for i in range(num_topics)]  # HSL for distinct colors

# Step 6: Create an interactive scatter plot
fig = go.Figure()

# Add one trace per dominant topic so each topic gets its own colour and legend entry
for topic_id in range(num_topics):
    indices = dominant_topics == topic_id
    topic_label = f"Topic {topic_id + 1}"  # 1-based, like the topic printouts elsewhere in this notebook
    fig.add_trace(go.Scatter(
        x=topic_distribution_2d[indices, 0],
        y=topic_distribution_2d[indices, 1],
        mode='markers',
        name=topic_label,
        text=course_names[indices],  # Use course names for hover text
        hovertemplate=(
            "<b>Course Name:</b> %{text}<br>" +
            "<b>Topic:</b> %{meta}"  # Include topic number in hover
        ),
        meta=[topic_label] * int(indices.sum()),
        marker=dict(
            size=10,
            color=colors[topic_id],
            line=dict(width=1, color='black')
        )
    ))

# Step 7: Update layout
fig.update_layout(
    title="Topic Distribution of Documents (PCA-reduced)",
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 2",
    template="plotly",
    showlegend=True,
    width=1000,  # Width of the plot (in pixels)
    height=600   # Height of the plot (in pixels)
)

# Show the plot
fig.show()

In [None]:
# Step 2: Determine the dominant topic for each document
dominant_topics = np.argmax(doc_topic_distribution, axis=1)  # one topic id per document, in document order

# Step 3: Add dominant topic to DataFrame for reference.
# `df` was sorted by 'topic 5' when it was built, so assigning the raw array would
# attach each value to the wrong course. Align on ldf2's index labels instead
# (df's index is a reordering of those labels).
df['Dominant Topic'] = pd.Series(dominant_topics, index=ldf2.index)

# Hover labels in document order. `alldf` rows are in the same order as `documents`,
# hence as the rows of doc_topic_distribution; the sorted `df` is not.
document_names = alldf['Name'].to_numpy()

# Step 4: Reduce topic distribution to 3D using PCA for visualization
pca = PCA(n_components=3)
topic_distribution_3d = pca.fit_transform(doc_topic_distribution)  # Shape: [n_docs, 3]

# Step 5: Define a color palette for topics
num_topics = lda.n_components
colors = [f"hsl({i * 360 / num_topics}, 70%, 50%)" for i in range(num_topics)]  # HSL for distinct colors

# Step 6: Create an interactive 3D scatter plot
fig = go.Figure()

# Add one trace per dominant topic so each topic gets its own colour and legend entry
for topic_id in range(num_topics):
    indices = dominant_topics == topic_id
    topic_label = f"Topic {topic_id+1}"
    fig.add_trace(go.Scatter3d(
        x=topic_distribution_3d[indices, 0],
        y=topic_distribution_3d[indices, 1],
        z=topic_distribution_3d[indices, 2],
        mode='markers',
        name=topic_label,
        text=document_names[indices],  # Use document names for hover text
        hovertemplate=(
            "<b>Document Name:</b> %{text}<br>" +
            "<b>Topic:</b> %{meta}"  # Include topic number in hover
        ),
        meta=[topic_label] * int(indices.sum()),
        marker=dict(
            size=5,
            color=colors[topic_id],
            line=dict(width=1, color='black')
        )
    ))

# Step 7: Update layout. Points are coloured by dominant topic, so the title says so
# (it previously said "by Department").
fig.update_layout(
    title="Topic Distribution of Documents by Dominant Topic (PCA-reduced to 3D)",
    scene=dict(
        xaxis_title="Principal Component 1",
        yaxis_title="Principal Component 2",
        zaxis_title="Principal Component 3"
    ),
    template="plotly",
    showlegend=True,
    width=800,  # Width of the plot (in pixels)
    height=600,  # Height of the plot (in pixels)
    font=dict(
        size=9  # Global font size
    )
)

# Show the plot
fig.show()

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Step 2: Project the per-document LDA topic mixture onto three principal components
pca = PCA(n_components=3)
topic_distribution_3d = pca.fit_transform(doc_topic_distribution)  # Shape: [n_docs, 3]

# Step 3: One fixed colour per department group (six colours available)
department_column = alldf['course department2']
unique_departments = department_column.unique()
num_departments = len(unique_departments)
colors = [
    "#1f77b4",  # Blue
    "#ff7f0e",  # Orange
    "#2ca02c",  # Green
    "#d62728",  # Red
    "#bcbd22",  # Gold
    "#8c564b"   # Brown
]
# Hover box: document name plus the department stored in the trace's meta
hover_template = "<b>Document Name:</b> %{text}<br>" + "<b>Department:</b> %{meta}"

# Step 4: Create an interactive 3D scatter plot with one trace per department
fig = go.Figure()
for i, department in enumerate(unique_departments):
    indices = department_column == department  # boolean row selector for this department
    member_count = sum(indices)
    department_points = go.Scatter3d(
        x=topic_distribution_3d[indices, 0],
        y=topic_distribution_3d[indices, 1],
        z=topic_distribution_3d[indices, 2],
        mode='markers',
        name=department,  # legend entry
        text=alldf.loc[indices, 'Name'],  # hover text
        hovertemplate=hover_template,
        meta=[department] * member_count,
        marker={
            'size': 5,
            'color': colors[i],
            'line': {'width': 1, 'color': 'black'},
        },
    )
    fig.add_trace(department_points)

# Titles, axis labels and figure size
layout_options = {
    'title': "Topic Distribution of Documents by Department (PCA-reduced to 3D)",
    'scene': {
        'xaxis_title': "Principal Component 1",
        'yaxis_title': "Principal Component 2",
        'zaxis_title': "Principal Component 3",
    },
    'template': "plotly",
    'showlegend': True,
    'width': 800,   # pixels
    'height': 600,  # pixels
    'font': {'size': 9},
}
fig.update_layout(**layout_options)

# Show the plot
fig.show()


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Step 1: Load a pre-trained model and its matching tokenizer by name.
# Both are read as module-level globals by get_sentence_embedding below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # A lightweight, efficient sentence embedding model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Step 2: Sentence embeddings = attention-masked mean of the model's token embeddings
def get_sentence_embedding(sentences):
    """Embed `sentences` with the module-level `tokenizer` and `model`.

    The sentences are tokenized as one padded, truncated batch, passed through the
    model without gradient tracking, and the last hidden states are reduced to one
    vector per sentence by `mean_pooling`.

    Parameters:
        sentences: input accepted by the tokenizer (a string or a list of strings).

    Returns:
        The tensor produced by `mean_pooling`, one row per sentence.
    """
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients needed
    with torch.no_grad():
        token_embeddings = model(**encoded).last_hidden_state

    # Average over real tokens only; padded positions are excluded through the attention mask
    return mean_pooling(token_embeddings, encoded["attention_mask"])

def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over the positions marked 1 in the attention mask.

    Parameters:
        token_embeddings: tensor of shape [batch, tokens, dim].
        attention_mask: tensor of shape [batch, tokens]; 1 for real tokens, 0 for padding.

    Returns:
        Tensor of shape [batch, dim]. A row whose mask is all zeros yields zeros
        rather than a division by zero.
    """
    # Cast the mask to float explicitly so the small clamp floor below is applied in
    # floating point, independent of how torch.clamp promotes integer inputs.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * mask, dim=1)
    # Number of real tokens per row, floored to avoid dividing by zero
    token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return sum_embeddings / token_counts

#embeddings = get_sentence_embedding(list(documents))

# Step 4: Convert to numpy for further analysis
#embeddings_np = embeddings.numpy()
#print(embeddings_np.shape)  # Shape: [num_sentences, embedding_dim]


In [None]:
# Step 2: Reduce embeddings to 3D using PCA for visualization
def plot_embedding(embeddings=None):
    """Show a 3D PCA scatter of document embeddings, coloured by department group.

    Parameters:
        embeddings: array-like of shape [n_docs, embedding_dim], optional
            One embedding per row of the module-level `alldf`, in the same order.
            When omitted, the module-level `embeddings_np` is used, as before;
            that name must then have been defined by the caller.

    Raises:
        ValueError: if the number of embeddings differs from the number of rows in `alldf`.
    """
    if embeddings is None:
        # Backward-compatible fallback to the notebook-level variable
        embeddings = embeddings_np
    embeddings = np.asarray(embeddings)
    if len(embeddings) != len(alldf):
        raise ValueError("Embeddings and alldf must have the same number of rows.")

    # Reduce embeddings to 3D using PCA for visualization
    pca = PCA(n_components=3)
    embeddings_3d = pca.fit_transform(embeddings)  # Shape: [n_docs, 3]

    # One colour per department group; the palette is cycled if there are more groups than colours
    department_column = alldf['course department2']
    unique_departments = department_column.unique()
    colors = [
        "#1f77b4",  # Blue
        "#ff7f0e",  # Orange
        "#2ca02c",  # Green
        "#d62728",  # Red
        "#bcbd22",  # Gold
        "#8c564b"   # Brown
    ]
    fig = go.Figure()

    # Add scatter plot, grouping by course department
    for i, department in enumerate(unique_departments):
        indices = (department_column == department).to_numpy()  # positional boolean mask
        fig.add_trace(go.Scatter3d(
            x=embeddings_3d[indices, 0],
            y=embeddings_3d[indices, 1],
            z=embeddings_3d[indices, 2],
            mode='markers',
            name=department,  # Use department name as legend
            text=alldf.loc[indices, 'Name'],  # Use document names for hover text
            hovertemplate=(
                "<b>Document Name:</b> %{text}<br>" +
                "<b>Department:</b> %{meta}"  # Include department name in hover
            ),
            meta=[department] * int(indices.sum()),
            marker=dict(
                size=5,
                color=colors[i % len(colors)],
                line=dict(width=1, color='black')
            )
        ))

    # Update layout
    fig.update_layout(
        title="Document Embeddings by Department (PCA-reduced to 3D)",
        scene=dict(
            xaxis_title="Principal Component 1",
            yaxis_title="Principal Component 2",
            zaxis_title="Principal Component 3"
        ),
        template="plotly",
        showlegend=True,
        width=800,  # Width of the plot (in pixels)
        height=600,  # Height of the plot (in pixels)
        font=dict(
            size=9  # Global font size
        )
    )

    # Show the plot
    fig.show()


In [None]:
from bertopic import BERTopic
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

umap_model = UMAP(random_state=42)  # Set random seed for UMAP
# NOTE(review): apart from the seed this UMAP uses umap-learn's own defaults, not the
# configuration BERTopic builds when no umap_model is passed; confirm that is intended for model 1.


def _seeded_default_umap(seed=42):
    """Return a fresh UMAP configured like BERTopic's documented default, plus a fixed seed."""
    return UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=seed)


ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
vectorizer_model = CountVectorizer(stop_words="english")

# UMAP is stochastic, so every model gets its own seeded instance; otherwise the topics
# (including those of topic_model4, which later cells analyse) change from run to run.
# Models 2 and 4 also get separate CountVectorizer instances so that fitting one model
# does not refit the vectorizer held by the other.
topic_model1 = BERTopic(umap_model=umap_model,)
topic_model2 = BERTopic(vectorizer_model=vectorizer_model, umap_model=_seeded_default_umap())
topic_model3 = BERTopic(ctfidf_model=ctfidf_model, top_n_words = 10, calculate_probabilities = True,
                        umap_model=_seeded_default_umap())
topic_model4 = BERTopic(vectorizer_model=CountVectorizer(stop_words="english"), nr_topics = 5, top_n_words = 10,
                        umap_model=_seeded_default_umap())

topic_model1.fit(documents)
topic_model2.fit(documents)
topic_model3.fit(documents)
topic_model4.fit(documents)

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Model under evaluation: the BERTopic variant built with nr_topics = 5
topic_model = topic_model4

# Bag-of-words matrix used for the coherence calculation below (alphabetic tokens, no stop words).
# NOTE(review): this rebinds `vectorizer` and `doc_term_matrix`, the names the LDA cells above rely on.
coherence_vectorizer_options = dict(stop_words="english", token_pattern=r"(?u)\b[A-Za-z]+\b")
vectorizer = CountVectorizer(**coherence_vectorizer_options)
doc_term_matrix = vectorizer.fit_transform(documents)

# Topic assignment per document and the model's topic summary table
topics, probs = topic_model.transform(documents)
df = topic_model.get_topic_info()

# Coherence = mean pairwise cosine similarity between the occurrence vectors of a topic's words
def coherence_score(topic_words, doc_term_matrix):
    """Average, over topics, of the mean cosine similarity between each pair of topic words.

    Words are looked up in the module-level `vectorizer.vocabulary_`; words missing
    from it are ignored. Topics left with fewer than two known words are skipped.

    Parameters:
        topic_words: iterable of iterables of str
            The words of each topic.
        doc_term_matrix: sparse matrix
            Document-term counts with columns indexed by `vectorizer.vocabulary_`.

    Returns:
        The mean of the per-topic scores, or 0 when no topic could be scored.
    """
    vocabulary = vectorizer.vocabulary_
    scores = []
    for words in topic_words:
        word_indices = [vocabulary[w] for w in words if w in vocabulary]
        if len(word_indices) <= 1:
            continue  # no word pair to compare
        term_vectors = doc_term_matrix[:, word_indices].toarray().T  # one row per word
        similarities = cosine_similarity(term_vectors)
        pair_rows, pair_cols = np.triu_indices_from(similarities, k=1)  # each unordered pair once
        scores.append(similarities[pair_rows, pair_cols].mean())
    if not scores:
        return 0
    return np.mean(scores)

# Topic id -> list of (word, weight) pairs
topics = topic_model.get_topics()

# Score every topic on its own, using only the words of its (word, weight) pairs
topic_coherence_scores = {
    topic_id: coherence_score([[word for word, _ in topic_words]], doc_term_matrix)
    for topic_id, topic_words in topics.items()
}

# Attach the rounded scores to the topic summary table via its 'Topic' column
df['Coherence Score'] = df['Topic'].map(topic_coherence_scores).round(2)

# Average coherence over all listed topics, then the table itself
print(df['Coherence Score'].mean().round(2))
df

In [None]:
# Step 2: Reduce the approximate per-document topic distribution of topic_model4 to 3D using PCA.
# (The values plotted here are topic distributions, not sentence embeddings; the
# `embeddings_3d` name is kept for compatibility.)
topic_distribution, _ = topic_model4.approximate_distribution(documents)
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(topic_distribution)  # Shape: [n_docs, 3]

# Step 3: Define a color palette for unique course departments
unique_departments = alldf['course department2'].unique()
num_departments = len(unique_departments)
#colors = [f"hsl({i * 360 / num_departments}, 80%, 40%)" for i in range(num_departments)]
colors = [
    "#1f77b4",  # Blue
    "#ff7f0e",  # Orange
    "#2ca02c",  # Green
    "#d62728",  # Red
    "#bcbd22",  # Gold
    "#8c564b"   # Brown
]
fig = go.Figure()

# Add scatter plot, grouping by course department
for i, department in enumerate(unique_departments):
    indices = alldf['course department2'] == department
    fig.add_trace(go.Scatter3d(
        x=embeddings_3d[indices, 0],
        y=embeddings_3d[indices, 1],
        z=embeddings_3d[indices, 2],
        mode='markers',
        name=department,  # Use department name as legend
        text=alldf.loc[indices, 'Name'],  # Use document names for hover text
        hovertemplate=(
            "<b>Document Name:</b> %{text}<br>" +
            "<b>Department:</b> %{meta}"  # Include department name in hover
        ),
        meta=[department for _ in range(sum(indices))],
        marker=dict(
            size=5,
            color=colors[i % len(colors)],  # cycle the palette if there are more groups than colours
            line=dict(width=1, color='black')
        )
    ))

# Step 5: Update layout. The title names what is actually plotted
# (it previously said "Document Embeddings").
fig.update_layout(
    title="BERTopic Topic Distribution of Documents by Department (PCA-reduced to 3D)",
    scene=dict(
        xaxis_title="Principal Component 1",
        yaxis_title="Principal Component 2",
        zaxis_title="Principal Component 3"
    ),
    template="plotly",
    showlegend=True,
    width=800,  # Width of the plot (in pixels)
    height=600,  # Height of the plot (in pixels)
    font=dict(
        size=9  # Global font size
    )
)

# Step 6: Show the plot
fig.show()


In [None]:
def find_unique_anchor_words_bertopic(model, top_n=100):
    """
    Collect, for each topic, the words that appear among the top words of that topic only.

    Parameters:
        model: BERTopic object
            A trained model exposing `get_topics()`, which maps each topic ID to a
            list of (word, weight) pairs.
        top_n: int
            How many leading words of each topic are taken into account.

    Returns:
        dict: topic ID -> list of words found in no other topic's leading words.
        Every topic ID is present, possibly with an empty list.
    """
    topics = model.get_topics()

    # word -> set of topic IDs whose leading words include it
    topics_per_word = {}
    for topic_id, terms in topics.items():
        for term, _weight in terms[:top_n]:
            topics_per_word.setdefault(term, set()).add(topic_id)

    # Keep only the words owned by exactly one topic
    anchor_words = {topic_id: [] for topic_id in topics}
    for word, owners in topics_per_word.items():
        if len(owners) != 1:
            continue
        (only_topic,) = owners
        anchor_words[only_topic].append(word)

    return anchor_words
# Anchor words of topic_model4, considering the first 15 words of each topic (result is displayed as the cell output)
find_unique_anchor_words_bertopic(topic_model4 , top_n=15)

In [None]:
# Step 1: Reuse the topic assignments of the already fitted model.
# Calling fit_transform here would retrain topic_model4, so the breakdown below could
# describe different topics than the ones inspected in the preceding cells.
topics = topic_model4.topics_
probs = topic_model4.probabilities_

# Step 2: Add topics to DataFrame (one topic id per row of alldf, in document order)
alldf['Topic'] = topics

# Step 3: Create a breakdown of topics by department
# Group by 'course department2' and 'Topic', and count occurrences
topic_by_department = alldf.groupby(['course department2', 'Topic']).size().reset_index(name='Count')

# Step 4: Pivot to make the data more readable (departments as rows, topics as columns).
# The integer cast keeps the counts compatible with the heatmap's fmt='d'.
topic_by_department_pivot = topic_by_department.pivot_table(index='course department2',
                                                           columns='Topic',
                                                           values='Count',
                                                           aggfunc='sum',
                                                           fill_value=0).astype(int)

# Step 5: Print or visualize the breakdown
print(topic_by_department_pivot)

# Optionally: If you'd like to visualize the breakdown with a heatmap or bar plot:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.heatmap(topic_by_department_pivot, annot=True, cmap='viridis', fmt='d')
plt.title('Topic Breakdown by Department')
plt.xlabel('Topic')
plt.ylabel('Course Department')
plt.show()
