In [1]:
from config import config
from Library import Library

# Build the library from the configured bibliography file, one keyword argument per line.
library = Library(
    config['bibliography']['file_path'],
    is_test=False,
    sample_size=None,
    source='paper',
    granularity='paragraph',
)
print('Entries in library:', len(library.entries))

Found config files: ['init.cfg', 'locals.cfg']
Missing files     : []
Library folder path: /Users/paul/Documents/FOM/MasterArbeit/Thesis/dev/data/paper/paragraph
Entries in library: 341


In [None]:
# Uncomment the next line to discard previously serialized entries first.
#library.delete_serialized_entries()
# Load the serialized entries into `library`.
# NOTE(review): the exact effect is defined in Library.deserialize, which is not visible here — confirm.
library.deserialize()

In [None]:
# Corpus-specific stop words handed to the library.
# Each term is listed once (the original list contained 'utc' twice).
custom_stopwords = [
    'data governance', 'data', 'governance', 'paper', 'research', 'policy',
    'management', 'framework', 'government',
    'utc', 'pp', 'free', 'mail', 'et', 'al', 'fig', 'ed', 'vol',
    'citation', 'publication', 'review', 'question',
    'license', 'authorized', 'restriction',
    'library', 'academic', 'service', 'librarian',
    'copy', 'permission', 'fee', 'copyright',
    'universitatsbibliothek', 'licensed',
]
library.set_stopwords(custom_stopwords)

In [None]:
#library.save_embeddings()

In [None]:
# Load the documents, their years and the stored embeddings from the library.
corpus, years, embeddings = library.load_embeddings()
# Report each length next to the name of the sequence it belongs to
# (the labels for corpus and years were previously swapped).
print('Corpus:', len(corpus), '\nYears:', len(years), '\nEmbeddings:', len(embeddings))

# Helper Function to store generated visuals

In [None]:
from datetime import datetime

def _describe_model(name, attributes):
    """Read the given attributes from the notebook-global object called ``name``.

    Args:
        name: Name of a global variable, e.g. ``'umap_model'``.
        attributes: Attribute names to read from that object.

    Returns:
        ``{attribute: value}`` in the order given, or ``None`` (after printing
        a notice) when no global with that name exists yet.
    """
    try:
        model = globals()[name]
    except KeyError:
        # Same text the former ``except NameError`` branches printed.
        print(f"Error! Name '{name}' is not defined.")
        return None
    return {attribute: getattr(model, attribute) for attribute in attributes}


def get_metadata():
    """Collect the current library and model settings for storage next to a figure.

    The function reads the notebook globals ``library``, ``umap_model``,
    ``hdbscan_model`` and ``ctfidf_model``. Models that have not been created
    yet are recorded as ``None`` so the helper can be called at any point of
    the notebook.

    Returns:
        dict: A timestamp, the library settings and one sub-dict (or ``None``)
        per model.
    """
    umap_model_metadata = _describe_model(
        'umap_model',
        ('n_neighbors', 'min_dist', 'n_components', 'metric', 'random_state'),
    )
    hdbscan_model_metadata = _describe_model(
        'hdbscan_model',
        ('min_cluster_size', 'metric', 'cluster_selection_method', 'prediction_data'),
    )
    ctfidf_model_metadata = _describe_model(
        'ctfidf_model',
        ('reduce_frequent_words', 'seed_words', 'seed_multiplier'),
    )

    return {
        # The timestamp doubles as the file name of the saved figure (see save_figure).
        'timestamp': datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f"),
        'library.is_test': library.is_test,
        'library.sample_size': library.sample_size,
        'library.library_folder_path': library.library_folder_path,
        'library.source_path': library.source_path,
        'library.granularity': library.granularity,
        'library.source': library.source,
        'library.serialized_entries_path': library.serialized_entries_path,
        'library.entries [count]': len(library.entries),
        'library.docs [count]': len(library.docs),
        'library.stopwords': library.stopwords,
        'umap_model': umap_model_metadata,
        'hdbscan_model': hdbscan_model_metadata,
        'ctfidf_model': ctfidf_model_metadata
    }

In [None]:
import os
import json
import plotly
import matplotlib
import wordcloud

from matplotlib import pyplot as plt

def save_figure(fig, name, metadata: dict):
    """Save a figure and append its metadata to the folder's JSON log.

    Files are written to ``<library.serialized_entries_path>/visuals/<name>/``.
    The figure file is named after ``metadata['timestamp']``; the metadata
    record is appended to ``visuals_metadata.json`` in the same folder.

    Args:
        fig: A matplotlib ``Figure`` (saved as PNG), a plotly ``Figure``
            (saved as HTML) or a ``WordCloud`` (rendered and saved as PNG).
        name: Sub-folder name that groups figures of the same kind.
        metadata: Record describing the run; must contain ``'timestamp'``.

    Returns:
        None. Unsupported figure types are reported and nothing is written.
    """
    visuals_path = os.path.join(library.serialized_entries_path, 'visuals', name)
    os.makedirs(visuals_path, exist_ok=True)

    file_stem = str(metadata['timestamp'])

    if isinstance(fig, matplotlib.figure.Figure):
        filepath = os.path.join(visuals_path, file_stem + '.png')
        print('Saving figure to:', filepath)
        fig.savefig(filepath)
    elif isinstance(fig, plotly.graph_objs.Figure):
        filepath = os.path.join(visuals_path, file_stem + '.html')
        print('Saving figure to:', filepath)
        fig.write_html(filepath)
    elif isinstance(fig, wordcloud.WordCloud):
        filepath = os.path.join(visuals_path, file_stem + '.png')
        print('Saving figure to:', filepath)
        # Render on a dedicated figure so repeated calls neither stack images
        # on the shared pyplot figure nor leave figures open.
        cloud_fig, cloud_ax = plt.subplots()
        try:
            cloud_ax.imshow(fig, interpolation="bilinear")
            cloud_ax.axis("off")
            cloud_fig.savefig(filepath)
        finally:
            plt.close(cloud_fig)
    else:
        print('Unknown figure type:', type(fig), 'Cannot save figure.')
        return

    # Append the metadata record to the per-folder JSON log.
    visuals_metadata_filename = os.path.join(visuals_path, 'visuals_metadata.json')

    if os.path.exists(visuals_metadata_filename):
        with open(visuals_metadata_filename, 'r') as file:
            data = json.load(file)
    else:
        data = []

    data.append(metadata)

    # Serialize BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a failing dump would otherwise wipe the existing history. Values
    # JSON cannot encode natively (sets, numpy scalars, ...) are stored as text.
    serialized = json.dumps(data, indent=4, default=str)
    with open(visuals_metadata_filename, 'w') as file:
        file.write(serialized)

# Optimal Number of Clusters

## Elbow curve
Reference: [Elbow method for the optimal value of k in KMeans (GeeksforGeeks)](https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/)

In [None]:

from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
from itertools import chain

# Elbow method: for every candidate k, record
#   * distortion - mean Euclidean distance of each embedding to its nearest centroid
#   * inertia    - value reported by the fitted model (kmeanModel.inertia_)
distortions = []
inertias = []
mapping1 = {}  # k -> distortion
mapping2 = {}  # k -> inertia

# Dense steps for small k, coarser steps for larger k.
K = list(chain(range(2, 10), range(10, 50, 5), range(50, 101, 10)))
print(K)

for k in tqdm(K):

    # Fit exactly once per k (the model was previously fitted twice) and fix
    # the seed so the curves are reproducible across runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(embeddings)

    # Compute the distortion once and reuse it for the list and the mapping.
    nearest_centroid_distance = np.min(
        cdist(embeddings, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    distortion = nearest_centroid_distance.sum() / embeddings.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)
    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# Elbow curve based on distortion.
fig_distortions, ax_distortions = plt.subplots(figsize=(7, 2))
ax_distortions.plot(K, distortions, 'bx-')
ax_distortions.set_xlabel('Values of K')
ax_distortions.set_ylabel('Distortion')
ax_distortions.set_title('The Elbow Method using Distortion')
# Save before showing: depending on the backend, plt.show() releases the figure,
# and saving afterwards can produce an empty image.
save_figure(fig_distortions, 'elbow_distortion', get_metadata())
plt.show()

In [None]:

# Elbow curve based on inertia.
fig_inertias, ax_inertias = plt.subplots(figsize=(7, 2))
ax_inertias.plot(K, inertias, 'bx-')
ax_inertias.set_xlabel('Values of K')
ax_inertias.set_ylabel('Inertia')
ax_inertias.set_title('The Elbow Method using Inertia')
# Save before showing: depending on the backend, plt.show() releases the figure,
# and saving afterwards can produce an empty image.
save_figure(fig_inertias, 'elbow_inertia', get_metadata())
plt.show()

## Silhouette Analysis

In [None]:
# Silhouette analysis

import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

for k in [2, 5, 10, 15, 20, 25]:
    # One figure per k: silhouette plot on the left, cluster scatter on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1; the plot starts at -0.1.
    ax1.set_xlim([-0.1, 1])
    # (k + 1) * 10 leaves blank space between the silhouette plots of the
    # individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(embeddings) + (k + 1) * 10])

    # Fit a fresh KMeans model for THIS k, seeded for reproducibility.
    # The loop previously reused `kmeanModel`, the last model left over from
    # the elbow cell, so every iteration evaluated the same clustering.
    clusterer = KMeans(n_clusters=k, random_state=10)
    cluster_labels = clusterer.fit_predict(embeddings)

    # Average silhouette value over all samples: a summary of the density and
    # separation of the formed clusters.
    silhouette_avg = silhouette_score(embeddings, cluster_labels)
    print(
        "For n_clusters =",
        k,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Silhouette value of each individual sample.
    sample_silhouette_values = silhouette_samples(embeddings, cluster_labels)

    y_lower = 10
    for i in range(k):
        # Collect and sort the silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plot with its cluster number at the middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next cluster's band; 10 rows of spacing in between.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Second plot: samples coloured by cluster, drawn on the first two
    # embedding dimensions.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / k)
    ax2.scatter(
        embeddings[:, 0], embeddings[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    # Centroids of the model fitted for this k.
    centers = clusterer.cluster_centers_
    # Draw white circles at the cluster centers ...
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # ... and write the cluster index into each circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % k,
        fontsize=14,
        fontweight="bold",
    )

    # Show each figure right after its printed score.
    plt.show()


# BERTopic Model

In [None]:
# Set up model for Dimensionality Reduction

from umap import UMAP

# Dimensionality reduction applied before clustering.
#   n_neighbors  - size of the local neighbourhood around each data point; smaller
#                  values give more local representations, larger values capture
#                  more global structure.
#   n_components - dimensionality of the reduced space.
umap_model = UMAP(
    n_neighbors=10,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
# Set up clustering model

from hdbscan import HDBSCAN
# Density-based clustering of the reduced embeddings.
#   min_cluster_size - smallest grouping that is still considered a cluster.
# Optional: pass prediction_data=True to construct a fuzzy membership vector
# for each data point (left disabled, as before).
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    cluster_selection_method='eom',
)

In [None]:
# from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model for the topic representations: English stop words removed,
# unigrams and bigrams, terms occurring in fewer than 4 documents ignored.
# NOTE(review): custom_stopwords is not passed here — confirm whether
# library.set_stopwords already applies it to the corpus.
vectorizer_model = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=4)



In [None]:
# Set up model for Class-based TF-IDF
from bertopic.vectorizers import ClassTfidfTransformer

# Class-based TF-IDF transformer with library defaults; get_metadata() reads its
# reduce_frequent_words, seed_words and seed_multiplier attributes.
ctfidf_model = ClassTfidfTransformer()

In [None]:
# Azure AI Client Setup (for Chat GPT usage)

from config import config
from openai import AzureOpenAI
    
# Azure OpenAI client; endpoint and key come from the project config, so no
# credentials are stored in the notebook.
client = AzureOpenAI(
    azure_endpoint=config['azure']['endpoint'],
    api_key=config['azure']['api_key'],
    api_version="2024-02-01",
)

In [None]:
from bertopic.representation import OpenAI

label_prompt =  """
                    ---- CONTEXT ----
                    You are an AI Assistant that helps people assigning topics to a collection of the following paragraphs:
                    [DOCUMENTS].

                    The topic is described by the following keywords:
                    [KEYWORDS]
                    
                    ---- TASK ----
                    The maximum word count for the topic label is 2 words. Please assign a topic.

                    ---- EXAMPLE RESULT ----
                    Topic Label
                """

# Representation model that sends label_prompt to the 'gpt-4' chat model through
# `client`, with 10 documents per topic and a 3 second delay configured.
# NOTE(review): per-argument semantics are defined by bertopic.representation.OpenAI — confirm against its docs.
label_model = OpenAI(client=client, delay_in_seconds=3, chat=True, model='gpt-4', prompt=label_prompt, nr_docs=10, tokenizer='vectorizer', )

In [None]:
# Representation models handed to BERTopic, keyed by aspect name.
representation_model = {
    'Main': None,
    #'Label': label_model   # Uncomment to generate topic labels with label_model.
                            # While this line stays commented out, the label model is NOT used.
                            # NOTE(review): later cells read topics['Label'] — confirm that column exists without this entry.
}

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the components configured in the cells above.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the corpus, passing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(corpus, embeddings=embeddings)

In [None]:
topic_model.get_topic_info()

In [None]:
principle_prompt =  """ 
                        ---- CONTEXT ----
                        You are an AI Assistant that helps people summarizing [KEYWORDS] into a short [PRINCIPLE].
                        Your advice needs to be very concise and very short. 

                        ---- TASK ----
                        Draft one Principle based on the given [KEYWORDS].

                    """

framwork_prompt = """   
                        ---- CONTEXT ----
                        You are an AI Assistant that helps people assinging a defined [LABEL] to the given keywords. Each [LABEL] is followed by a short discription. 
                        
                        ---- LABELS & DESCRIPTION ----
                        [Antecedents]: external - legal and regulatory requirements, market volaitlity, industry, country; internal - organizationa strategy, it strategy, diversification breadth, it achitecture, organizational culture, senior management support
                        [Data Scope]: traditional data - master data, transactional data, reference data; big data - web and social media data, machine generated data, streaming data, biometric data
                        [Domain Scope]: data quality, data security, data architecture, data lifecycle, meta data, data storage and infrastructure
                        [Organizational Scope]: intra-organizational - data governance on project-level / firm-level; inter-organizational - data governance between firms / ecosystems
                        [Governance Mechanisms]: structural mechanisms - roles and responsibilites, location of decision-making authority ; proecedural mechanisms - policies, standards, processes, procedures, contractual agreements, performance measurement, compliance monitoring, issue management; relational mechanisms - communication, training, coordination of decision-making
                        [Consequences]: intermediate performance effects, risk management
                        [Other] 

                        ---- RESULT ----
                        The potential [LABELS] are:
                        
                        [Antecedents]
                        [Data Scope]
                        [Domain Scope]
                        [Organizational Scope]
                        [Governance Mechanisms]
                        [Consequences]
                        [Other]

                        Each prompt should return just on of the potential labels. Anything other but the [LABEL] is prohibited.
                        An example result could be: "Data Scope"

                    """

In [None]:
# Define a function to get a response from ChatGPT.
def query_gpt(prompt, keywords, model='gpt-4'):
    """Send ``prompt`` and the flattened ``keywords`` to the chat model.

    Args:
        prompt: Text of the system message.
        keywords: Arbitrarily nested list of keywords. It is flattened and
            joined with ``', '`` to form the user message. Non-string items
            are converted with ``str`` instead of raising in ``join``.
        model: Model name passed to the API. Defaults to ``'gpt-4'``, the
            value that was previously hard-coded.

    Returns:
        The content of the first choice returned by the API.
    """

    def _flatten(items):
        # Depth-first walk; nested lists are expanded in place.
        for item in items:
            if isinstance(item, list):
                yield from _flatten(item)
            else:
                yield str(item)

    user_message = ', '.join(_flatten(keywords))

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": prompt
            },
            {
                "role": "user",
                "content": user_message
            }
        ]
    )
    return completion.choices[0].message.content

In [None]:
import re
import pandas as pd

def _clean_area(answer):
    """Strip whitespace, brackets, quotes and dots surrounding the returned label."""
    # One character-set strip handles mixed wrappers such as '"[Data Scope]".'
    return answer.strip(' \t\n[]".')


topics = topic_model.get_topic_info()

# The steps below need the GPT-generated 'Label' aspect; fail with a clear hint.
if 'Label' not in topics.columns:
    raise KeyError(
        "get_topic_info() returned no 'Label' column; enable the 'Label' entry "
        "in representation_model and refit the topic model before running this cell."
    )

# Representation: list of keywords -> one comma-separated string.
topics['Representation'] = topics['Representation'].apply(lambda x: ', '.join(x))

# Label: keep the first element of the stored label list.
topics['Label'] = topics['Label'].apply(lambda x: x[0])

# Classify each topic into a framework area based on its representation and label.
# (The original called `.strip['[']`, which subscripts the method and raises TypeError.)
topics['Area'] = topics.apply(
    lambda x: _clean_area(query_gpt(framwork_prompt, [x.Representation, x.Label])),
    axis=1,
)

# Generate a principle for every topic that was not classified as 'Other'.
topics['Principle'] = topics.apply(
    lambda x: query_gpt(principle_prompt, [x.Representation, x.Label]).strip('"') if x['Area'] != 'Other' else '',
    axis=1,
)

# Sequential row ID starting at 0, assigned by position.
# NOTE(review): if get_topic_info() lists the outlier topic -1 first, the real
# topics start at ID 1 — confirm this is the intended numbering.
topics['ID'] = list(range(len(topics)))

# Show the complete table.
pd.set_option('display.max_rows', topics.shape[0]+1)
topics


In [None]:
import os
import pandas as pd
# Make sure the folder for the result files exists (no-op when it already does).
results_dir = os.path.join(library.serialized_entries_path, 'results')
os.makedirs(results_dir, exist_ok=True)

## Commented to avoid overwriting the results
# topics = topics[topics['Topic']!=-1] # Exlude noise with topic ID -1
# topics.to_csv(
#     os.path.join(library.serialized_entries_path, 
#     'results', 'topics.csv'), sep=';', 
#     index=False
#     )

## Visualization of paragraphs per area and year

In [None]:
from collections import defaultdict
import pandas as pd
import os

# Stored topic table (topic id -> area), read from the configured results file.
loaded_topics = pd.read_csv(config['results']['topics'], sep=';')

# Map every stored topic id to its area.
topic_dict = dict(zip(loaded_topics['Topic'], loaded_topics['Area']))

# Area of each document; None when the document's topic is not in the stored table.
apd = [topic_dict.get(item) for item in topic_model.topics_]
# One (area, year) pair per document.
yapd = list(zip(apd, years))

data = yapd

# counts[area][year] -> number of documents; documents without an area are
# collected under 'Unassigned'.
counts = defaultdict(lambda: defaultdict(int))
for area, year in data:
    if area is None:
        counts['Unassigned'][year] += 1
    else:
        counts[area][year] += 1
counts

In [None]:
# Areas that are kept for the chart; anything else (e.g. 'Unassigned') is dropped.
areas = ['Other', 'Data Scope', 'Domain Scope', 'Organizational Scope', 'Governance Mechanisms', 'Antecedents', 'Consequences']

selected_counts = {}
for area_name, per_year in counts.items():
    if area_name in areas:
        selected_counts[area_name] = per_year
selected_counts

In [None]:
import numpy as np 

# Sorted tuple of all distinct years; it is the shared x-axis of every series below.
year_ax = tuple(sorted(set(years)))

# area -> array with one count per entry of year_ax (0 where the area has no
# documents in that year). Each array is built directly from year_ax, so an
# area can no longer pick up values left over from the previous area.
area_counts = {}
for area_name, area_dict in selected_counts.items():
    area_counts[area_name] = np.array([area_dict.get(year, 0) for year in year_ax])

# Total count per year over all areas. The explicit start value keeps the
# result an array of the right length even when no area was selected.
paragraph_counts = sum(area_counts.values(), np.zeros(len(year_ax), dtype=int))

In [None]:
import matplotlib.dates as mdates
# Convert the year labels to matplotlib date numbers so they can serve as the
# numeric x-values of the regression.
# NOTE(review): assumes year_ax holds strings datestr2num can parse (e.g. '2015') — confirm.
year_dt = mdates.datestr2num(year_ax)

# Degree-1 least-squares fit of the total paragraph count over time;
# reg_line evaluates the fitted line.
coefficients = np.polyfit(year_dt, paragraph_counts, 1)
reg_line = np.poly1d(coefficients)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set_style('white')

# NOTE(review): this palette is computed but not applied to the bars — confirm
# whether the bars were meant to use it.
pal = sns.color_palette("Greens_d", len(area_counts))

width = 0.5

fig, ax = plt.subplots()
# Running top of the stack: one slot per year (previously hard-coded to 15,
# which breaks as soon as the number of distinct years changes).
bottom = np.zeros(len(year_ax))

# Stack one bar series per area on top of the previous ones.
for area_name, area_count in area_counts.items():
    ax.bar(year_ax, area_count, width, label=area_name, bottom=bottom)
    bottom += area_count

# Linear trend of the total count per year.
ax.plot(year_ax, reg_line(year_dt), '--', label='Trend', color='black')

ax.set_title("Count of validated paragraphs per year and area")
ax.legend(loc="upper left", title="Area", frameon=False)

# Rotate the year labels without re-assigning tick labels (which triggers a
# matplotlib warning when the tick locations are not fixed).
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel("Year")
ax.set_ylabel("Count")

plt.show()

# Visualization

## Wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib

def create_wordcloud(model, topic):
    """Build a word cloud for one topic and draw it on the current pyplot axes.

    Args:
        model: Object whose ``get_topic(topic)`` yields ``(word, value)`` pairs.
        topic: Identifier of the topic to visualise.

    Returns:
        The generated ``WordCloud`` instance.
    """
    frequencies = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)
    # Draw without axes; displaying is left to the notebook.
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    return cloud

topic_ids = topic_model.get_topic_info()['Topic'].values

# One word cloud per topic, all stored in the 'wordcloud_topic' folder.
for topic_id in topic_ids:  # `topic_id` instead of `id`, which shadowed the builtin
    wc = create_wordcloud(topic_model, topic=topic_id)
    md = get_metadata()
    # Record which topic the image shows; the files are otherwise only told
    # apart by their timestamp. int() turns the numpy integer into a plain
    # Python int so the record stays JSON-serializable.
    md['topic_id'] = int(topic_id)
    save_figure(wc, 'wordcloud_topic', md)


## Topics

In [None]:
# Extract hierarchical topics and their representations from the fitted model
hierarchical_topics = topic_model.hierarchical_topics(corpus)

# Visualize the hierarchy; as the last expression of the cell, the returned
# figure is what the notebook displays.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Bar chart of the topics with n_words=6, shown inline and stored via save_figure.
bc = topic_model.visualize_barchart(n_words=6, )
bc.show()
save_figure(bc, 'barchart', get_metadata())

In [None]:
# Topic chart, shown inline and stored in the 'topicchart' folder.
tc = topic_model.visualize_topics()
tc.show()
save_figure(tc, 'topicchart', get_metadata())

In [None]:
# Topic heatmap, shown inline and stored in the 'heatmap' folder.
hm = topic_model.visualize_heatmap()
hm.show()
save_figure(hm, 'heatmap', get_metadata())

## Topics over Time

In [None]:
# https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html

# Topic representations over time, computed from the corpus and its years.
# NOTE(review): assumes `years` still holds one entry per corpus document — confirm.
tot = topic_model.topics_over_time(corpus, years)
# Restrict the chart to topic ids 0-19.
topic_ids = range(0, 20)
vtot = topic_model.visualize_topics_over_time(tot, topics=topic_ids)
vtot.show()
save_figure(vtot, 'topics_over_time', get_metadata())

## Document Data Map

In [None]:
# Reducing dimensionality to 2d -> Play around with n_neighbors and min_dist
# Note that these 2D embeddings are very sensitive to hyperparameters

# 2-D projection used only for plotting.
#   n_neighbors  - size of the local neighbourhood around each data point; smaller
#                  values give more local representations, larger values capture
#                  more global structure.
#   n_components - dimensionality of the reduced space (2 for a flat map).
umap_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_embeddings = umap_2d.fit_transform(embeddings)


In [None]:
from matplotlib import pyplot as plt
# Combine the 2-D coordinates with the topic assigned to each document
df = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
df['topic'] = pd.Series(topic_model.topics_)

# Visualize static clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = df.loc[df.topic <= 0, :] # Topics -1 and 0 (described as the two largest clusters) are considered not distinct enough and therefore removed from the visualization.
clustered = df.loc[df.topic >0, :]

# Optional: draw the removed points in grey underneath the clusters.
#plt.scatter(outliers.x, outliers.y, color='#BEBEBE', s=.5) # Grey color for outliers
plt.scatter(clustered.x, clustered.y, c=clustered.topic, s=5, cmap=plt.cm.viridis)
plt.colorbar(ticks=range(len(set(clustered.topic))+1), label='Cluster')
plt.axis('off')
# Colour limits run from -0.5 to (number of plotted topics + 0.5).
plt.clim(-0.5, (len(set(clustered.topic))+.5))

In [None]:
# Create a dictionary from two columns
labels = topics.set_index('Topic')['Label'].to_dict()

In [None]:
topic_sizes = pd.Series(topic_model.topics_).value_counts()
topic_sizes.reset_index()

In [None]:
# Topic ids that are hidden in the data map plot.
filter_names = ['download', 'classroom', 'article', 'food', '- hide -']
flter_ids = [8,57]

filter_topics = (
    {-1}                                                                  # outliers
    | set(topic_sizes[topic_sizes > 2000].index)                          # more than 2000 documents
    | set(topic_sizes[topic_sizes < 70].index)                            # less than 70 documents
    | {key for key, value in labels.items() if value in filter_names}     # specific names
    | set(flter_ids)                                                      # hand-picked ids
)
filter_topics

In [None]:
# Pair every document's topic id with its label; filtered topics get '- hide -'.
simplified_topics = [
    (topic_id, '- hide -' if topic_id in filter_topics else labels[topic_id])
    for topic_id in pd.Series(topic_model.topics_)
]

# Print sample of simplified topics
simplified_topics[1:100:10]

In [None]:
import numpy as np 

def drop_hidden_topics(embeddings, topics):
    """Remove every document whose topic was marked as '- hide -'.

    The function was previously named ``filter_topics``, which rebound the
    name of the ``filter_topics`` *set* built earlier in the notebook; after
    running this cell, re-running the cell that reads that set failed.

    Args:
        embeddings: One 2-D coordinate row per document.
        topics: One ``(topic_id, keyword)`` pair per document, aligned with
            ``embeddings``.

    Returns:
        tuple: ``(stacked_embeddings, labels)`` where labels look like
        ``'<topic_id + 1>: <keyword>'``. With nothing left to show, an empty
        array and an empty list are returned instead of raising in ``np.vstack``.
    """
    kept_embeddings = []
    kept_labels = []

    for embedding, (topic_id, keyword) in zip(embeddings, topics):
        if keyword == '- hide -':
            continue
        # Add 1 to the topic id so numbering starts from 1 after omitting -1 for outliers.
        kept_labels.append(': '.join([str(topic_id + 1), keyword]))
        kept_embeddings.append(embedding)

    if not kept_embeddings:
        return np.empty((0, *np.asarray(embeddings).shape[1:])), kept_labels

    return np.vstack(kept_embeddings), kept_labels


# For visualization purposes, drop the topics that are outliers or most likely not relevant.
filtered_umap_embeddings, selected_topics = drop_hidden_topics(
    umap_embeddings,
    simplified_topics,
)

In [None]:
# Creating a data map plot in WordCloud style using datamapplot-library
import datamapplot
import matplotlib.cm as cm
wmp, ax = datamapplot.create_plot(
    filtered_umap_embeddings, 
    selected_topics,
    noise_color='white',
    noise_label='- hide -',
    label_over_points=True,
    # dynamic_label_size=True,
    # dynamic_label_size_scaling_factor=0.5,
    # max_font_size=12,
    # min_font_size=4,
    # max_font_weight=200,
    # min_font_weight=100,
    darkmode=False,
    color_label_text=True,
    font_family='Helvetica',
    cmap=cm.viridis,
)


In [None]:
save_figure(wmp, 'datamapplot', get_metadata())

## Hierarchical Document Clusters

In [None]:
topic_model.visualize_hierarchy()

# Library Stats

In [None]:
from collections import Counter

# Number of library entries that carry each field, counted in a single pass
# (previously every distinct key triggered a full scan over all entries).
result = dict(Counter(key for entry in library.entries for key in entry.fields.keys()))

# Distinct field names, kept for later cells.
keys = list(result)
result

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

# One year per library ENTRY. Stored under its own name: assigning it to `years`
# replaced the list returned by load_embeddings(), which other cells pass to the
# topic model together with `corpus`.
entry_years = [entry.fields['year'] for entry in library.entries]

# Count the number of documents per year
document_count = dict(Counter(entry_years))

# Same counts, ordered by year for plotting.
count_sorted = dict(sorted(document_count.items()))


In [None]:
# Bar chart: number of documents per year.
dpy, ax = plt.subplots()
ax.bar(count_sorted.keys(), count_sorted.values())

# Axis labels and title set in one call; year labels rotated for readability.
ax.set(
    xlabel='Year',
    ylabel='Document Count',
    title='Number of Documents per Year',
)
ax.tick_params(axis='x', rotation=90)

# Show the plot
dpy.show()

In [None]:
save_figure(dpy, 'document_count_per_year', get_metadata())