In [5]:
import re
import ContentLibrary as cl
from pdfminer.high_level import extract_text

library = cl.Library('/Users/paul/Desktop/FOM_MSc_Thesis.bib')


In [6]:
class Document():

    def __init__(self, entry):
        base_path = '/Users/paul/Zotero/storage/' 
        self.entry = entry
        self.title = self.entry.fields['title']
        self.fields = self.entry.fields.keys()
        self.is_valid = False
        if 'file' in self.fields:
           self.file = self.entry.fields['file'].split(base_path)[1].split(':')[0]
        else:
            self.file = ''
            self.text = None
        self.is_pdf = bool(re.search('.pdf', self.file))
        if self.is_pdf:
            self.text = extract_text(base_path + self.file)
            self.is_valid = True


In [7]:
texts = [Document(entry).text for entry in library.entries if Document(entry).is_valid]
titles = [Document(entry).title for entry in library.entries if Document(entry).is_valid]

In [12]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import download
download('punkt')
sentences = [sent_tokenize(text) for text in texts]
sentences = [sentence for doc in sentences for sentence in doc]

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/paul/nltk_data'
    - '/Users/paul/Documents/FOM/MasterArbeit/Thesis/dev/TopGov/venv/nltk_data'
    - '/Users/paul/Documents/FOM/MasterArbeit/Thesis/dev/TopGov/venv/share/nltk_data'
    - '/Users/paul/Documents/FOM/MasterArbeit/Thesis/dev/TopGov/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

In [None]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)


In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [None]:
# import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# # GPT-3.5
# client = openai.OpenAI(api_key="sk-...")
# prompt = """
# I have a topic that contains the following documents: 
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]

# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic: <topic label>
# """
# openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model,
    "POS": pos_model
}

In [None]:
from bertopic import BERTopic

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(texts, embeddings)

# Show topics
topic_model.get_topic_info()

In [None]:
# # Label the topics yourself
# topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})

# or use one of the other topic representations, like KeyBERTInspired
keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
topic_model.set_topic_labels(keybert_topic_labels)

# # or ChatGPT's labels
# chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
# chatgpt_topic_labels[-1] = "Outlier Topic"
# topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# `topic_distr` contains the distribution of topics in each document
topic_distr, _ = topic_model.approximate_distribution(texts, window=8, stride=4)


In [None]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_distribution(topic_distr[texts], custom_labels=True)


In [None]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(texts[abstract_id], calculate_tokens=True)

# Visualize the token-level distributions
df = topic_model.visualize_approximate_distribution(texts[abstract_id], topic_token_distr[0])

In [None]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(abstracts, topics)

# Reduce outliers with pre-calculate embeddings instead
new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="embeddings", embeddings=embeddings)


In [None]:
# Visualize topics with custom labels
topic_model.visualize_topics(custom_labels=True)

# Visualize hierarchy with custom labels
topic_model.visualize_hierarchy(custom_labels=True)


In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)


In [None]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
