In [41]:
import os
import json
import numpy as np
import torch  # For checking if GPU is available
import time  # For time tracking
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from file_handling import FileHandler  # Import the FileHandler class
from text_processing import TextProcessor  # Import the TextProcessor class
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from utils import print_configuration


# Wall-clock start for the whole cell; the total is reported after training.
total_start_time = time.time()

# Location of the run configuration. The machine-specific path stays as the
# default so existing runs behave the same; set the BERTOPIC_ECC_CONFIG
# environment variable to run on another machine without editing the notebook.
default_config_path = r'C:\Users\nikla\OneDrive\Dokumente\winfoMaster\Masterarbeit\bertopic_ecc\config.json'
config_path = os.environ.get("BERTOPIC_ECC_CONFIG", default_config_path)

# Load configuration from config.json
print("Loading configuration...")
# JSON files are UTF-8; be explicit so the platform's locale codec is not used.
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)
print_configuration(config)

# Seed every RNG this notebook has imported, not only NumPy's global one.
# NOTE(review): whether this is enough to make the BERTopic pipeline fully
# reproducible is not visible from this cell - confirm.
random_seed = config["random_seed"]
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Run parameters taken from the loaded configuration (lookups happen in the
# same order as the keys are listed here).
index_file_ecc_folder, folderpath_ecc = (
    config["index_file_ecc_folder"],
    config["folderpath_ecc"],
)
sample_size, document_split = (
    config["sample_size"],
    config["document_split"],
)
section_to_analyze, max_documents = (
    config["section_to_analyze"],
    config["max_documents"],
)

# Build the two project helpers from those parameters: the file handler gets
# the index file path and transcript folder, the text processor gets the
# split method and the section to analyze.
print("Initializing file handler and text processor...")
file_handler = FileHandler(
    index_file_path=config["index_file_path"],
    folderpath_ecc=folderpath_ecc,
)
text_processor = TextProcessor(
    method=document_split,
    section_to_analyze=section_to_analyze,
)

# Time the sampling + section-extraction stage separately from training.
splitting_start_time = time.time()

# Create the sample and extract relevant sections
print("Reading index file and creating ECC sample...")
# NOTE(review): index_file is not used again in this cell; presumably
# read_index_file() also prepares file_handler for create_ecc_sample - confirm
# before removing the call.
index_file = file_handler.read_index_file()
ecc_sample = file_handler.create_ecc_sample(sample_size)
all_relevant_sections = text_processor.extract_all_relevant_sections(ecc_sample, max_documents)

# End splitting process time tracking
splitting_end_time = time.time()
splitting_duration = splitting_end_time - splitting_start_time
print(f"Splitting process took {splitting_duration:.2f} seconds.")

# Stop here when nothing was extracted: previously the cell only printed this
# message and then went on to fit the model on an empty document list.
if not all_relevant_sections:
    raise ValueError("No relevant sections found to fit BERTopic.")

# Documents handed to the topic model.
docs = all_relevant_sections

# Predefined topic labels used for zero-shot assignment.
# NOTE(review): in the get_topic_info() output recorded in this notebook the
# zero-shot names do not line up with their keyword representations (e.g.
# "Expenses and Costs" is described by call/turn/pleasure and "Risk and
# Forward Looking statements" by expenses/costs). Verify the label assignment
# of the installed BERTopic version before interpreting topics by name.
zeroshot_topic_list = [
    "Welcome to the Conference Call",
    "Revenue and Sales",
    "Expenses and Costs",
    "Earnings and Profit",
    "Marketing",
    "Strategy",
    "Risk and Forward Looking statements",
]

topic_model = BERTopic(
    # Take the embedding model from the configuration instead of hard-coding
    # it; the previous literal remains the fallback when the key is absent.
    embedding_model=config.get("embedding_model_choice", "all-MiniLM-L12-v2"),
    min_topic_size=50,
    zeroshot_topic_list=zeroshot_topic_list,
    # Minimum similarity to a zero-shot topic; documents that do not exceed
    # it are used for clustering instead.
    zeroshot_min_similarity=0.1,
    representation_model=KeyBERTInspired(),
)

# We fit our model using the zero-shot topics
# and we define a minimum similarity. For each document,
# if the similarity does not exceed that value, it will be used
# for clustering instead.
print("Training BERTopic model...")
training_start_time = time.time()
topics, _ = topic_model.fit_transform(docs)
training_end_time = time.time()
training_duration = training_end_time - training_start_time
# The duration was computed but never shown; report it like the other timings.
print(f"Training took {training_duration:.2f} seconds.")

# Nothing is saved in this cell, so the completion message only claims training.
print("BERTopic model training completed.")

# End total execution time tracking
total_end_time = time.time()
total_duration = total_end_time - total_start_time
print(f"Total execution time: {total_duration:.2f} seconds.")


Loading configuration...
Configuration:
index_file_ecc_folder: D:/daten_masterarbeit/
folderpath_ecc: D:/daten_masterarbeit/Transcripts_Masterarbeit_full/
model_save_path: D:/daten_masterarbeit/bertopic_model_dir
model_load_path: D:/daten_masterarbeit/bertopic_model_dir_sentences_100_zeroshot_9_topics
model_load_path_with_data: D:/daten_masterarbeit/bertopic_model_dir_regular_1729_300_l12/bertopic_model_with_data_and_docs_cpu_works
index_file_path: D:/daten_masterarbeit/list_earnings_call_transcripts.csv
embedding_model_choice: all-MiniLM-L12-v2
ecc_plots_folder: C:/Users/nikla/OneDrive/Dokumente/winfoMaster/Masterarbeit/bertopic_ecc/plots/ecc_plots
modeling_type: regular
sample_size: 8
document_split: sentences
random_seed: 43
section_to_analyze: Presentation
max_documents: 300000000
nr_topics: 20
batch_size: 64
vectorizer_model_params: {'ngram_range': [1, 3], 'stop_words': 'english', 'min_df': 0.4}
umap_model_params: {'n_neighbors': 15, 'n_components': 5, 'min_dist': 0.0, 'metric': '

In [42]:
# Overview plot of the fitted topics (BERTopic's intertopic distance map).
# Kept as the bare last expression so the notebook renders the returned figure.
topic_model.visualize_topics()

In [43]:
# Per-topic summary table; the recorded output below has the columns Topic,
# Count, Name, Representation and Representative_Docs (topic -1 is listed first).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11,-1_nitric_lung_cf_antimicrobial,"[nitric, lung, cf, antimicrobial, bronchial, o...","[Nitric oxide, or NO, is known to have potent ..."
1,0,8703,Welcome to the Conference Call,"[earnings, income, increase, revenue, segment,...","[In our lithium business, we expect to realize..."
2,1,4731,Revenue and Sales,"[customers, customer, businesses, sales, ameri...",[The second half of the year is expected to sh...
3,2,4493,Expenses and Costs,"[call, pierre, bill, turn, pleasure, note, ple...",[With that I will now turn the call back to yo...
4,3,3723,Earnings and Profit,"[statements, future, risks, uncertainties, fac...",[A list of factors that could cause actual res...
5,4,3613,Marketing,"[revenues, sales, revenue, increase, growth, p...",[For the fourth quarter of 2008 revenue in ind...
6,5,3189,Strategy,"[strategy, strategic, progress, plan, growth, ...",[We have a good combination of businesses and ...
7,6,2535,Risk and Forward Looking statements,"[expenses, expense, costs, cost, spending, pri...",[General and administrative expenses for the f...
8,7,1344,7_sapacitabine_inhibitors_inhibitor_cancers,"[sapacitabine, inhibitors, inhibitor, cancers,...","[During the quarter, we announced that first p..."
9,8,306,8_crops_quarter_soybean_fmc,"[crops, quarter, soybean, fmc, march, season, ...",[FMC Lithium will remain a reporting segment o...


In [44]:
# Bar charts of the top-scoring terms per topic, called with BERTopic's
# default arguments; kept as the bare last expression so the figure renders.
topic_model.visualize_barchart()

In [45]:
# Hierarchical view of how the fitted topics relate to each other, called
# with BERTopic's default arguments; the returned figure is the cell output.
topic_model.visualize_hierarchy()