In [None]:
from datetime import datetime
import glob
import pandas as pd
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#functions
def createTwDB():
    """
    Creates a consolidated DataFrame from multiple CSV files containing tweet data,
    and saves the resulting DataFrame to a CSV file.

    Every file matching "savedTweets/*/*.csv" is read as a header-less,
    tab-separated table. The files are processed in sorted path order so that
    the row order of the consolidated file is the same on every run.

    Returns:
    str: The filename of the saved CSV file containing all the tweets.

    Raises:
    ValueError: If no file matches "savedTweets/*/*.csv".
    """
    #glob returns paths in arbitrary (filesystem-dependent) order -> sort for a stable result
    #the output file lives directly in savedTweets/, so this pattern never picks it up again
    all_files = sorted(glob.glob("savedTweets/*/*.csv"))
    if not all_files:
        #pd.concat would otherwise fail with the unhelpful "No objects to concatenate"
        raise ValueError("No tweet files found matching 'savedTweets/*/*.csv'")

    #create df from all files
    li_tw_files = [
        pd.read_csv(f, index_col=None, header=None, sep="\t")
        for f in all_files
    ]
    df_tw = pd.concat(li_tw_files, axis=0, ignore_index=True)
    df_tw.columns = ['id', 'user', 'created_at', 'source', 'in_reply_to_status_id', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'retweet_count', 'text']
    df_tw = df_tw.set_index("id")

    #save df
    filename = "savedTweets/db_all_tweets.csv"
    df_tw.to_csv(filename, sep="\t", encoding="utf-8")

    return filename


def extract_hashtags(text):
    """
    Extracts hashtags from the given text.

    Args:
    text (str): The text from which hashtags will be extracted. Non-string
    values (e.g. None or the NaN pandas uses for missing text) are treated
    as text without hashtags.

    Returns:
    list: A list of hashtags (including the leading '#') found in the given
    text, in order of appearance. Empty if there are none.
    """
    #missing tweet text is not a str; re.findall would raise a TypeError on it
    if not isinstance(text, str):
        return []

    return re.findall(r'#\w+', text)

In [None]:
#create tweet-database: consolidate the raw tweet files into one CSV and keep the path of that file
csv_tweets = createTwDB()

In [None]:
#create dataframe from tweet-database (tab-separated, utf-8 -- the format createTwDB writes)
df_all_tweets = pd.read_csv(csv_tweets, sep = '\t', encoding = 'utf-8')
#preview the first rows
df_all_tweets.head()

In [None]:
#select relevant columns and preprocess text
#.copy() makes df_tweets an independent frame: the column assignments below (and in
#later cells) would otherwise act on a slice of df_all_tweets and trigger pandas'
#SettingWithCopyWarning
df_tweets = df_all_tweets[['id','user','text']].copy()
#lower-case the text so that hashtags/words are counted case-insensitively
df_tweets['text'] = df_tweets['text'].str.lower()
df_tweets

In [None]:
#number of tweets per user, most active users first
tweets_per_user = df_tweets.groupby('user')[['id']].count()
df_tweets_count = tweets_per_user.sort_values(by='id', ascending=False)
df_tweets_count.head(10)

In [None]:
#extract hashtags (one list of hashtags per tweet)
df_tweets['hashtags'] = df_tweets['text'].apply(extract_hashtags)
#flatten the per-tweet lists into one long list of hashtags
li_hashtags = [tag for tags in df_tweets['hashtags'] for tag in tags]
df_hashtags = pd.DataFrame(li_hashtags, columns=['hashtag'])

#count hashtags, most frequent first
df_hashtags_count = df_hashtags.groupby('hashtag').size().sort_values(ascending=False)
#leave out the show's own hashtag; errors='ignore' keeps the cell from failing
#with a KeyError when '#imzentrum' does not occur in the data
df_hashtags_count = df_hashtags_count.drop('#imzentrum', errors='ignore')
df_hashtags_count.head(10)

**Brief discussion on the implications for Topic Modeling:**

- Given the prominence of health-related hashtags, it is reasonable to expect that health-related topics, especially those related to COVID-19, may dominate the discussions.
- Political topics, particularly those associated with the Austrian People's Party (#övp) and specific figures like August Wöginger, may also be significant. ORF (Austrian Broadcasting Corporation) hashtags such as #zib2 appear because ZIB2 is the news programme broadcast immediately before Im Zentrum.
- The variety of hashtags suggests a diverse range of topics, which could make topic modeling more challenging.

-> This leads to the decision to remove hashtags and mentions from the text.

# Topic modelling

In [None]:
#download stopwords
nltk.download('stopwords')
german_stopwords = nltk.corpus.stopwords.words('german')

#add elements to stopwords (URL fragments and the show's own name)
url_elements = ["co", "https co", "https", "http", "www", "imzentrum"]
german_stopwords.extend(url_elements)

#delete hashtags and mentions from text
#regex=True is required: since pandas 2.0 str.replace treats the pattern as a
#literal string by default, so r'@\S+' / r'#\S+' would never match anything
df_tweets['text'] = df_tweets['text'].str.replace(r'@\S+', '', regex=True)
df_tweets['text'] = df_tweets['text'].str.replace(r'#\S+', '', regex=True)

In [None]:
# Define the model
# Multilingual sentence embeddings for the tweet texts
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# random_state fixes UMAP's stochastic layout so that repeated fits produce the
# same clusters and topic ids; later cells refer to topics by their id.
# (A fixed seed makes UMAP run single-threaded, i.e. somewhat slower.)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# German stopwords (plus the custom additions) are removed from the topic keywords
vectorizer_model = CountVectorizer(stop_words=german_stopwords)
ctfidf_model = ClassTfidfTransformer()

# Create the model
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  calculate_probabilities=True,
  verbose=True,
  language="german"
)

In [None]:
# Fit the model to data
# Only the per-document topic assignment is kept; the second return value
# (the topic probabilities, per the BERTopic API) is discarded.
topics, _ = topic_model.fit_transform(df_tweets['text'])

In [None]:
#similarity heatmap of the topics (top_n_topics=30 limits the plot to 30 topics)
topic_model.visualize_heatmap(top_n_topics=30)

In [None]:
#similarity matrix: pairwise cosine similarity of the topics' c-TF-IDF vectors
topic_labels = list(topic_model.topic_labels_.values())
sim_matrix = cosine_similarity(topic_model.c_tf_idf_)

#long format (one row per topic pair), most similar pairs first
df_sim = (
    pd.DataFrame(sim_matrix, index=topic_labels, columns=topic_labels)
    .stack()
    .reset_index()
    .sort_values(0, ascending=False)
)
df_sim.columns = ['Topic 1', 'Topic 2', 'Similarity']

#keep only pairs of two different topics whose similarity exceeds 0.65
is_other_topic = df_sim['Topic 1'] != df_sim['Topic 2']
is_similar = df_sim['Similarity'] > 0.65
df_sim = df_sim[is_other_topic & is_similar]
df_sim

In [None]:
#merge topics: each inner list is a group of topic ids that is merged into one topic
#NOTE(review): the ids are hard-coded, so they are only meaningful for the fitted model
#they were read from -- re-check them against the similarity table after every refit
topics_to_merge = [[150,20],[135,29]]
topic_model.merge_topics(df_tweets['text'], topics_to_merge)

In [None]:
# Further reduce topics: let BERTopic reduce the model with a target of nr_topics=20
topic_model.reduce_topics(df_tweets['text'], nr_topics=20)

In [None]:
# Show the topic overview table of the model after merging and reduction
topic_model.get_topic_info()

In [None]:
# Visualize topics (BERTopic's built-in topic plot)
topic_model.visualize_topics()

In [None]:
# Visualize topics as a barchart (top_n_topics=10 limits the chart to 10 topics)
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
#topic overview ordered by number of tweets per topic, largest first
df_topics = topic_model.get_topic_info().sort_values('Count', ascending=False)
df_topics.head(5)

The provided topic modeling results showcase the most prevalent topics derived from the tweets related to the Austrian public TV show "Im Zentrum" and the hashtag #imzentrum during the specified timeframe. Let's briefly discuss each identified topic:

1. **Topic 1: Political Representation and Pandemic Response**
   - Keywords: ['rt', 'petrovic', 'övp', 'pandemie', 'kurz', 'heute', 'orfimzentrum', 'warum', 'politik', 'arminwolf']
   - Representative Documents: A sample includes tweets from Armin Wolf discussing the mandatory smallpox vaccination and political aspects.
   - Interpretation: This topic revolves around political representation, with a focus on figures like Petrovic and ÖVP, and discussions about the pandemic response. Armin Wolf's involvement suggests discussions on the political landscape and decision-making during the pandemic.

2. **Topic 2: Criticism of ORF and Impfpflicht (Vaccination Obligation)**
   - Keywords: ['rt', 'orf', 'ja', 'schon', 'immer', 'mehr', 'heute', 'impfpflicht', 'reiterec', 'frau']
   - Representative Documents: Tweets expressing criticism toward ORF, questioning the need for certain guests, and discussing the topic of vaccination obligation.
   - Interpretation: This topic reflects public sentiments critical of ORF's choices in guests and discussions surrounding the vaccination obligation. There seems to be disagreement or dissatisfaction with the show's content.

3. **Topic 3: Long COVID and Neurological Perspectives**
   - Keywords: ['neurostingl', 'milde', 'verläufe', 'neurologe', 'long', 'covid', 'longcovid', 'asymptomatis', 'nutzen', 'langzeitfolgen']
   - Representative Documents: Tweets featuring a neurologist emphasizing the long-term effects of mild COVID-19 cases and the impact on asymptomatic individuals.
   - Interpretation: This topic centers around the neurological aspects of COVID-19, particularly long COVID. The discussion includes insights from a neurologist and highlights the importance of considering long-term consequences, even in mild cases.

4. **Topic 4: Critique of Virus Response and Impfpflicht (Vaccination Obligation)**
   - Keywords: ['impfpflicht', 'impfung', 'virus', 'rt', 'verschiedene', 'niemanden', 'schützt', 'lieber', 'beschäftig', 'überrascht']
   - Representative Documents: Tweets critiquing the response to the virus, questioning the effectiveness of various vaccines, and criticizing the focus on vaccination obligation.
   - Interpretation: This topic involves skepticism and criticism related to the virus response, vaccine efficacy, and the idea of vaccination obligation. The tweets suggest a discussion around the perceived shortcomings in addressing the pandemic.