In [12]:
from datetime import datetime
import glob
import pandas as pd
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
#functions
def createTwDB():
    """
    Creates a consolidated DataFrame from multiple CSV files containing tweet data,
    and saves the resulting DataFrame to a CSV file.

    Returns:
    str: The filename of the saved CSV file containing all the tweets.
    """
    all_files = glob.glob("savedTweets/*/*.csv")

    #ceate df from all files
    li_tw_files = []
    for f in all_files:
        df = pd.read_csv(f, index_col=None, header=None, sep="\t")
        li_tw_files.append(df)
    df_tw = pd.concat(li_tw_files, axis=0, ignore_index=True)
    df_tw.columns = ['id', 'user', 'created_at', 'source', 'in_reply_to_status_id', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'retweet_count', 'text']
    df_tw = df_tw.set_index("id")

    #save df
    filename = "savedTweets/db_all_tweets.csv"
    df_tw.to_csv(filename, sep="\t", encoding="utf-8")

    return filename


def extract_hashtags(text):
    """
    Extracts hashtags from the given text.

    Args:
    text (str): The text from which hashtags will be extracted.

    Returns:
    list: A list of hashtags found in the given text.
    """

    return re.findall(r'#\w+', text)

In [14]:
#create tweet-database
csv_tweets = createTwDB()

In [15]:
#create dataframe from tweet-database
df_all_tweets = pd.read_csv(csv_tweets, sep = '\t', encoding = 'utf-8')
df_all_tweets.head()

Unnamed: 0,id,user,created_at,source,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,retweet_count,text
0,1457452860258520000,PaulSchmidinger,2021-11-07 21:00:14+00:00,Twitter Web App,,,,0,Gleich geht's los! #ImZentrum
1,1457452886959480000,PaulSchmidinger,2021-11-07 21:00:21+00:00,Twitter Web App,,,,0,Gleich geht's los mit ImZentrum
2,1457454027030020000,ExplainSimple,2021-11-07 21:04:52+00:00,Twitter Web App,,222435529.0,ORFImZentrum,0,@ORFImZentrum Welchen radikalen Wandel es welt...
3,1457455881650670000,YveBiskupska,2021-11-07 21:12:15+00:00,Twitter for Android,,,,0,Club of Rome ... das des no wer kennt... Heute...
4,1457457694214860000,StefanKaineder,2021-11-07 21:19:27+00:00,Twitter for iPhone,,,,0,Jetzt geht’s los! Heute #imzentrum mit @reiter...


In [16]:
#select relevant columns and preprocess text
df_tweets = df_all_tweets[['id','user','text']]
df_tweets['text'] = df_tweets['text'].str.lower()
df_tweets

Unnamed: 0,id,user,text
0,1457452860258520000,PaulSchmidinger,gleich geht's los! #imzentrum
1,1457452886959480000,PaulSchmidinger,gleich geht's los mit imzentrum
2,1457454027030020000,ExplainSimple,@orfimzentrum welchen radikalen wandel es welt...
3,1457455881650670000,YveBiskupska,club of rome ... das des no wer kennt... heute...
4,1457457694214860000,StefanKaineder,jetzt geht’s los! heute #imzentrum mit @reiter...
...,...,...,...
16407,1493088225870684166,AmsHeier,selbst als schwerst süchtiger polittalk-junkie...
16408,1493080172240719874,HombergEv,"rt @dieraffa: anschober: „es wäre schade, wenn..."
16409,1493050211354267651,Kei_Neoption,„die nächsten wochen werden entscheidend sein“...
16410,1493046589774090241,Retseflis,"rt @shirleyinaktiv: #gartlehner meint, dass ma..."


In [17]:
#count tweets per user	
df_tweets_count = df_tweets[['id','user']].groupby('user').count().sort_values(by='id',ascending=False)
df_tweets_count.head(10)

Unnamed: 0_level_0,id
user,Unnamed: 1_level_1
KnutOgris,211
ORFImZentrum,155
drakkalas,151
TraudePinter,117
Mhs_ThatFace,104
marcus_didius,86
grafkaroly,84
reiterec,81
uebersleben,81
zuschoen,77


In [18]:
#extract hashtags
df_tweets['hashtags'] = df_tweets['text'].apply(extract_hashtags)
li_hashtags=[item for sublist in df_tweets['hashtags'].tolist() for item in sublist]
df_hashtags = pd.DataFrame(li_hashtags, columns=['hashtag'])

#count hashtags
df_hashtags_count = df_hashtags.groupby('hashtag').size().sort_values(ascending=False)
df_hashtags_count = df_hashtags_count.drop('#imzentrum')
df_hashtags_count.head(10)

hashtag
#longcovid      298
#zib2           268
#petrovic       258
#impfpflicht    235
#orf            169
#övp            152
#wöginger       108
#reich          102
#im              99
#sideletter      96
dtype: int64

**Brief discussion on the implications for Topic Modeling:**

- Given the prominence of health-related hashtags, it is reasonable to expect that health-related topics, especially those related to COVID-19, may dominate the discussions.
- Political topics, particularly those associated with the Austrian People's Party (#övp) and specific figures like August Wöginger, may also be significant. The hashtags regarding the ORF (Austrian Broadcasting Corporation) such as #zib2 refer to the fact, that the ZIB2 is the news show just before ImZentrum.
- The variety of hashtags suggests a diverse range of topics, which could make topic modeling more challenging.

-> This leads to the decision to remove hashtags and mentions from the text.

# Topic modelling

In [19]:
#download stopwords
nltk.download('stopwords')
german_stopwords = nltk.corpus.stopwords.words('german')

#add elements to stopwords
url_elements = ["co", "https co", "https", "http", "www", "imzentrum"]
german_stopwords.extend(url_elements)

#delete hastags and mentions from text
df_tweets['text'] = df_tweets['text'].str.replace(r'@\S+', '')
df_tweets['text'] = df_tweets['text'].str.replace(r'#\S+', '')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PaulSchmidinger\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Define the model
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=german_stopwords)
ctfidf_model = ClassTfidfTransformer()

# Create the model
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  calculate_probabilities=True,        
  verbose=True,
  language="german"
)

In [21]:
# Fit the model to data
topics, _ = topic_model.fit_transform(df_tweets['text'])

2024-01-28 23:25:13,140 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/513 [00:00<?, ?it/s]

2024-01-28 23:30:45,301 - BERTopic - Embedding - Completed ✓
2024-01-28 23:30:45,301 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-28 23:31:21,649 - BERTopic - Dimensionality - Completed ✓
2024-01-28 23:31:21,649 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-28 23:34:00,232 - BERTopic - Cluster - Completed ✓
2024-01-28 23:34:00,422 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-28 23:34:01,294 - BERTopic - Representation - Completed ✓


In [22]:
#similarity heatmap
topic_model.visualize_heatmap(top_n_topics=30)

In [23]:
#similarity matrix
sim_matrix = cosine_similarity(topic_model.c_tf_idf_)
df_sim = pd.DataFrame(sim_matrix, columns=topic_model.topic_labels_.values(), index=topic_model.topic_labels_.values())
df_sim = df_sim.stack().reset_index().sort_values(0, ascending=False)
df_sim.columns = ['Topic 1', 'Topic 2', 'Similarity']

#select where Topic 1 != Topic 2 AND Similarity > 0.65
df_sim = df_sim[(df_sim['Topic 1'] != df_sim['Topic 2']) & (df_sim['Similarity'] > 0.65)]
df_sim

Unnamed: 0,Topic 1,Topic 2,Similarity
11532,41_horror_wahre_barbarakaufmann_experte,233_horror_wahre_barbarakaufmann_experte,0.995543
62988,233_horror_wahre_barbarakaufmann_experte,41_horror_wahre_barbarakaufmann_experte,0.995543
6101,21_fü_scherbenhaufen_beschäftigt_stehen,182_eva_scherbenhaufen_beschäftigt_stehen,0.700318
49249,182_eva_scherbenhaufen_beschäftigt_stehen,21_fü_scherbenhaufen_beschäftigt_stehen,0.700318


In [24]:
#merge topics
topics_to_merge = [[41,233],[21,182]]
topic_model.merge_topics(df_tweets['text'], topics_to_merge)

In [25]:
# Further reduce topics
topic_model.reduce_topics(df_tweets['text'], nr_topics=15)

2024-01-28 23:34:25,939 - BERTopic - Topic reduction - Reducing number of topics


2024-01-28 23:34:26,727 - BERTopic - Topic reduction - Reduced number of topics from 267 to 15


<bertopic._bertopic.BERTopic at 0x1af3bf58a90>

In [26]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4028,-1_rt_schon_orf_mehr,"[rt, schon, orf, mehr, ja, impfpflicht, reiter...",[rt @bertieroberts54: also eine frage beschäft...
1,0,9427,0_rt_orfimzentrum_övp_petrovic,"[rt, orfimzentrum, övp, petrovic, covid, long,...",[rt @orfimzentrum: 💬„milde verläufe nutzen nic...
2,1,1336,1_impfpflicht_impfung_rt_virus,"[impfpflicht, impfung, rt, virus, verschiedene...","[rt @shourahashemi: nein lieber gust, „das vir..."
3,2,450,2_3000_kind_hungrig_verständigen,"[3000, kind, hungrig, verständigen, wichtigste...","[rt @mom_inst: ""wir sollten uns darauf verstän..."
4,3,424,3_österreich_lafilledevienne_inflation_menschen,"[österreich, lafilledevienne, inflation, mensc...",[rt @lafilledevienne: 289.000 menschen sind in...
5,4,244,4_museum_dollfuss_dollfuß_zeitgemäß,"[museum, dollfuss, dollfuß, zeitgemäß, wöginge...",[rt @dieraffa: wöginger: „wegen dem dollfuß-mu...
6,5,115,5_strafe_verhängt_impfstrafverfügung_rauskommen,"[strafe, verhängt, impfstrafverfügung, rauskom...","[rt @peterbussjaeger: in dem punkt, dass über ..."
7,6,104,6_fatigue_symptom_gesamtstaatliche_chronische,"[fatigue, symptom, gesamtstaatliche, chronisch...",[rt @a_huss1: #imzentrum wir brauchen eine ges...
8,7,82,7_schulterschluss_brauche_gemeinsamen_liviaklingl,"[schulterschluss, brauche, gemeinsamen, liviak...",[rt @liviaklingl: wir brauchen einen gemeinsam...
9,8,69,8_pip_collage_photo_editor,"[pip, collage, photo, editor, camera, maker, i...",[rt @shakirk66596159: pip collage photo collag...


In [27]:
# Visualize topics
topic_model.visualize_topics()

In [29]:
# Visualize topics as a barchart
topic_model.visualize_barchart(top_n_topics=15)

In [34]:
#sort topics by frequency
df_topics = topic_model.get_topic_info()
df_topics = df_topics.sort_values('Count', ascending=False)
df_topics.head(6)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,9427,0_rt_orfimzentrum_övp_petrovic,"[rt, orfimzentrum, övp, petrovic, covid, long,...",[rt @orfimzentrum: 💬„milde verläufe nutzen nic...
0,-1,4028,-1_rt_schon_orf_mehr,"[rt, schon, orf, mehr, ja, impfpflicht, reiter...",[rt @bertieroberts54: also eine frage beschäft...
2,1,1336,1_impfpflicht_impfung_rt_virus,"[impfpflicht, impfung, rt, virus, verschiedene...","[rt @shourahashemi: nein lieber gust, „das vir..."
3,2,450,2_3000_kind_hungrig_verständigen,"[3000, kind, hungrig, verständigen, wichtigste...","[rt @mom_inst: ""wir sollten uns darauf verstän..."
4,3,424,3_österreich_lafilledevienne_inflation_menschen,"[österreich, lafilledevienne, inflation, mensc...",[rt @lafilledevienne: 289.000 menschen sind in...
5,4,244,4_museum_dollfuss_dollfuß_zeitgemäß,"[museum, dollfuss, dollfuß, zeitgemäß, wöginge...",[rt @dieraffa: wöginger: „wegen dem dollfuß-mu...


The provided excerpts represent different topics identified through topic modeling from tweets related to the Austrian public TV show "Im Zentrum" and the hashtag #imzentrum during the specified timeframe. Let's briefly discuss each identified topic:

1. **Topic 1: Long COVID and Neurological Aspects**
   - Keywords: ['rt', 'orfimzentrum', 'övp', 'petrovic', 'covid', 'long', 'neurostingl', 'milde', 'verläufe', 'warum']
   - Representative Documents: Tweets highlighting the importance of understanding the long-term effects of mild COVID-19 cases, particularly emphasizing insights from a neurologist (Neurostingl).

2. **Topic 2: Criticism of ORF and Impfpflicht (Vaccination Obligation)**
   - Keywords: ['rt', 'schon', 'orf', 'mehr', 'ja', 'impfpflicht', 'reiterec', 'heute', 'orfimzentrum', 'zib2']
   - Representative Documents: Tweets expressing questions and criticism about the need for certain guests on the show, as well as discussions about the vaccination obligation and related programs (ZIB2).

3. **Topic 3: Skepticism and Critique on Virus Response and Impfpflicht**
   - Keywords: ['impfpflicht', 'impfung', 'rt', 'virus', 'verschiedene', 'niemanden', 'shourahashemi', 'infektion', 'schützt', 'beschäftig']
   - Representative Documents: Tweets expressing skepticism and critique related to virus response, questioning the efficacy of various vaccines, and criticizing the focus on vaccination obligation.

4. **Topic 4: Social Issues - Child Hunger**
   - Keywords: ['3000', 'kind', 'hungrig', 'verständigen', 'wichtigste', 'sozia', 'mom_inst', 'bett', 'trifft', 'sollten']
   - Representative Documents: Tweets discussing the importance of addressing child hunger and the suggestion to agree on not letting any child go to bed hungry.

5. **Topic 5: Social and Economic Challenges in Austria**
   - Keywords: ['österreich', 'lafilledevienne', 'inflation', 'menschen', '000', 'einmalzahlungen', 'könnten', 'haushalten', 'rt', 'sozialleistungen']
   - Representative Documents: Tweets highlighting social and economic challenges in Austria, including discussions on working poor, inflation, and the financial situation of households.