In [1]:
from datetime import datetime
import glob
import pandas as pd
import re

In [2]:
#functions
def createTwDB():
    """
    Creates a consolidated DataFrame from multiple CSV files containing tweet data,
    and saves the resulting DataFrame to a CSV file.

    Returns:
    str: The filename of the saved CSV file containing all the tweets.
    """
    all_files = glob.glob("savedTweets/*/*.csv")

    #ceate df from all files
    li_tw_files = []
    for f in all_files:
        df = pd.read_csv(f, index_col=None, header=None, sep="\t")
        li_tw_files.append(df)
    df_tw = pd.concat(li_tw_files, axis=0, ignore_index=True)
    df_tw.columns = ['id', 'user', 'created_at', 'source', 'in_reply_to_status_id', 'in_reply_to_user_id', 'in_reply_to_screen_name', 'retweet_count', 'text']
    df_tw = df_tw.set_index("id")

    #save df
    filename = "savedTweets/db_all_tweets.csv"
    df_tw.to_csv(filename, sep="\t", encoding="utf-8")

    return filename


def extract_hashtags(text):
    """
    Extracts hashtags from the given text.

    Args:
    text (str): The text from which hashtags will be extracted.

    Returns:
    list: A list of hashtags found in the given text.
    """

    return re.findall(r'#\w+', text)

In [3]:
#create tweet-database
csv_tweets = createTwDB()

In [4]:
#create dataframe from tweet-database
df_all_tweets = pd.read_csv(csv_tweets, sep = '\t', encoding = 'utf-8')
df_all_tweets.head()

Unnamed: 0,id,user,created_at,source,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,retweet_count,text
0,1457452860258520000,PaulSchmidinger,2021-11-07 21:00:14+00:00,Twitter Web App,,,,0,Gleich geht's los! #ImZentrum
1,1457452886959480000,PaulSchmidinger,2021-11-07 21:00:21+00:00,Twitter Web App,,,,0,Gleich geht's los mit ImZentrum
2,1457454027030020000,ExplainSimple,2021-11-07 21:04:52+00:00,Twitter Web App,,222435529.0,ORFImZentrum,0,@ORFImZentrum Welchen radikalen Wandel es welt...
3,1457455881650670000,YveBiskupska,2021-11-07 21:12:15+00:00,Twitter for Android,,,,0,Club of Rome ... das des no wer kennt... Heute...
4,1457457694214860000,StefanKaineder,2021-11-07 21:19:27+00:00,Twitter for iPhone,,,,0,Jetzt geht’s los! Heute #imzentrum mit @reiter...


In [5]:
#select relevant columns and preprocess text
df_tweets = df_all_tweets[['id','user','text']]
df_tweets['text'] = df_tweets['text'].str.lower()
df_tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets['text'] = df_tweets['text'].str.lower()


Unnamed: 0,id,user,text
0,1457452860258520000,PaulSchmidinger,gleich geht's los! #imzentrum
1,1457452886959480000,PaulSchmidinger,gleich geht's los mit imzentrum
2,1457454027030020000,ExplainSimple,@orfimzentrum welchen radikalen wandel es welt...
3,1457455881650670000,YveBiskupska,club of rome ... das des no wer kennt... heute...
4,1457457694214860000,StefanKaineder,jetzt geht’s los! heute #imzentrum mit @reiter...


In [18]:
#count tweets per user	
df_tweets_count = df_tweets[['id','user']].groupby('user').count().sort_values(by='id',ascending=False)
df_tweets_count.head(10)

Unnamed: 0_level_0,id
user,Unnamed: 1_level_1
KnutOgris,211
ORFImZentrum,155
drakkalas,151
TraudePinter,117
Mhs_ThatFace,104
marcus_didius,86
grafkaroly,84
reiterec,81
uebersleben,81
zuschoen,77


In [23]:
#extract hashtags
df_tweets['hashtags'] = df_tweets['text'].apply(extract_hashtags)
li_hashtags=[item for sublist in df_tweets['hashtags'].tolist() for item in sublist]
df_hashtags = pd.DataFrame(li_hashtags, columns=['hashtag'])

#count hashtags
df_hashtags_count = df_hashtags.groupby('hashtag').size().sort_values(ascending=False)
df_hashtags_count = df_hashtags_count.drop('#imzentrum')
df_hashtags_count.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets['hashtags'] = df_tweets['text'].apply(extract_hashtags)


hashtag
#longcovid      298
#zib2           268
#petrovic       258
#impfpflicht    235
#orf            169
#övp            152
#wöginger       108
#reich          102
#im              99
#sideletter      96
dtype: int64

In [None]:
#topic modelling with bertopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Define the model
model = SentenceTransformer('sentence-transformers/bert-base-multilingual-uncased')
topic_model = BERTopic(language="german", embedding_model=model)

# Fit the model on your data
topics, _ = topic_model.fit_transform(df['text'])

# Get the most frequent topics
most_frequent_topics = topic_model.get_topic_freq()

# Visualize the topics
topic_model.visualize_topics()

# Get the most representative tweets for each topic
representative_tweets = topic_model.get_representative_docs()

# Get the dominant topic for each document
df['dominant_topic'] = topic_model.predict(df['text'])

# Print or analyze the results as needed
print(most_frequent_topics)
print(representative_tweets)
print(df[['text', 'dominant_topic']])