In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import pandas as pd
import pickle
from pprint import pprint
import re
from stop_words import get_stop_words
from trankit import Pipeline

## 1. Load Data

In [2]:
# File locations used by this notebook (absolute paths under /home/robin).
# The two commented-out paths below are alternative inputs that are currently disabled.
#tweet_json_climate_path = '/home/robin/research/phd/twitter_climate_politics/data/tweets/total/tweets_climate.json'
#tweet_climate_concise_path = '/home/robin/research/phd/twitter_climate_politics/data/tweets/total/tweets_climate_concise.csv'
# CSV of climate tweets; read below for its 'text' and 'text_lemmatized' columns.
tweet_climate_lemmatized_concise_path = '/home/robin/research/phd/twitter_climate_politics/data/tweets/total/tweets_climate_lemmatized_concise.csv'
# CSV of all tweets; read below for its 'text' column.
tweet_total_concise_path = '/home/robin/research/phd/twitter_climate_politics/data/tweets/total/tweets_total_concise.csv'

# Directory for topic models; not referenced by any of the cells shown in this notebook section.
topic_model_path = '/home/robin/research/phd/twitter_climate_politics/models/topic_models/'

In [None]:
# Disabled alternative: load the climate tweets without the lemmatized column.
#tweets_climate_df = pd.read_csv(tweet_climate_concise_path, lineterminator='\n')
#print(tweets_climate_df.shape)

# Read both CSVs, passing '\n' explicitly as the line terminator, and print
# each frame's (rows, columns) shape as a quick sanity check.
tweets_climate_df = pd.read_csv(tweet_climate_lemmatized_concise_path, lineterminator='\n') # includes lemmatized data
print(tweets_climate_df.shape)
tweets_total_df = pd.read_csv(tweet_total_concise_path, lineterminator='\n')
print(tweets_total_df.shape)

# Pull the text columns out as plain Python lists for the processing cells below.
tweets_climate = tweets_climate_df['text'].tolist()
tweets_climate_lemmatized = tweets_climate_df['text_lemmatized'].tolist()
tweets_total = tweets_total_df['text'].tolist()

# Report how many entries each list holds.
print('Tweets Climate Size: {}'.format(len(tweets_climate)))
print('Tweets Climate Lemmatized Size: {}'.format(len(tweets_climate_lemmatized)))
print('Tweets Total Size: {}'.format(len(tweets_total)))

In [4]:
# Make every lemmatized entry a string.
# Float entries in a text column are typically NaN placeholders for empty CSV
# cells; str(nan) would inject the literal text 'nan' into the corpus, so
# missing values become the empty string instead. Any other float is
# stringified as before.
for i, tweet in enumerate(tweets_climate_lemmatized):
    if isinstance(tweet, float):
        tweets_climate_lemmatized[i] = '' if pd.isna(tweet) else str(tweet)

## 2. LDA

### 2.1. Preprocessing

#### 2.1.2 Lemmatization

In [None]:
# remove leading/trailing whitespace from every tweet
# (rebinds tweets_climate; every entry must be a str, since .strip() is called on it)
tweets_climate = [tweet.strip() for tweet in tweets_climate]

In [None]:
# Shared trankit pipeline configured for 'german'; lemmatize_tweets() below
# reads this module-level name, so this cell must run before it is called.
trankit_pipeline = Pipeline('german')

In [8]:
def lemmatize_tweets(tweets):
    '''
    Lemmatize tweets with the module-level ``trankit_pipeline``.

    Each non-empty tweet is passed to ``trankit_pipeline.lemmatize`` and the
    lemmas of all tokens in all returned sentences are joined with single
    spaces. Empty tweets are passed through unchanged so that the output stays
    aligned with the input.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to lemmatize.

    Returns
    -------
    list of str
        One lemmatized string per input tweet, in input order.

    Raises
    ------
    KeyError
        If a token carries neither a 'lemma' nor an 'expanded' entry.
    '''
    lemmatized_tweets = []

    for i, tweet in enumerate(tweets):
        if len(tweet) > 0:
            lemmatized_tweet = trankit_pipeline.lemmatize(tweet)
            lemmas = []
            for sent in lemmatized_tweet['sentences']:
                for token in sent['tokens']:
                    if 'lemma' in token:
                        lemmas.append(token['lemma'])
                    else:
                        # Tokens without their own lemma keep the lemmas on
                        # their 'expanded' sub-tokens (presumably trankit's
                        # multi-word tokens -- confirm against trankit docs).
                        expanded_lemmas = [expanded_token['lemma'] for expanded_token in token['expanded']]
                        lemmas.extend(expanded_lemmas)

            lemmatized_tweets.append(' '.join(lemmas))
        else:
            lemmatized_tweets.append(tweet)

        # progress report every 100 tweets
        if i % 100 == 0:
            print('Lemmatized Tweets: {}'.format(i))

    return lemmatized_tweets

In [None]:
# Replace the raw tweets with their lemmatized form (rebinds tweets_climate,
# so re-running this cell lemmatizes the already lemmatized text again).
tweets_climate = lemmatize_tweets(tweets_climate)

In [3]:
# Load lemmatized tweets from a pickle file; this overwrites whatever
# tweets_climate held before (including the result of lemmatize_tweets above).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
lemmatized_path = '/home/robin/research/phd/twitter_climate_politics/notebooks/data_test/lemmatized_tweets.pkl'

with open(lemmatized_path, 'rb') as f_in:
    tweets_climate = pickle.load(f_in)

#### 2.1.3 Lower Casing

In [4]:
# lower-case every tweet so the later stop-word comparison works on lower-case tokens
tweets_climate = [tweet.lower() for tweet in tweets_climate]

In [None]:
# Manual inspection aid: print every tweet containing the substring 'parole',
# each followed by blank lines for readability.
for tweet in tweets_climate:
    if 'parole' in tweet:
        print(tweet)
        print('\n')

#### 2.1.4 Remove Stop Words

In [6]:
def remove_stopwords(tweets):
    '''
    Remove German stop words from whitespace-separated tweets.

    The stop-word list from ``get_stop_words('de')`` is extended with a few
    corpus-specific tokens and with an umlaut-free spelling (ü -> u, ö -> o,
    ä -> a) of every stop word. Tokens are obtained with ``str.split()`` and
    compared verbatim, so tweets should already be lower-cased.

    Parameters
    ----------
    tweets : iterable of str
        Tweets to filter.

    Returns
    -------
    tweets_no_stopwords : list of str
        The filtered tweets, in input order, tokens re-joined with single
        spaces and a trailing 'er' after digits removed (1960er -> 1960).
    stopwords_german : set of str
        The full stop-word set that was applied.
    '''
    stopwords_german = get_stop_words('de').copy()

    ## extend stopwords
    stopwords_german.extend(['amp', '&ampf', 'er|es|sie', 'sie|sie', 'er|es'])
    stopwords_german.extend(['müssen', 'sollen', 'dürfen', 'lassen'])
    stopwords_german.extend(['mehr', 'weniger'])
    # add climate tokens to stopwords
    #stopwords_german.extend(['klimaschutz', 'klima'])
    # add climate keywords (used for tweet filtering)
    #stopwords_german.extend(['cop26', 'cop26glasgow', 'klimagipfel', 'Weltklimagipfel',
    #        'klimakonferenz', 'klimakrise', 'klimawandel',
    #        'klimanotstand', 'klimahysterie', 'energiewende', 'fff'])

    # Also match stop words typed without umlauts. translate() replaces every
    # umlaut in a token, including tokens that mix different umlauts.
    umlaut_map = str.maketrans({'ü': 'u', 'ö': 'o', 'ä': 'a'})
    stopwords_german = set(stopwords_german) | {
        stopword.translate(umlaut_map) for stopword in stopwords_german
    }
    print('Number of Stopwords: {}'.format(len(stopwords_german)))

    ## remove stopwords
    # remove 'er' following years, e.g. 1960er -> 1960
    year_suffix = re.compile(r'([0-9]+)er')
    tweets_no_stopwords = []

    # Iterate over the argument, not over a notebook-level variable.
    for tweet in tweets:
        tweet_no_stopwords = ' '.join(token for token in tweet.split() if token not in stopwords_german)
        tweets_no_stopwords.append(year_suffix.sub(r'\1', tweet_no_stopwords))

    return tweets_no_stopwords, stopwords_german

In [None]:
# Filter stop words from the tweets and keep the stop-word set as a notebook-level
# name (the token-level remove_stopwords defined further down reads it).
tweets_climate, stopwords_german = remove_stopwords(tweets_climate)
# number of tweets returned by the filtering step
print(len(tweets_climate))

### 2.2. Vectorization

In [132]:
def sent_to_words(tweets):
    '''
    Lazily tokenize tweets with gensim's ``simple_preprocess``.

    Yields one token list per input tweet. Each tweet is coerced with
    ``str()`` first; tokens shorter than 4 characters are dropped
    (``min_len=4``), and ``deacc=False`` is passed so that accent marks are
    not stripped from the tokens.
    '''
    for raw_text in tweets:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=False, min_len=4)
        yield tokens

In [133]:
# Materialize the tokenizer generator: one list of tokens per tweet.
tweet_words = list(sent_to_words(tweets_climate))

In [135]:
def remove_stopwords(tweets):
    '''
    Drop stop words from tokenized tweets.

    NOTE: this reuses the name of the string-level ``remove_stopwords``
    defined earlier in this notebook and shadows it once this cell has run.

    Parameters
    ----------
    tweets : iterable
        Each item is either a list/tuple of tokens, which is filtered as is,
        or any other object, which is coerced with ``str()`` and tokenized
        with ``simple_preprocess`` first.

    Returns
    -------
    list of list of str
        One token list per input item, without the tokens found in the
        notebook-level ``stopwords_german`` set.
    '''
    filtered_tweets = []
    for tweet in tweets:
        if isinstance(tweet, (list, tuple)):
            # Already tokenized: filter the tokens directly. Stringifying the
            # list and re-tokenizing its repr only worked by accident.
            words = tweet
        else:
            words = simple_preprocess(str(tweet))
        filtered_tweets.append([word for word in words if word not in stopwords_german])
    return filtered_tweets

In [136]:
# Second stop-word pass, this time on the token lists (uses the token-level
# remove_stopwords defined in the cell above, which reads stopwords_german).
tweet_words = remove_stopwords(tweet_words)

### 2.3. Modeling

Topic Distribution

P(topic t | post d) = number of words in post d allocated to topic t / total number of words in post d

Word Distribution

P(word w | topic t) = number of times word w is assigned to topic t in all posts in the collection / total number of words assigned to topic t in all posts in the collection

In [137]:
# Build the gensim Dictionary (token <-> integer id mapping) from the token lists.
id2word = corpora.Dictionary(tweet_words)

In [None]:
# Display the number of entries in the dictionary (vocabulary size).
len(id2word)

In [139]:
# Convert every tweet's token list to its bag-of-words representation via the dictionary.
corpus = [id2word.doc2bow(words) for words in tweet_words]

In [143]:
# number of topics
k = 5
# Build LDA model
# LDA training is stochastic; a fixed random_state seeds the model so topics
# can be compared between runs.
# NOTE(review): LdaMulticore trains with worker processes, so results may still
# differ slightly between runs -- confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=k,
                                       random_state=42)

In [None]:
# Print the 10 highest-weighted keywords for each of the k topics
pprint(lda_model.print_topics(num_topics=k, num_words=10))
# Apply the trained model to the corpus
# (presumably yields one topic distribution per tweet -- confirm against the gensim docs).
doc_lda = lda_model[corpus]