# LDA & NMF models

This script is used to train and visualise LDA and NMF models. There are three different ways to aggregate the existing tweets:
<ol>
<li>Individual documents correspond to individual tweets</li>
    
<li>Individual documents correspond to the aggregate of all tweets a single account authored</li>

<li>Individual documents correspond to the aggregate of all tweets a single account authored in a given month</li>
</ol>

Through the iterative process, the first (and simplest) way resulted in the most interpretable models. That is also the aggregation used to generate all models presented in the appendix.

The first three blocks of code correspond to these three ways of aggregating the tweets and are compatible with either the LDA model or the NMF model that follow.

In [None]:
#This loads individual tweets and converts them into a corpus for gensim
#Result: `tweets`, a list with one entry per tweet, each entry being that
#tweet's list of lowercased lemmas without punctuation and stopwords.
#IMPORTS
from os import listdir
import json
import logging
from gensim import corpora
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#Path to processed tweets: a directory of JSON files whose names start with '<MM>_'
path = r'PATH'
tweets = []
#Stopwords as a set so that every membership test is a hash lookup
stops = set(stopwords.words('dutch'))

#The directory listing is the same for every month, so it is read only once
files = listdir(path)

for month in ['01','02','03','04','05','06','07','08','09','10','11','12']: # processes the files month by month
    for file in files:
        #Skip files whose name prefix (the part before the first '_') is another month
        if file.split('_')[0] != month:
            continue
        with open(path + '/' + file, 'r') as infile:
            data = json.load(infile)
        #Each value holds one tweet; 'full_frog' is its list of parsed tokens
        for record in data.values():
            tweet = []
            for token in record['full_frog']:
                #Removing punctuation
                if token['dep'] == 'punct':
                    continue
                #Lowercasing all tokens. This is done BEFORE the stopword test:
                #testing the raw lemma let capitalised stopwords slip through
                #the filter and end up in the corpus in lowercased form.
                lemma = token['lemma'].lower()
                #Removing stopwords
                if lemma not in stops:
                    tweet.append(lemma)
            tweets.append(tweet)


In [None]:
#ALTERNATIVE TO THE SCRIPT ABOVE
#This loads tweets aggregated to the level of authors (accounts), resulting in an author-LDA model.
#Result: `tweets`, a list with one entry per author, each entry being the
#concatenated cleaned lemmas of all tweets that author wrote.
#IMPORTS
from os import listdir
from collections import defaultdict
import json
from gensim import corpora
import logging
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#Path to processed tweets: a directory of JSON files whose names start with '<MM>_'
path = 'PATH'
#Defined here as well, so this cell also runs when the cell above was skipped
stops = set(stopwords.words('dutch'))
#user id -> {'text': all lemmas of the author, 'tweets': tweet count, 'user_info': user record}
author_tweets = defaultdict(dict)

#The directory listing is the same for every month, so it is read only once
files = listdir(path)

for month in ['01','02','03','04','05','06','07','08','09','10','11','12']: # processes the files month by month
    for file in files:
        #Skip files whose name prefix (the part before the first '_') is another month
        if file.split('_')[0] != month:
            continue
        with open(path + '/' + file, 'r') as infile:
            data = json.load(infile)
        for record in data.values():
            user = record['user']['id']
            tweet = []
            for token in record['full_frog']:
                #Removing punctuation
                if token['dep'] == 'punct':
                    continue
                #Lowercasing all tokens, BEFORE the stopword test so that
                #capitalised stopwords are filtered out as well
                lemma = token['lemma'].lower()
                #Removing stopwords
                if lemma not in stops:
                    tweet.append(lemma)
            #The former code indexed author_tweets with an undefined name
            #(`keyword`) and tested membership on a different level than it
            #wrote to; authors are now keyed directly by their user id.
            if user in author_tweets:
                author_tweets[user]['text'].extend(tweet)
                author_tweets[user]['tweets'] += 1
            else:
                author_tweets[user] = {
                    'text': tweet,
                    'tweets': 1,
                    'user_info': record['user'],
                }

#Creating list of author-tweets
tweets = [info['text'] for info in author_tweets.values()]


In [None]:
#ALTERNATIVE TO THE TWO SCRIPTS ABOVE
#This loads tweets aggregated to the level of authors (accounts) AND to individual months.
#Result: `tweets`, a list with one entry per (author, month) combination,
#each entry being the cleaned lemmas that author wrote in that month.
#IMPORTS
from os import listdir
from collections import defaultdict
import json
from gensim import corpora
import logging
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#Path to processed tweets: a directory of JSON files whose names start with '<MM>_'
path = 'PATH'
#Defined here as well, so this cell also runs when the cells above were skipped
stops = set(stopwords.words('dutch'))
#user id -> {'text': list of monthly documents, 'tweets': tweet count, 'user_info': user record}
author_tweets = {}

#The directory listing is the same for every month, so it is read only once
files = listdir(path)

for month in ['01','02','03','04','05','06','07','08','09','10','11','12']: # processes the files month by month
    #Lemmas per author for the current month. This is created once per MONTH
    #(it used to be reset for every file), so a month that is spread over
    #several files still yields a single document per author.
    monthly = defaultdict(list)
    for file in files:
        #Skip files whose name prefix (the part before the first '_') is another month
        if file.split('_')[0] != month:
            continue
        with open(path + '/' + file, 'r') as infile:
            data = json.load(infile)
        for record in data.values():
            user = record['user']['id']
            if user in author_tweets:
                author_tweets[user]['tweets'] += 1
            else:
                author_tweets[user] = {
                    'text': [],
                    'tweets': 1,
                    'user_info': record['user'],
                }
            #Looking the author up registers them for this month even when
            #the tweet turns out to contain no usable tokens
            month_tokens = monthly[user]
            for token in record['full_frog']:
                #Removing punctuation
                if token['dep'] == 'punct':
                    continue
                #Lowercasing all tokens, BEFORE the stopword test so that
                #capitalised stopwords are filtered out as well
                lemma = token['lemma'].lower()
                #Removing stopwords
                if lemma not in stops:
                    month_tokens.append(lemma)

    #Store this month's document with every author who tweeted in it
    for user, month_tokens in monthly.items():
        author_tweets[user]['text'].append(month_tokens)

#Creating list of author-tweets
tweets = []
for info in author_tweets.values():
    tweets.extend(info['text'])



### The LDA Model

In [None]:
#This trains, saves, and visualises an LDA model
#It expects `tweets` (a list of token lists) from one of the loading cells above.
#IMPORTS
from gensim.models import LdaModel
from collections import defaultdict
from gensim import corpora
import logging
import json
import pyLDAvis
import pyLDAvis.gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#generating corpus and dictionary for gensim
dictionary = corpora.Dictionary(tweets, prune_at=None)
corpus = list(map(dictionary.doc2bow, tweets))

#Include topic numbers to model for in this list
numbers = []

#Training settings shared by all models; only the number of topics differs
lda_settings = dict(
    id2word=dictionary,
    alpha='auto',
    eta='auto',
    random_state=808,
    passes=10,
    iterations=100000000,
    gamma_threshold=0.002,
    chunksize=50000,
)

#Models
for number in numbers:
    #This trains the model itself.
    lda = LdaModel(corpus, num_topics=number, **lda_settings)

    #Change the temp file here to where you want the visualisation stored
    temp_file = r"C:/LDA_" + str(number) + ".html"
    vignette = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(vignette, temp_file)

    #Change the temp file here to where you want the model file stored
    temp_file = r"C:/Models/LDA_" + str(number)
    lda.save(temp_file)

    print('Finished for '+ str(number) +' topics')



### The NMF Model

In [None]:
#This trains, saves, and visualises an NMF model
#It expects `tweets` (a list of token lists) from one of the loading cells above.
#IMPORTS
import logging
import numpy as np
from sklearn.decomposition import NMF
import pyLDAvis
import pyLDAvis.sklearn
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#generating vectorizer for SKlearn
def nothing(x):
    """Return x unchanged.

    Used as both tokenizer and preprocessor of the vectorizer, because the
    documents in `tweets` are already lists of cleaned tokens.
    """
    return x
cv = CountVectorizer(tokenizer = nothing, preprocessor = nothing)
fitted = cv.fit_transform(tweets)

#Include topic numbers to model for in this list
numbers = [35]

#Models
for number in numbers:
    #This trains the NMF model
    #NOTE(review): newer scikit-learn releases replaced `alpha` with
    #`alpha_W`/`alpha_H` - confirm against the installed version.
    nmf = NMF(n_components=number, alpha=0.5, max_iter=100000, init='nndsvd', random_state=808)
    W = nmf.fit_transform(fitted)

    #Change the temp file here to where you want the model file stored
    temp_file = r"C:/Notebooks/Appendix_Models/NMF_"+str(number)+"_label"
    #The context manager closes (and thereby flushes) the file once the model is written
    with open(temp_file, 'wb') as outfile:
        pickle.dump(nmf, outfile)

    #Change the temp file here to where you want the visualisation stored
    temp_file = r"C:/Notebooks/Appendix_Vignette/NMF_"+str(number)+"_label.html"
    vignette = pyLDAvis.sklearn.prepare(nmf, fitted, cv, lambda_step=1.1, sort_topics=False)
    pyLDAvis.save_html(vignette, temp_file)

    print('Finished for '+ str(number) +' topics')
