# Corpus sub-sections

This script walks through two illustrative examples of zooming in on specific sub-sections of the corpus: the cost of early childhood education and care (ECEC) services and the sufficiency of unemployment insurance.

In [None]:
#Keyword lists used to select sub-sections of the corpus
#These have been lemmatized using the same language processing suite (Frog) so that they match the lemmatized tweets

#Words marking a tweet as concerned with cost
#NOTE(review): 'betaalbare' looks like an inflected form rather than a lemma - confirm it matches Frog's output
cost_words = [
    'goedkoop',
    'duur',
    'kostbaar',
    'kost',
    'prijs',
    'betaalbare',
    'onbetaalbaar',
]

#Words marking a tweet as concerned with sufficiency or insufficiency
low_words = [
    'weinig',
    'minder',
    'laag',
    'lager',
    'genoeg',
    'voldoende',
]

#Words marking a tweet as concerned with unemployment insurance
unemployment_words = [
    'uwv',
    'uitkering',
    'bijstand',
    'werkeloosheidsverzekering',
    'werkeloosheid',
    'ww',
]

#Pronouns used to decide whether a tweet is personal
pronouns = [
    'ik',
    'mij',
    'mijn',
    'me',
    'we',
    'wij',
    'ons',
]

In [None]:
#LOADING INDIVIDUAL TWEETS - ECEC COST
#Builds "tweets" (tweet ID -> list of cleaned lemmas) from the ECEC tweets that
#contain both a cost word and a pronoun, then derives the gensim dictionary and
#bag-of-words corpus from them.
import json
import logging
from os import listdir

from gensim import corpora
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

path = 'PATH'
tweets = {}
stops = set(stopwords.words('dutch'))
#keywords = ['ecec', 'lm_programmes', 'lm_employment', 'lm_phrases']

#The directory is listed once; the month loop below only filters this list
files = listdir(path)

#Files are visited month by month in the order given here, which fixes the order tweets are stored in
for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']:
    for filename in files:
        #The part of the file name before the first underscore is compared with the month
        if filename.split('_')[0] != month:
            continue
        with open(path + '/' + filename, 'r') as infile:
            data = json.loads(infile.read())

        for identifier, record in data.items():
            #Only focusing on ECEC tweets
            if 'ecec' not in record['keyword_groups']:
                continue

            #NOTE(review): "word in lemmatized" is a token match if 'lemmatized' is a list
            #and a substring match if it is a string - confirm which one applies
            lemmatized = record['lemmatized']
            #Establishing if a tweet is concerned with cost
            cost_concerned = any(word in lemmatized for word in cost_words)
            #Establishing if a tweet is personal
            personal = any(word in lemmatized for word in pronouns)
            if not (cost_concerned and personal):
                continue

            #Actually saving the tweet
            cleaned = []
            for token in record['full_frog']:
                #Removing punctuation
                if token['dep'] == 'punct':
                    continue
                #Lowercasing BEFORE the stopword check, so that a capitalised stopword
                #is filtered out as well instead of entering the corpus in lower case
                lemma = token['lemma'].lower()
                #Removing stopwords
                if lemma not in stops:
                    cleaned.append(lemma)
            tweets[identifier] = cleaned

#Both are built from tweets.values(), so document i of the corpus belongs to the i-th key of "tweets"
dictionary = corpora.Dictionary(tweets.values())
corpus = [dictionary.doc2bow(tweet) for tweet in tweets.values()]

In [None]:
#LOADING INDIVIDUAL TWEETS - UNEMPLOYMENT ADEQUACY
#Builds "tweets" (tweet ID -> list of cleaned lemmas) from the labour market tweets
#that contain an unemployment word, a (in)sufficiency word and a pronoun, then
#derives the gensim dictionary and bag-of-words corpus from them.
import json
import logging
from os import listdir

from gensim import corpora
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

path = 'PATH'
tweets = {}
stops = set(stopwords.words('dutch'))
#keywords = ['ecec', 'lm_programmes', 'lm_employment', 'lm_phrases']

#Keyword groups that mark a tweet as labour market related
labour_market_groups = ('lm_programmes', 'lm_employment', 'lm_phrases')

#The directory is listed once; the month loop below only filters this list
files = listdir(path)

#Files are visited month by month in the order given here, which fixes the order tweets are stored in
for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']:
    for filename in files:
        #The part of the file name before the first underscore is compared with the month
        if filename.split('_')[0] != month:
            continue
        with open(path + '/' + filename, 'r') as infile:
            data = json.loads(infile.read())

        for identifier, record in data.items():
            #Establishing a tweet is relevant to one of the labour market keywords.
            #Each group is tested on its own: chaining bare strings with "or" is always
            #true, because a non-empty string is truthy, and would let every tweet through
            keyword_groups = record['keyword_groups']
            if not any(group in keyword_groups for group in labour_market_groups):
                continue

            #NOTE(review): "word in lemmatized" is a token match if 'lemmatized' is a list
            #and a substring match if it is a string - confirm which one applies
            lemmatized = record['lemmatized']
            #Establishing if a tweet is concerned with unemployment insurance
            unemployment_concerned = any(word in lemmatized for word in unemployment_words)
            #Establishing if a tweet is concerned with sufficiency and insufficiency
            low_concerned = any(word in lemmatized for word in low_words)
            #Establishing if a tweet is personal
            personal = any(word in lemmatized for word in pronouns)
            if not (unemployment_concerned and low_concerned and personal):
                continue

            #Actually saving the tweet
            cleaned = []
            for token in record['full_frog']:
                #Remove punctuation
                if token['dep'] == 'punct':
                    continue
                #Lowercase BEFORE the stopword check, so that a capitalised stopword
                #is filtered out as well instead of entering the corpus in lower case
                lemma = token['lemma'].lower()
                #Remove stopwords
                if lemma not in stops:
                    cleaned.append(lemma)
            tweets[identifier] = cleaned

#Both are built from tweets.values(), so document i of the corpus belongs to the i-th key of "tweets"
dictionary = corpora.Dictionary(tweets.values())
corpus = [dictionary.doc2bow(tweet) for tweet in tweets.values()]

## LDA Model

The following generates LDA models of the loaded corpus sub-section and saves visualisations for them.

In [None]:
from collections import defaultdict
import json
import logging

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
import pyLDAvis.gensim


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#Fill the "numbers" list with every number of topics a model should be trained for.
#While the list is empty the loop below does nothing.
numbers = []

#One model is trained per entry; "lda" keeps the model of the last entry after the loop
for number in numbers:
    lda = LdaModel(
        corpus,
        num_topics=number,
        id2word=dictionary,
        alpha='auto',
        eta='auto',
        random_state=808,
        passes=50,
        iterations=100000000,
        gamma_threshold=0.001,
        chunksize=50,
    )

    #Saving visualisation
    #sort_topics=False keeps the topic numbering of the visualisation aligned with the gensim model
    html_file = "lm_{}.html".format(number)
    vignette = pyLDAvis.gensim.prepare(
        lda,
        corpus,
        dictionary,
        n_jobs=2,
        sort_topics=False,
        lambda_step=1.1,
    )
    pyLDAvis.save_html(vignette, html_file)

    #Saving model itself (uncomment to keep the trained model on disk)
    #model_path = "ecec_{}".format(number)
    #lda.save(model_path)

    print('Finished for {} alpha'.format(number))

In [None]:
#OPTIONAL STEP

#This step is necessary ONLY if you remove the "sort_topics=False" attribute from generating the vignette.
#That is because then the topic numbers between gensim model and pyLDAvis visualisation do not correspond to one another

#Load a specific model and print the relevant topics - necessary to establish which topics need to be focused on

from gensim.models import LdaModel
#Insert path to model here
model_file = r"ecec_10"
lda = LdaModel.load(model_file)

#The number of topics is read from the loaded model, so it does not have to be
#kept in sync by hand with the model file chosen above
for topic in range(lda.num_topics):
    print('Topic ' +  str(topic))
    #Prints the 20 highest-weighted words of the topic
    print(lda.print_topic(topic, topn=20))

## Printing most representative tweets of topics

Before starting this process, the selected model should be loaded as "lda", and you should manually select a topic (or multiple topics) and note the numbers of those topics (in the gensim model).

The result is a word document that lists the X tweets most representative of the topic Y from the loaded LDA model. The document includes the phi-score, user name, display name, and unprocessed text of the tweet (unprocessed for readability).

In [None]:
#Writes the tweets that score highest on one selected topic of the loaded model to a text file.
#Expects "lda", "corpus", "tweets" and "path" to be defined by the earlier cells.
import json
from collections import defaultdict
from operator import itemgetter
from os import listdir

#Which topic is relevant
selected_topic = 1
#How many top tweets to save
top_tweets = 50

#Key under which the score of the selected topic is stored for each tweet
topic_key = 'topic' + str(selected_topic)

all_topics = lda.get_document_topics(corpus, per_word_topics=False)

topic_info = defaultdict(dict)

#Generates a dictionary where tweet ID is the key and value is another
#dictionary where the key is "topicx" and value is the score for that topic.
#The corpus was built from tweets.values(), so zipping with tweets.keys()
#pairs every tweet ID with the topic list of its own document.
for identifier, topics in zip(tweets.keys(), all_topics):
    for topic_id, score in topics:
        if topic_id == selected_topic:
            topic_info[identifier][topic_key] = score

#This information can be included for any other topics of interest

#Creates a selection of only tweets that are at least partially constituted from the topic of interest
selection = {}
#The directory is listed once; the month loop below only filters this list
files = listdir(path)
for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']:
    for filename in files:
        #The part of the file name before the first underscore is compared with the month
        if filename.split('_')[0] != month:
            continue
        with open(path + '/' + filename, 'r') as infile:
            data = json.loads(infile.read())
        for identifier, record in data.items():
            if identifier in topic_info:
                #The score is attached to the full tweet record so it can be sorted and printed
                record[topic_key] = topic_info[identifier][topic_key]
                selection[identifier] = record

#Highest score first
sort = sorted(selection.values(), key=itemgetter(topic_key), reverse=True)

#Change the name of the file tweets get saved into
#The with-block closes the file even when a tweet lacks one of the expected fields
with open(r"name.doc", "w", encoding="utf-8") as text_file:
    for tweet in sort[:top_tweets]:
        text_file.write('Phi score: ' + str(tweet[topic_key]))
        #NOTE(review): 'Next User' directly follows the score without a separator and no
        #user name is written - looks like a placeholder, confirm the intended layout
        text_file.write('Next User')
        text_file.write('\n')
        #Truncated tweets carry their complete text under 'extended_tweet'
        if tweet['truncated'] is True:
            text_file.write(tweet['extended_tweet']['full_text'])
        else:
            text_file.write(tweet['text'])

        text_file.write('\n')
        text_file.write('\n')