# Loading tweets

This script loads the stored tweets and pre-processes them. This includes the following steps:

<ol>
<li>Removing retweets (both formally identified and informally identified by beginning with "RT").</li>

<li>Removing duplicate tweets.</li>

<li>Removing tweets whose authors post too frequently (assumed bots); the information on tweeting frequency is also stored as meta-data on the individual tweets.</li>

<li>Adding a processed version of quoted tweets by joining the text of the quoted tweet and the original text.</li>

<li>Adding meta-data to tweets indicating which groups of keywords are represented in the text of the tweet.</li>
</ol>

The script can be made more efficient: it currently stores multiple copies of the monthly data as it pre-processes. This was useful for bug-fixing, but should be removed if memory constraints become a problem (instead of generating a new version of the "data" dictionary, each step can simply re-write the records in the original "data" dictionary).

In [None]:
#IMPORTS
from os import listdir
import json
from collections import defaultdict
from datetime import datetime

#LOADING FROM RAW
#Reads every daily file in `path` whose name has the current month as its
#second '-'-separated field and indexes the parsed tweets by their "id".

path = "C:/Notebooks/new"
unreadable_path = "C:/Notebooks/Twitter_second/Unreadable.txt"
removals = defaultdict(int)  #running totals, accumulated over all months

#The first unreadable line of a run overwrites the log, every later one is
#appended, so the file ends up holding all rejected lines of the run instead
#of only the last one.
unreadable_mode = 'w'

for month in ['08', '09']: #controls for month
    data = {}
    corrupted_counter = 0
    for day in listdir(path):
        name_parts = day.split('-')
        #skip entries that have no '-'-separated month field or belong to another month
        if len(name_parts) < 2 or name_parts[1] != month:
            continue
        with open(path + '/' + day) as daily:
            for line in daily:
                try:
                    tweet = json.loads(line)
                    data[tweet["id"]] = tweet
                except (ValueError, KeyError, TypeError):
                    #ValueError: the line is not valid JSON;
                    #KeyError/TypeError: the parsed value is not an object with an "id"
                    with open(unreadable_path, unreadable_mode) as out_file:
                        out_file.write(line)
                    unreadable_mode = 'a'
                    corrupted_counter += 1

    print('for the month ' + month + ':')
    print('loaded a total of ' + str(len(data)) + ' tweets') #print total loads for month
    print('there were ' + str(corrupted_counter) + ' unreadable tweets')
    removals['total_tweets'] += len(data)
    #NOTE(review): the key below is spelled 'corrutped_tweets'; left unchanged in case later code reads it
    removals['corrutped_tweets'] += corrupted_counter
    
#REMOVING RETWEETS
#both formal and informal posts identified by 'RT' 
    
    removed_tweets = defaultdict(dict) #To allow adding key:value pairs to non-existent keys

    #formal retweets
    data_v2 = {}
    for identifier in data.keys():
        if "retweeted_status" not in data[identifier]:
            data_v2[identifier] = data[identifier]
        else:
            removed_tweets[identifier] = data[identifier]
            removed_tweets[identifier]['reason'] = 'formal_retweet'
    
    #informal retweets    
    data_v3 = {}
    for identifier in data_v2.keys():
        tweet = data_v2[identifier]['text']
        if tweet.startswith("RT ") is True:
            removed_tweets[identifier] = data[identifier]
            removed_tweets[identifier]['reason'] = 'informal_retweet'
        else: 
            data_v3[identifier] = data_v2[identifier]

    print('removed ' + str(len(removed_tweets.keys())) + ' tweet as formal and informal re-tweets')  
    removals['removed_retweets'] += len(removed_tweets.keys()) 
    
#CHECK FOR AND REMOVAL OF DUPLICATES
#Likely reduntant due to changes to ohow long tweets get collected

    read = set()
    data_v4 = {}
    duplicates_counter = 0
    for identifier in data_v3.keys():
        if data[identifier]['truncated'] is True:
            tweet = data[identifier]['extended_tweet']['full_text']
        else:
            tweet = data[identifier]['text']

        if tweet in read:
            removed_tweets[identifier] = data_v3[identifier]
            removed_tweets[identifier]['reason'] = 'duplicate'
            duplicates_counter += 1
        else: 
            read.add(tweet)
            data_v4[identifier] = data_v3[identifier]

    print('removed ' + str(duplicates_counter) + ' tweets as duplicates')
    removals['removed_duplicates'] += duplicates_counter
    
#REMOVING FREQUENT POSTERS (ASSUMED BOTS)
#also including a 'monthly_relevant_tweets' and 'monthly_tweets' variable under 'user' for all tweets.

    #Topic-relevant tweeting volume
    users = defaultdict(int)
    for identifier in data_v4.keys():
        person = data_v4[identifier]['user']['screen_name']
        users[person] += 1 
    for identifier in data_v4.keys():
        person = data_v4[identifier]['user']['screen_name']
        data_v4[identifier]['user']['monthly_relevant_tweets'] = users[person]
    
    #General tweeting volume (using 30.44 as average days in a month)
    for identifier in data_v4.keys():
        start_time = datetime.strptime(data_v4[identifier]['user']['created_at'], '%a %b %d %X %z %Y')
        end_time = datetime.strptime(data_v4[identifier]['created_at'], '%a %b %d %X %z %Y')
        age = (end_time - start_time).total_seconds()/(24*60*60)
        data_v4[identifier]['user']['monthly_tweets'] = data_v4[identifier]['user']['statuses_count']/(age/30.44) 
    
    #actual filtering
    data_v5 = {}
    relevant_threshold = 450  #threshold with potentially important implications
    general_threshold = 1500  #threshold with potentially important implications
    relevant_counter = 0
    general_counter = 0
    for identifier in data_v4.keys():
        if data_v4[identifier]['user']['monthly_relevant_tweets'] > relevant_threshold:
            removed_tweets[identifier] = data[identifier]
            removed_tweets[identifier]['reason'] = 'exceeded relevant tweet threshold of ' + str(relevant_threshold)
            relevant_counter += 1
        else:
            if data_v4[identifier]['user']['monthly_tweets'] > general_threshold:
                removed_tweets[identifier] = data[identifier]
                removed_tweets[identifier]['reason'] = 'exceeded general tweet threshold of ' + str(general_threshold)
                general_counter += 1
            else:
                data_v5[identifier] = data_v4[identifier]
                
    print('removed ' + str(relevant_counter) + ' tweets because user tweeted more than ' 
          + str(relevant_threshold) + ' relevant tweets this month')
    print('removed ' + str(general_counter) + ' tweets because user tweets more than ' 
          + str(general_threshold) + ' tweets per month in the accounts lifetime')
    removals['general_volume'] += general_counter
    removals['relevant_volume'] += relevant_counter
    
#JOINING QUOTED TWEETS
#adding a combination of original tweet and its quote separated by '||' as 'joined_text' to root of the tweet
    
    for identifier in data_v5.keys():
        if data[identifier]['truncated'] is True:
            tweet = data[identifier]['extended_tweet']['full_text']
        else:
            tweet = data[identifier]['text']

        if "quoted_status" in data[identifier]:
            data_v5[identifier]['joined_text'] = (data[identifier]['text'] +
                                                   "||" +
                                                   data[identifier]['quoted_status']['text'])
            removals['quoted_tweets'] += 1
        
    print('Joined quoted tweets')

#ADDING A LABEL BASED ON KEYWORD GROUP THAT MADE TWEET RELEVANT
#This uses the same keywords as the listener to check which group of keywords made a given tweet relevant.

    ecec = [
        'kinderopvangtoeslag', 'kindgebonden budget', 'kinderbijslag',
        'kinderopvang', 'kinder opvang', 'kinderdagverblijf',
        'kdv', 'gastouder', 'gastouders',
        'gastouderopvang', 'gastouder opvang', 'gastouderbureau',
        'peuterspeelzalen', 'peuterspeelzaal', 'peuterspeelplaats',
        'peutergroep', 'peutergroepen', 'buitenschoolseopvang',
        'buitenschoolse opvang', 'naschoolseopvang', 'naschoolse opvang',
        'naschoolse', 'BSO', 'voorschoolse opvang',
        'voorschoolse', 'voorschoolseopvang', 'oppas',
        'oppassers', 'babysitter', 'babysitters',
        'nanny', 'nannies'
        ]

    lm_programmes = [
        'Participatiewet', 'Participatie wet', 'Gesubsidieerde arbeid',
        'opleiding', 'scholing', 'heropleiding',
        'omscholing', 'training', 'retraining',
        're-training', 'studie', 'studeren',
        'praktijktraining', 'werkervaringsplek', 'stage',
        'stage lopen', 'werkervaringsplek', 'werkervaring plek',
        'studeer en werkplek', 'studeer- en werkplek', 'traineeship',
        'Werkbedrijf', 'werk.nl', 'werkplein',
        'werkpleinen', 'arbeidsadviseur', 'uwv',
        'arbeidsbemiddelaar', 'arbeidsbemiddeling', 'loopbaan coach',
        'werk coach', 'WW-uitkering', 'uitkering',
        'bijstand', 'bijstandsuitkering', 'meewerkaftrek'
        ]

    lm_employment = [
        'full-time werk', 'full time werk', 'fulltime werk',
        'full-time baan'  'full time baan', 'fulltime baan',
        'voltijd baan', 'voltijd werk', 'voltijdwerk',
        '1 fte', '1 wtf', 'deeltijd werk',
        'part-time werk', 'part time werk', 'deeltijd baan',
        'part-time baan', 'part time baan', 'vast contract',
        'vaste baan', 'vaste aanstelling', 'tijdelijk contract',
        'tijdelijke baan', 'tijdelijke aanstelling', 'uitzendcontract',
        'nul uren contract', '0 uren contract', 'zelfstandige zonder personeel',
        'zzp', "zzp'ers", "zzp'er",
        'zzper', 'zzpers', 'DBA modelovereenkomst',
        'schijnzelfstandigheid', 'loondienst', 'in loondienst',
        'eigen baas', 'eigen baas zijn'
        ]

    lm_phrases = [
        'werkloosheid', 'werkeloosheid', 'werkloos',
        'zonder baan', 'jobless'  'in between jobs',
        'between jobs', 'in between two jobs', 'between two jobs',
        'onderbezetting', 'onderbezet', 'zoek naar werk',
        'kijken voor werk', 'een baan zoeken',
        'zoeken naar een baan', 'banen zoeken', 'passend werk',
        'passende arbeid', 'passende baan', 'passende job',
        'goed werk', 'slecht werk', 'beter werk',
        'betere kansen op werk', 'beter arbeidscontract', 'goed arbeidscontract',
        'slecht arbeidscontract', 'vacature', 'vacatures',
        'openstaande baan', 'vaardigheidseisen', 'ervaringseisen',
        'werkervaring', 'werkervaringseisen', 'competenties'
        ]
    keywords = {'ecec': ecec, 'lm_programmes': lm_programmes, 'lm_employment': lm_employment, 'lm_phrases': lm_phrases}

    #Tweets are lower-cased before matching, so the keywords are lower-cased as
    #well (once, up front); otherwise entries containing capitals, such as 'BSO'
    #or 'Participatiewet', could never match. The keyword is still recorded in
    #its original spelling.
    lowered_keywords = {
        group: [(keyword, keyword.lower()) for keyword in words]
        for group, words in keywords.items()
    }

    for record in data_v5.values():
        #pick the richest text available: joined quote text, then full text, then plain text
        if 'joined_text' in record:
            tweet_raw = record['joined_text']
        elif record['truncated'] is True:
            tweet_raw = record['extended_tweet']['full_text']
        else:
            tweet_raw = record['text']
        tweet = tweet_raw.lower()

        matched_groups = []
        matched_keywords = []
        for group, pairs in lowered_keywords.items():
            for keyword, needle in pairs:
                #plain substring test, so a keyword also matches inside longer words
                if needle not in tweet:
                    continue
                if keyword not in matched_keywords:
                    matched_keywords.append(keyword)
                if group not in matched_groups:
                    matched_groups.append(group)

        record['keyword_groups'] = matched_groups
        record['keywords'] = matched_keywords

    print('Added labels based on keywords')
        
#SAVE TWEETS

    savename = str(month) + '_preprocessed'
    with open('C:/Notebooks/Processed_2020/' + savename + '.json', 'w+') as outfile:
        json.dump(data_v5, outfile)
    print('Preprocessed tweets saved, total of ' + str(len(data_v5.keys())) + 'tweets')

#(OPTIONAL) SAVE REMOVED TWEETS AS WELL

#    savename2 = str(month) + '_removed'
#    with open('D:/Notebooks/Twitter_Preprocessed/Removed/' + savename2 + '.json', 'w+') as outfile:
#        json.dump(removed_tweets, outfile)
#    print('Removed tweets saved')
#    print('\n')
