# Identifying removed users

The following code accesses Twitter's API after the data collection period to see which accounts are no longer retrievable. Some studies in the literature use this as a way of establishing ground truth for bot removal (assuming that the accounts that were removed were removed because they are bot accounts). 

In [None]:
#This loads tweets and generates a list of users
#IMPORTS
from os import listdir
import json
from collections import defaultdict
from datetime import datetime
import pickle

#LOADING FROM RAW
#Path to the directory with the raw tweets. Every file in it is read line by line,
#one JSON tweet per line, and the second '-'-separated field of the file name is
#compared against the month.
path = ""
#File that collects every line that could not be read as a tweet
unreadable_path = "D:/Notebooks/Twitter_Preprocessed/Unreadable.txt"
users = []
#Set mirror of `users`: keeps the uniqueness check O(1) while `users` stays an ordered list
seen_users = set()

for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']: #controls for month 
    data = {} #tweet id -> tweet, rebuilt for every month
    corrupted_counter = 0
    for day in listdir(path):
        name_parts = day.split('-')
        #Skip files of another month, and stray files without a '-' in their name
        #(those used to abort the whole run with an IndexError)
        if len(name_parts) < 2 or name_parts[1] != month:
            continue
        with open(path + '/' + day) as daily:
            for line in daily:
                try:
                    tweet = json.loads(line)
                    data[tweet["id"]] = tweet
                except (ValueError, KeyError, TypeError):
                    #Malformed JSON (ValueError) or a record that is not a tweet with an "id".
                    #Opened in append mode: mode 'w' truncated the file on every failure,
                    #so only the very last unreadable line was kept. Delete the file
                    #between runs if leftovers from earlier runs are unwanted.
                    with open(unreadable_path, 'a') as out_file:
                        out_file.write(line)
                    corrupted_counter += 1

    print('for the month ' + month + ':')
    print('loaded a total of ' + str(len(data)) + ' tweets') #print total loads for month
    print('there were ' + str(corrupted_counter) + ' unreadable tweets')

    #Generating user list
    counter = 0
    for tweet in data.values():
        user_id = tweet['user']['id']
        if user_id not in seen_users:
            seen_users.add(user_id)
            users.append(user_id)
            counter += 1
    print('Added ' + str(counter) + ' unique users for the month')


In [None]:
#This connects to the API and attempts to access the information for each account
#IMPORTS
import tweepy

tweepy.debug(True)

#AUTHENTICATION
# Configured for your twitter account
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

#SETTING API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) #handling of rate limit errors already automatic

#Ids (as strings) of the users that the API no longer returns
removed = []

#checking if a user still exists in batches of a hundred users per call
users = [str(user) for user in users]
users_split = [users[i:i+100] for i in range(0, len(users), 100)]
progress_counter = 0
for user_group in users_split:
    try:
        #Set of the id strings that came back, for O(1) membership tests below
        retrieved = {user.id_str for user in api.lookup_users(user_ids=user_group)}
    except tweepy.TweepError as error:
        #Twitter's users/lookup endpoint answers with an error (API code 17,
        #"No user matches for specified terms") instead of an empty list when
        #not a single id of the batch can be resolved. That means the whole
        #batch is gone; any other error is a real failure and is re-raised.
        if error.api_code != 17:
            raise
        retrieved = set()

    missing = [user for user in user_group if user not in retrieved]
    removed.extend(missing)
    #print('Added ' + str(len(missing)) + ' missing user ids')

    progress_counter += 1
    if progress_counter % 10 == 0:
        #The last batch can be shorter than 100, so cap the count at the total
        print ('Processed '+str(min(progress_counter*100, len(users)))+str('/')+str(len(users))+ ' users')


## Looking through removed users' tweets

The following code is a basic exploration of the content of tweets from users whose accounts were not retrievable after the data collection period. Together with manual exploration of representative tweets and accounts, it served as a test of the hypothesis that these accounts are predominantly bots and that this is why they were removed from Twitter.

In [None]:
#Loading only tweets from removed users from raw tweets
#IMPORTS
from os import listdir
import json
from collections import defaultdict

#Path to raw tweets
path = ""
#user id string -> {tweet id -> tweet}, only for users that could not be retrieved
bot_data = defaultdict(dict)

#The API lookup stores the unretrievable user ids (as strings) in `removed`.
#This cell used to test against an undefined `removed_users`; the NameError was
#swallowed by a bare `except: pass`, so no tweet was ever collected.
#A set makes the per-tweet membership test O(1).
removed_users = set(removed)

for month in ['08', '09', '10', '11', '12', '01', '02', '03', '04', '05', '06', '07']: #controls for month 
    for day in listdir(path):
        name_parts = day.split('-')
        #Skip files of another month, and stray files without a '-' in their name
        if len(name_parts) < 2 or name_parts[1] != month:
            continue
        with open(path + '/' + day) as daily:
            for line in daily:
                try:
                    tweet = json.loads(line)
                    user_id = tweet['user']['id_str']
                    tweet_id = tweet["id"]
                except (ValueError, KeyError, TypeError):
                    #Unreadable line or a record that is not a tweet: skip it,
                    #but let genuine programming errors surface
                    continue
                if user_id in removed_users:
                    bot_data[user_id][tweet_id] = tweet
    print('Finished for month ' + str(month))

#This path is where the bot data json will be saved
#(json.dump writes the integer tweet ids as string keys)
with open('path', 'w+') as outfile:
    json.dump(bot_data, outfile)

    

In [None]:
#This lemmatizes the tweets (this is necessary as not all these tweets made it through pre-processing)
#This is once again running the Frog natural language processing suite, so it might require a virtual machine
#IMPORTS
import json
import frog
from collections import defaultdict

#This path is where the bot data json is saved
path = 'path'
with open(path, 'r') as infile:
    data = json.load(infile)

#Bound to its own name so that the imported `frog` module is not shadowed by the instance
frog_nlp = frog.Frog(frog.FrogOptions(parser=True, ner=True))
counter = 0

for user_tweets in data.values():
    for tweet in user_tweets.values():
        #Truncated tweets carry their complete text under 'extended_tweet'
        if tweet['truncated'] is True:
            tweet_raw = tweet['extended_tweet']['full_text']
        else:
            tweet_raw = tweet['text']

        tweet_proc = frog_nlp.process(tweet_raw)

        #Store the lemmas, the surface tokens and the complete Frog output on the tweet itself
        tweet['lemmatized'] = [token['lemma'] for token in tweet_proc]
        tweet['tokenized'] = [token['text'] for token in tweet_proc]
        tweet['full_frog'] = tweet_proc

        counter += 1
        if counter % 10000 == 0:
            print('processed ' + str(counter) + ' tweets')

#This path is where the lemmatized bot data will be saved
with open('path', 'w') as outfile:
    json.dump(data, outfile)

#The following cells read bot_data[...]['full_frog'], which only exists on the
#lemmatized copy, so make that copy the current bot_data
bot_data = data




In [None]:
#Plotting user tweeting frequency
#IMPORTS
from datetime import datetime
import numpy as np
from matplotlib import pyplot as plt

#Potential loading of the data
#with open('C:/Notebooks/Bots_lemmatized.json', 'r') as infile:
#    bot_data = json.load(infile)

#Layout of Twitter's created_at fields, e.g. 'Wed Aug 27 13:08:45 +0000 2008'.
#%H:%M:%S is spelled out because %X depends on the active locale.
TWITTER_TIME_FORMAT = '%a %b %d %H:%M:%S %z %Y'

#user -> {'relevant': number of collected tweets,
#         'general': statuses_count / account age in days at the user's most recent collected tweet}
volumes = {}
for user, user_tweets in bot_data.items():
    if not user_tweets:
        continue
    #Pick the most recent tweet by timestamp instead of relying on the order in
    #which the tweets happened to be inserted
    last_tweet = max(user_tweets.values(),
                     key=lambda tweet: datetime.strptime(tweet['created_at'], TWITTER_TIME_FORMAT))
    start_time = datetime.strptime(last_tweet['user']['created_at'], TWITTER_TIME_FORMAT)
    end_time = datetime.strptime(last_tweet['created_at'], TWITTER_TIME_FORMAT)
    age = (end_time - start_time).total_seconds()/(24*60*60)
    if age <= 0:
        #Tweet timestamp not later than the account creation: no rate can be
        #computed (this used to raise ZeroDivisionError), so leave the user out
        continue
    volumes[user] = {'relevant': len(user_tweets),
                     'general': last_tweet['user']['statuses_count']/age}

to_plot = np.array([volume['general'] for volume in volumes.values()])

#Histogram of tweets per day in bins of 5; values of 1000 and above fall outside the bins
bins = np.arange(0, 1000, step = 5)
plt.figure(figsize=(20,10))
plt.hist(to_plot, bins=bins, alpha=0.5, )
plt.show()

#It might also be useful to simply print out the values from largest to lowest to get a better idea of the upper extremes
#This is not appropriately visible in the 'tail' of the graph
print(sorted(to_plot, reverse=True))

In [None]:
#Topic modelling the content of tweets from removed users
#IMPORTS
from gensim import corpora
import json
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import LdaModel
import logging
from nltk.corpus import stopwords

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

stops = set(stopwords.words('dutch'))

#Get tweet list from overall data
#One list of lemmas per original (non-retweet) tweet, without punctuation and stop words
tweets = []
for user_tweets in bot_data.values():
    for status in user_tweets.values():
        if "retweeted_status" in status:
            continue
        tweets.append([
            token['lemma']
            for token in status['full_frog']
            if token['dep'] != 'punct' and token['lemma'] not in stops
        ])

#Get corpus from tweets
dictionary = corpora.Dictionary(tweets)
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

#Training model and saving visualisation
#Include the number of topics to model for in the list
numbers = [20, 30, 40, 50]
for number in numbers:
    #One model and one HTML visualisation per requested number of topics
    model = LdaModel(
        corpus,
        num_topics=number,
        id2word=dictionary,
        alpha='auto',
        eta='auto',
        passes=6,
        iterations=100000000,
        gamma_threshold=0.001,
        chunksize=2000,
    )
    html_file = "bots_topics_"+str(number)+"_cleaned"+".html"
    pyLDAvis.save_html(pyLDAvis.gensim.prepare(model, corpus, dictionary), html_file)