# Downloading data from Twitter

Note that the functions used for preprocessing and downloading are imported from our script 'twitter_helpers'.

In [1]:
import pandas as pd
import pickle
import twitter_helpers as th

# set local working directory
# import os
# os.chdir('/Users/patrickschulze/Desktop/Consulting/Bundestag-MP-Analyse/')

## 1. Data Import

In [2]:
# import Bundestag data
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'abg_df.pickle' must only ever be a file from a trusted source.
with open('abg_df.pickle', 'rb') as handle:
    bt_data = pickle.load(handle)


In [3]:
bt_data

Unnamed: 0,Name,Partei,Wahlart,Bundesland,Wahlkreis,Ausschuesse,Soziale Medien,Biografie,Twitter
0,"Abercron, Dr. Michael von",CDU/CSU,Direkt gewählt,Schleswig-Holstein,Wahlkreis 007: Pinneberg,{'Ordentliches Mitglied': ['Ausschuss für Ernä...,{'von-abercron.de/': 'http://www.von-abercron....,Geboren am 17. November 1952 in Ehlers...,mvabercron
1,"Achelwilm, Doris",Die Linke,Gewählt über Landesliste,Bremen,n.a.,{'Ordentliches Mitglied': ['Ausschuss für Fami...,{'doris-achelwilm.de': 'http://www.doris-achel...,Geboren am 30. November 1976 in Thuine...,DorisAchelwilm
2,"Aggelidis, Grigorios",FDP,Gewählt über Landesliste,Niedersachsen,Wahlkreis 043: Hannover-Land I,{'Ordentliches Mitglied': ['Kuratorium der Bun...,{'grigorios-aggelidis.de': 'http://www.grigori...,Geboren am 19. August 1965 in Hannover...,aggelidis_fdp
3,"Akbulut, Gökay",Die Linke,Gewählt über Landesliste,Baden-Württemberg,Wahlkreis 275: Mannheim,"{'Ordentliches Mitglied': ['Schriftführer/in',...",{'goekay-akbulut.de': 'https://goekay-akbulut....,Geboren 1982 in Pinarbasi/ Türkei; ledig.Juni ...,akbulutgokay
4,"Albani, Stephan",CDU/CSU,Gewählt über Landesliste,Niedersachsen,Wahlkreis 027: Oldenburg – Ammerland,{'Ordentliches Mitglied': ['Ausschuss für Bild...,{'stephan-albani.de': 'http://www.stephan-alba...,Geboren am 3. Juni 1968 in Göttingen; verheira...,
...,...,...,...,...,...,...,...,...,...
725,"Zierke, Stefan",SPD,Gewählt über Landesliste,Brandenburg,Wahlkreis 057: Uckermark – Barnim I,{'Parlamentarischer Staatssekretär bei der Bun...,{'stefan-zierke.de': 'http://www.stefan-zierke...,Geboren am 5. Dezember 1970 in Prenzlau (Brand...,zierke
726,"Zimmer, Prof. Dr. Matthias",CDU/CSU,Direkt gewählt,Hessen,Wahlkreis 182: Frankfurt am Main I,{'Obmann': ['Ausschuss für Arbeit und Soziales...,{'matthias-zimmer.de': 'http://www.matthias-zi...,Geboren am 3. Mai 1961 in Marburg/Lahn; verhei...,matthiaszimmer
727,"Zimmermann, Dr. Jens",SPD,Gewählt über Landesliste,Hessen,Wahlkreis 187: Odenwald,"{'Obmann': ['Ausschuss Digitale Agenda'], 'Ord...",{'jens-zimmermann.org': 'http://www.jens-zimme...,Geboren am 9. September 1981 in Groß-U...,JensZimmermann1
728,"Zimmermann, Pia",Die Linke,Gewählt über Landesliste,Niedersachsen,Wahlkreis 051: Helmstedt – Wolfsburg,{'Ordentliches Mitglied': ['Ausschuss für Gesu...,{'pia-zimmermann.de': 'http://www.pia-zimmerma...,Geboren am 17. September 1956 in Braunschweig;...,


In [4]:
# select name and username for each member and store in table called twitter_account
# (columns: 'name' = member name, 'username' = Twitter handle).
# Series.rename is used WITHOUT inplace=True: it returns renamed Series and
# leaves the Series objects taken from bt_data unmodified, so re-running this
# cell (or later cells that read bt_data) is not affected by hidden state.
names = bt_data['Name'].rename("name")
twitter_usernames = bt_data['Twitter'].rename("username")
twitter_account = pd.concat([names, twitter_usernames], axis = 1)

In [5]:
twitter_account

Unnamed: 0,name,username
0,"Abercron, Dr. Michael von",mvabercron
1,"Achelwilm, Doris",DorisAchelwilm
2,"Aggelidis, Grigorios",aggelidis_fdp
3,"Akbulut, Gökay",akbulutgokay
4,"Albani, Stephan",
...,...,...
725,"Zierke, Stefan",zierke
726,"Zimmer, Prof. Dr. Matthias",matthiaszimmer
727,"Zimmermann, Dr. Jens",JensZimmermann1
728,"Zimmermann, Pia",


In [36]:
# drop usernames that are nan or empty, in order to avoid error message when downloading
# Surrounding whitespace is stripped first: the download log further below
# shows at least one handle with a trailing space, and a whitespace-only
# handle would otherwise slip through the empty-string check.
usernames_stripped = twitter_account.username.str.strip()
usr_nan = usernames_stripped.isna()
usr_empty = usernames_stripped == ''
# store the cleaned handles and keep only rows with a usable username
twitter_account = twitter_account.assign(username = usernames_stripped)[~(usr_nan | usr_empty)]

In [37]:
twitter_account

Unnamed: 0,name,username
0,"Abercron, Dr. Michael von",mvabercron
1,"Achelwilm, Doris",DorisAchelwilm
2,"Aggelidis, Grigorios",aggelidis_fdp
3,"Akbulut, Gökay",akbulutgokay
5,"Alt, Renata",RenataAlt_MdB
...,...,...
720,"Zdebel, Hubertus",ZdebelHubertus
724,"Ziemiak, Paul",PaulZiemiak
725,"Zierke, Stefan",zierke
726,"Zimmer, Prof. Dr. Matthias",matthiaszimmer


## 2. Download with GetOldTweets3

GetOldTweets3 is an unofficial Python module that can be used to scrape tweets and other information from Twitter. While the official Twitter API (accessed here via the Python library Tweepy) has a limit of 3200 tweets per user, GetOldTweets3 makes it possible to download an unlimited number of tweets for a given user.

In [38]:
# download tweets using GetOldTweets3 for specified time period
# (only the first 40 accounts). The per-user results are collected in a list
# and concatenated once, instead of growing a DataFrame inside the loop.
got3_frames = [
    th.download_tweets_got3(username, since = "2020-04-05", until = "2020-04-08")
    for username in twitter_account['username'].iloc[0:40]
]
# NOTE(review): whether the helper can return None is not visible here;
# pd.concat ignores None entries, so they are filtered explicitly to keep
# the empty-frame fallback below working in that case as well.
got3_frames = [frame for frame in got3_frames if frame is not None]
# pd.concat raises on an empty list -> fall back to an empty DataFrame
res_got3 = pd.concat(got3_frames) if got3_frames else pd.DataFrame()

Downloading for mvabercron
Downloading for DorisAchelwilm
Downloading for aggelidis_fdp
Downloading for akbulutgokay
Downloading for RenataAlt_MdB
Downloading for peteraltmaier
Downloading for NielsAnnen
Downloading for cad59
Downloading for PeterAumer
Downloading for bela_bach
Downloading for badulrichmartha
Downloading for DoroBaer
Downloading for ABaerbock
Downloading for MdB_Ulrike_Bahr
Downloading for Thomas_Bareiss
Downloading for SBarrientosK
Downloading for matthiasbartke 
Downloading for soerenbartol
Downloading for DietmarBartsch
Downloading for baerbelbas
Downloading for berndbaumannafd
Downloading for MargareteBause
Downloading for DerDanyal
Downloading for LieblingXhain
Downloading for jensbeeck
Downloading for MaikBeermann
Downloading for Manfredbehrens
Downloading for SybilleBenning
Downloading for MarcBernhardAfD
Downloading for c_bernstiel
Downloading for lgbeutin
Downloading for Peter_Beyer
Downloading for MBiadaczMdB
Downloading for steffenbilger
Downloading for loth

In [39]:
# add 'name' column (download only uses 'username' as input)
# A 'name' column that is already present (i.e. this cell was run before) is
# dropped first; otherwise a second run would merge again and produce
# 'name_x' / 'name_y' columns.
res_got3 = twitter_account.merge(
    res_got3.drop(columns = 'name', errors = 'ignore'),
    on = 'username'
)
# display results
res_got3

Unnamed: 0,name,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls
0,"Achelwilm, Doris",DorisAchelwilm,,#Weltgesundheitstag 2020: Gesundheit ist kein ...,8,42,4,1247472932466688000,https://twitter.com/DorisAchelwilm/status/1247...,4819478705,2020-04-07 10:34:58+00:00,Tue Apr 07 10:34:58 +0000 2020,#Weltgesundheitstag,,,
1,"Achelwilm, Doris",DorisAchelwilm,ndaktuell,Queerpolitik hat unter #Corona keinen leichten...,15,30,4,1247448540533710848,https://twitter.com/DorisAchelwilm/status/1247...,4819478705,2020-04-07 08:58:03+00:00,Tue Apr 07 08:58:03 +0000 2020,#Corona #Hatespeech #LGBT #Trump #Orban,,,https://twitter.com/ndaktuell/status/124723009...
2,"Achelwilm, Doris",DorisAchelwilm,salzzitronen,Dieser Tarifabschluss wird aus Pflegeversicher...,0,0,1,1247210632635703298,https://twitter.com/DorisAchelwilm/status/1247...,4819478705,2020-04-06 17:12:41+00:00,Mon Apr 06 17:12:41 +0000 2020,,,,
3,"Achelwilm, Doris",DorisAchelwilm,redheadhb2,"Gut, dass Du es sagst. Nach der langen Ausbild...",0,2,0,1247207997664878592,https://twitter.com/DorisAchelwilm/status/1247...,4819478705,2020-04-06 17:02:13+00:00,Mon Apr 06 17:02:13 +0000 2020,,,,
4,"Achelwilm, Doris",DorisAchelwilm,,Geht doch: #Altenpflegekräfte bekommen im Juli...,4,21,4,1247203823438958594,https://twitter.com/DorisAchelwilm/status/1247...,4819478705,2020-04-06 16:45:38+00:00,Mon Apr 06 16:45:38 +0000 2020,#Altenpflegekr #Krankenhauspersonal #Systemrel...,,,https://www.verdi.de/presse/pressemitteilungen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,"Boehringer, Peter",PeterBoehringer,,Selbstmord aus Angst vor dem CO2-Tod: EUliten ...,6,14,3,1247064072740356096,https://twitter.com/PeterBoehringer/status/124...,99268600,2020-04-06 07:30:18+00:00,Mon Apr 06 07:30:18 +0000 2020,#co2 #klimafl #AfD #pboehringer,,,https://youtu.be/h_cSmlQZOgc
85,"Brandenburg (Rhein-Neckar), Dr. Jens",JBrandenburgFDP,,"Wer nach dieser Pressekonferenz glaubt, die Bu...",1,8,0,1247182620980662273,https://twitter.com/JBrandenburgFDP/status/124...,857906951707127808,2020-04-06 15:21:23+00:00,Mon Apr 06 15:21:23 +0000 2020,,,,
86,"Brandenburg (Rhein-Neckar), Dr. Jens",JBrandenburgFDP,StephanManfred1,Leider nein,0,1,2,1246782600938741766,https://twitter.com/JBrandenburgFDP/status/124...,857906951707127808,2020-04-05 12:51:50+00:00,Sun Apr 05 12:51:50 +0000 2020,,,,
87,"Brandenburg (Südpfalz), Mario",BraFDP,,Heute hat sich der Allgemeine Studierendenauss...,1,3,1,1247539165345886208,https://twitter.com/BraFDP/status/124753916534...,802968497831739392,2020-04-07 14:58:09+00:00,Tue Apr 07 14:58:09 +0000 2020,#Asta,,,https://www.asta-landau.de/wp-content/uploads/...


We can check that it is indeed possible to download more than 3200 tweets per user:

In [40]:
# download a long time window for a single account to test the tweet limit
res = th.download_tweets_got3(
    'realDonaldTrump',
    since = "2018-09-24",
    until = "2020-04-08",
)

Downloading for realDonaldTrump


In [41]:
res.shape

(6737, 15)

However, although this occurs very rarely, some tweets appear to be missing (and some rows are empty). Furthermore, retweets cannot be downloaded using GetOldTweets3.

## 3. Download with Tweepy

With Tweepy we can circumvent these shortcomings, i.e. we can download retweets and no information is missing, as Tweepy accesses the official Twitter API. However, as mentioned, there is a limit of 3200 tweets per user.

In [42]:
# download most recent tweets using tweepy (at most 3200 tweets per user)
# (only the first 3 accounts). The per-user results are collected in a list
# and concatenated once, instead of growing a DataFrame inside the loop.
tweepy_frames = [
    th.download_tweets_tweepy(username)
    for username in twitter_account['username'].iloc[0:3]
]
# NOTE(review): whether the helper can return None is not visible here;
# pd.concat ignores None entries, so they are filtered explicitly to keep
# the empty-frame fallback below working in that case as well.
tweepy_frames = [frame for frame in tweepy_frames if frame is not None]
# pd.concat raises on an empty list -> fall back to an empty DataFrame
res_tweepy = pd.concat(tweepy_frames) if tweepy_frames else pd.DataFrame()
# again, add column 'name'
res_tweepy = twitter_account.merge(res_tweepy, on = 'username')

Downloading for mvabercron
Downloading for DorisAchelwilm
Downloading for aggelidis_fdp


In [43]:
res_tweepy.columns

Index(['name', 'username', '_api', '_json', 'created_at', 'id', 'id_str',
       'full_text', 'truncated', 'display_text_range', 'entities',
       'extended_entities', 'source', 'source_url', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'author', 'user',
       'geo', 'coordinates', 'place', 'contributors', 'is_quote_status',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted',
       'possibly_sensitive', 'lang', 'retweeted_status', 'quoted_status_id',
       'quoted_status_id_str', 'quoted_status_permalink', 'quoted_status',
       'withheld_in_countries'],
      dtype='object')

Columns that might be most important for us:

In [46]:
# Select the relevant columns by name instead of by position: positional
# indices silently point at other columns as soon as the set or order of
# downloaded columns changes.
important_columns = ['name', 'username', 'created_at', 'full_text', 'display_text_range',
                     'entities', 'in_reply_to_user_id_str', 'author', 'is_quote_status',
                     'retweet_count', 'favorite_count', 'favorited', 'retweeted',
                     'retweeted_status']
res_tweepy[important_columns].columns

Index(['name', 'username', 'created_at', 'full_text', 'display_text_range',
       'entities', 'in_reply_to_user_id_str', 'author', 'is_quote_status',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted',
       'retweeted_status'],
      dtype='object')

In [47]:
# Show the relevant columns, selected by name instead of by position:
# positional indices silently point at other columns as soon as the set or
# order of downloaded columns changes.
important_columns = ['name', 'username', 'created_at', 'full_text', 'display_text_range',
                     'entities', 'in_reply_to_user_id_str', 'author', 'is_quote_status',
                     'retweet_count', 'favorite_count', 'favorited', 'retweeted',
                     'retweeted_status']
res_tweepy[important_columns]

Unnamed: 0,name,username,created_at,full_text,display_text_range,entities,in_reply_to_user_id_str,author,is_quote_status,retweet_count,favorite_count,favorited,retweeted,retweeted_status
0,"Abercron, Dr. Michael von",mvabercron,2020-04-10 14:10:03,Dieser Karfreitag wird anders sein als in den ...,"[0, 204]","{'hashtags': [], 'symbols': [], 'user_mentions...",,User(_api=<tweepy.api.API object at 0x11355915...,False,1,3,False,False,
1,"Abercron, Dr. Michael von",mvabercron,2020-04-02 12:29:38,"Uni fällt aus? Keine Angst, eine Pause im Lehr...","[0, 275]","{'hashtags': [{'text': 'BAföG', 'indices': [82...",,User(_api=<tweepy.api.API object at 0x11355915...,False,0,0,False,False,
2,"Abercron, Dr. Michael von",mvabercron,2020-04-01 19:21:32,Alle Unternehmen können vom #Corona-Sonderprog...,"[0, 199]","{'hashtags': [{'text': 'Corona', 'indices': [2...",,User(_api=<tweepy.api.API object at 0x11355915...,False,2,1,False,False,
3,"Abercron, Dr. Michael von",mvabercron,2020-03-31 10:59:46,RT @cducsubt: .@gitta_connemann und @mvabercro...,"[0, 140]","{'hashtags': [{'text': 'Corona', 'indices': [5...",,User(_api=<tweepy.api.API object at 0x11355915...,False,4,0,False,False,Status(_api=<tweepy.api.API object at 0x113559...
4,"Abercron, Dr. Michael von",mvabercron,2020-03-31 10:59:37,Wir kämpfen um jeden Job – durch Ausweitung d....,"[0, 280]","{'hashtags': [{'text': 'wirhandeln', 'indices'...",,User(_api=<tweepy.api.API object at 0x11355915...,False,0,0,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3880,"Aggelidis, Grigorios",aggelidis_fdp,2018-04-20 10:52:34,@BAEKaktuell möchte durch eine zentrale Liste ...,"[0, 265]",{'hashtags': [{'text': 'Schwangerschaftsabruch...,243143022,User(_api=<tweepy.api.API object at 0x11355915...,False,0,1,False,False,
3881,"Aggelidis, Grigorios",aggelidis_fdp,2018-04-19 12:54:16,Erste Studie zeigt #baukindergeld entfalten ka...,"[0, 212]","{'hashtags': [{'text': 'baukindergeld', 'indic...",,User(_api=<tweepy.api.API object at 0x11355915...,False,3,9,False,False,
3882,"Aggelidis, Grigorios",aggelidis_fdp,2018-04-18 14:53:22,Der Vorschlag von @hubertus_heil läuft an Lebe...,"[0, 277]","{'hashtags': [{'text': 'Teilzeit', 'indices': ...",,User(_api=<tweepy.api.API object at 0x11355915...,False,0,2,False,False,
3883,"Aggelidis, Grigorios",aggelidis_fdp,2018-04-15 15:03:07,Fast noch wichtiger als die Reden sind die mot...,"[0, 185]","{'hashtags': [{'text': 'fdplptnds', 'indices':...",,User(_api=<tweepy.api.API object at 0x11355915...,False,1,2,False,False,


If 'in_reply_to_user_id_str' is not empty (None/NaN), the tweet is a reply to another tweet. If 'is_quote_status' is True, the tweet is a quote tweet (see Twitter's definition of quote tweets if unfamiliar).

### Author and User

In [48]:
res_tweepy.columns.get_loc('author')

19

In [49]:
res_tweepy.columns.get_loc('user')

20

In [50]:
# 'author' object of the 4th downloaded tweet (column addressed by name, not position)
res_tweepy['author'].iloc[3]

User(_api=<tweepy.api.API object at 0x113559150>, _json={'id': 862747349277450240, 'id_str': '862747349277450240', 'name': 'Dr. Michael von Abercron MdB', 'screen_name': 'mvabercron', 'location': 'Pinneberg, Deutschland', 'description': 'Direkt gewählter Bundestagsabgeordneter aus dem Wahlkreis Pinneberg | Es schreiben Michael von Abercron und sein Team', 'url': 'https://t.co/5Qqm51N9U8', 'entities': {'url': {'urls': [{'url': 'https://t.co/5Qqm51N9U8', 'expanded_url': 'http://www.von-abercron.de', 'display_url': 'von-abercron.de', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 750, 'friends_count': 674, 'listed_count': 46, 'created_at': 'Thu May 11 19:12:51 +0000 2017', 'favourites_count': 162, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': True, 'statuses_count': 187, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_

In [51]:
# 'user' object of the 4th downloaded tweet (column addressed by name, not position)
res_tweepy['user'].iloc[3]

User(_api=<tweepy.api.API object at 0x113559150>, _json={'id': 862747349277450240, 'id_str': '862747349277450240', 'name': 'Dr. Michael von Abercron MdB', 'screen_name': 'mvabercron', 'location': 'Pinneberg, Deutschland', 'description': 'Direkt gewählter Bundestagsabgeordneter aus dem Wahlkreis Pinneberg | Es schreiben Michael von Abercron und sein Team', 'url': 'https://t.co/5Qqm51N9U8', 'entities': {'url': {'urls': [{'url': 'https://t.co/5Qqm51N9U8', 'expanded_url': 'http://www.von-abercron.de', 'display_url': 'von-abercron.de', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 750, 'friends_count': 674, 'listed_count': 46, 'created_at': 'Thu May 11 19:12:51 +0000 2017', 'favourites_count': 162, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': True, 'statuses_count': 187, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '000000', 'profile_

We see that 'author' and 'user' are essentially the same. According to Stack Overflow, 'user' is deprecated, so we should use 'author' (if needed).

The following fields of the 'author' object might be interesting for us:

In [52]:
# profile description of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].description

'Direkt gewählter Bundestagsabgeordneter aus dem Wahlkreis Pinneberg | Es schreiben Michael von Abercron und sein Team'

In [53]:
# profile location of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].location

'Pinneberg, Deutschland'

The number of followers this account currently has:

In [54]:
# 'followers_count' of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].followers_count

750

The number of users this account is following (AKA their “followings”):

In [55]:
# 'friends_count' of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].friends_count

674

The number of public lists that this user is a member of:

In [56]:
# 'listed_count' of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].listed_count

46

The number of Tweets this user has liked in the account’s lifetime:

In [57]:
# 'favourites_count' of the author of the 4th downloaded tweet
res_tweepy['author'].iloc[3].favourites_count

162

### Retweets

The column 'retweeted_status' shows whether a tweet is a retweet or a new tweet. The text of a retweet is truncated to 140 characters:

In [61]:
res_tweepy.columns.get_loc('retweeted_status')

32

In [58]:
res_tweepy[['display_text_range','retweeted_status']]

Unnamed: 0,display_text_range,retweeted_status
0,"[0, 204]",
1,"[0, 275]",
2,"[0, 199]",
3,"[0, 140]",Status(_api=<tweepy.api.API object at 0x113559...
4,"[0, 280]",
...,...,...
3880,"[0, 265]",
3881,"[0, 212]",
3882,"[0, 277]",
3883,"[0, 185]",


We can retrieve the full text of a retweet by accessing the attribute 'full_text' of the tweepy object in column 'retweeted_status' (if the tweet is a retweet, in which case the value in this column is not NaN):

In [60]:
# full text of the original tweet behind the 4th downloaded tweet (a retweet)
res_tweepy['retweeted_status'].iloc[3].full_text

'.@gitta_connemann und @mvabercron erklären: #Corona-Soforthilfen auch für Höfe, Forstbetriebe und landwirtschaftlichen Gartenbau  https://t.co/CRADyyuX4D'

### Additional Information - 'entities'

For each tweet, column 'entities' contains a dict with additional information, such as hashtags or users and urls that are mentioned in the tweet.

In [62]:
res_tweepy.columns.get_loc('entities')

10

In [63]:
# value of 'entities' for the 4th downloaded tweet
# (column addressed by name, not position)
res_tweepy['entities'].iloc[3]

{'hashtags': [{'text': 'Corona', 'indices': [58, 65]}],
 'symbols': [],
 'user_mentions': [{'screen_name': 'cducsubt',
   'name': 'CDU/CSU',
   'id': 46085533,
   'id_str': '46085533',
   'indices': [3, 12]},
  {'screen_name': 'gitta_connemann',
   'name': 'Gitta Connemann',
   'id': 1125751445205262336,
   'id_str': '1125751445205262336',
   'indices': [15, 31]},
  {'screen_name': 'mvabercron',
   'name': 'Dr. Michael von Abercron MdB',
   'id': 862747349277450240,
   'id_str': '862747349277450240',
   'indices': [36, 47]}],
 'urls': []}

In [65]:
# Access first hashtag of this tweet (in this case the only hashtag)
# (column addressed by name, not position)
res_tweepy['entities'].iloc[3]['hashtags'][0]['text']

'Corona'

In [67]:
# obtain display name of second user that is mentioned
# ('name' holds the display name; the username/handle is stored under 'screen_name')
res_tweepy['entities'].iloc[3]['user_mentions'][1]['name']

'Gitta Connemann'