# Notebook: Create Twitter Dataset

This notebook is used to crawl tweets mentioning the 89 relevant Twitter accounts of German politicians and political parties.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [1]:
import calendar
import os
import random
import re
from datetime import datetime, timedelta
from time import localtime, strftime

import nest_asyncio
import numpy as np
import pandas as pd
import twint

## Parameters

In [2]:
# Root folder that receives one sub-directory of raw CSV files per party
RAW_DATASET_PATH = "../Datasets/raw_dataset/"

# Seed handed to every random number generator used in this notebook
SEED_VALUE = 0

# Number of distinct random days that are crawled per month for each account
N_DAYS_PER_ACCOUNT = 2

# Names of the party sub-directories created inside RAW_DATASET_PATH
PARTIES = [
    "CDU_CSU",
    "FDP",
    "AFD",
    "LINKE",
    "SPD",
    "GRUENE",
]

## Code

### 1. Get Reproducible Results

In [3]:
# Seed both random number generators so that the sampled days are reproducible
np.random.seed(SEED_VALUE)
random.seed(SEED_VALUE)

# Export the hash seed as well.
# NOTE(review): this is set after the interpreter has started, so it presumably
# only affects Python processes launched from this kernel - confirm if needed.
os.environ.update(PYTHONHASHSEED=str(SEED_VALUE))

### 2. Function to get a random day in one of the 12 months in 2021 (as well as the next day)

Twint expects a date in the format of `%Y-%m-%d`

In [4]:
def get_random_day_and_next(month: int, year: int = 2021) -> tuple:
    """Pick a random day of a month and return it together with the following day.

    Args:
        month: Calendar month as an integer from 1 (January) to 12 (December).
        year: Calendar year to sample from. Defaults to 2021, the year that
            was previously hard-coded.

    Returns:
        A tuple ``(day, next_day)`` of two strings in ``%Y-%m-%d`` format.
        ``next_day`` can fall into the next month (or year) when the last day
        of the month is drawn.

    Raises:
        ValueError: If ``month`` is not between 1 and 12.
    """
    # Validate the month input
    if not 1 <= month <= 12:
        raise ValueError("Month must be an integer between 1 and 12")

    # monthrange returns (weekday of the first day, number of days in the month)
    num_days = calendar.monthrange(year, month)[1]

    # randint is inclusive on both ends and draws from the global `random` state,
    # so the result is reproducible after random.seed(...)
    day = random.randint(1, num_days)

    # Build the date directly instead of formatting and re-parsing a string
    date = datetime(year, month, day)
    next_day = date + timedelta(days=1)

    date_format = "%Y-%m-%d"
    return (date.strftime(date_format), next_day.strftime(date_format))

Test the function

In [5]:
# Smoke test: draw a random day in April together with the day after it
random_day, following_day = get_random_day_and_next(4)

print(f"Random day: {random_day}", f"Following day: {following_day}", sep="\n")

Random day: 2021-04-28
Following day: 2021-04-29


### 3. Function to retrieve all tweets with a specific @-mention and date

In [6]:
def get_tweets_for_specific_day(query: str, since: str, until: str):
    """Run a twint search for `query` between two dates and return the result.

    Args:
        query: Search term handed to twint (the callers pass an @-mention).
        since: Start date of the search as a string.
        until: End date of the search as a string.

    Returns:
        Whatever twint exposes as ``twint.storage.panda.Tweets_df`` once the
        search has run (presumably a pandas DataFrame, since the Pandas option
        is enabled - verify against the installed twint version).
    """
    # Patch asyncio first so twint's search can run inside the notebook's event loop
    nest_asyncio.apply()

    search_settings = {
        "Search": query,        # the search query
        "Limit": 9000000000,    # very large limit to retrieve all tweets for a day
        "Since": since,         # start date for the search
        "Until": until,         # end date for the search
        "Pandas": True,         # return the results as a Pandas DataFrame
        "Hide_output": True,    # suppress console output
    }

    config = twint.Config()
    for option, value in search_settings.items():
        setattr(config, option, value)

    twint.run.Search(config)
    return twint.storage.panda.Tweets_df

### 4. Load Accounts

#### Politicians

In [7]:
# Load the politicians' Twitter handles; the first CSV row is used as the header,
# and each column is later iterated as one party's list of accounts
df_politicians = pd.read_csv('../Datasets/accounts_politicians.csv', header=0)

In [8]:
# Display the loaded politician accounts for a visual check
df_politicians

Unnamed: 0,AFD,LINKE,SPD,GRUENE,FDP,CDU,CSU
0,Alice_Weidel,SWagenknecht,Karl_Lauterbach,cem_oezdemir,c_lindner,jensspahn,Markus_Soeder
1,Joerg_Meuthen,GregorGysi,HeikoMaas,GoeringEckardt,MaStrackZi,ArminLaschet,DoroBaer
2,Beatrix_vStorch,katjakipping,OlafScholz,JTrittin,MarcoBuschmann,_FriedrichMerz,andreasscheuer
3,gottfriedcurio,DietmarBartsch,KuehniKev,KonstantinNotz,KonstantinKuhle,JuliaKloeckner,ManfredWeber
4,MalteKaufmann,anked,larsklingbeil,RenateKuenast,johannesvogel,n_roettgen,DerLenzMdB
5,JoanaCotar,b_riexinger,hubertus_heil,Ricarda_Lang,Wissing,PaulZiemiak,hahnflo
6,Tino_Chrupalla,jankortemdb,EskenSaskia,KathaSchulze,Lambsdorff,groehe,smuellermdb
7,StBrandner,Janine_Wissler,Ralf_Stegner,BriHasselmann,ria_schroeder,HBraun,DaniLudwigMdB
8,GtzFrmming,SevimDagdelen,KarambaDiaby,nouripour,LindaTeuteberg,rbrinkhaus,ANiebler
9,PetrBystronAFD,SusanneHennig,MiRo_SPD,MiKellner,f_schaeffler,tj_tweets,MarkusFerber


In [9]:
# Load the parties' official Twitter handles; the first CSV row is used as the header,
# and each column is later iterated as one party's list of accounts
df_parties = pd.read_csv('../Datasets/accounts_parties.csv', header=0)

In [10]:
# Display the loaded party accounts for a visual check
df_parties

Unnamed: 0,AFD,LINKE,SPD,GRUENE,FDP,CDU_CSU
0,AfD,dieLinke,spdde,Die_Gruenen,fdp,CDU
1,AfDimBundestag,Linksfraktion,spdbt,GrueneBundestag,fdpbt,CSU
2,AfDBerlin,dielinkeberlin,jusos,gruene_jugend,fdp_nrw,cducsubt
3,,,,,,Junge_Union


### 5. Create Directories for Dataset

In [11]:
# Make sure every party has its own sub-directory inside the raw dataset folder
for party in PARTIES:
    # exist_ok=True keeps the cell safe to re-run: directories that already
    # exist are left untouched instead of raising FileExistsError
    os.makedirs(os.path.join(RAW_DATASET_PATH, party), exist_ok=True)

### 6. Download Tweets Mentioning Politicians

In [12]:
for party, col_data in df_politicians.items():
    # Tweets mentioning CDU and CSU politicians are stored in the same directory
    # and share one party label (kept in a separate name so the loop variable
    # is not overwritten)
    target_party = "CDU_CSU" if party in ("CDU", "CSU") else party

    # Iterate over the accounts of the current party; empty cells are skipped
    # so that columns of unequal length cannot produce an "@nan" query
    for itr, account_name in enumerate(col_data.dropna()):
        # Print account name of current iteration
        print(f'Current Iteration: {itr+1} | Party: {target_party} | Account: @{account_name}')

        # Collect one frame per crawled day and concatenate them once at the end
        # (avoids re-copying the growing dataset on every iteration)
        daily_frames = []

        # Iterate over the months of the year
        for month in range(1, 13):
            # Days already drawn for the current month and account
            random_days = []

            # Iterate over the number of days to crawl per month
            for draw in range(N_DAYS_PER_ACCOUNT):
                # Redraw until a day is found that has not been used before.
                # NOTE: this cannot terminate if N_DAYS_PER_ACCOUNT exceeds the
                # number of days in the month.
                random_day, following_day = get_random_day_and_next(month)
                while random_day in random_days:
                    random_day, following_day = get_random_day_and_next(month)
                random_days.append(random_day)

                # Get tweets for the specified day
                new_tweets = get_tweets_for_specific_day(query=f"@{account_name}", since=random_day, until=following_day)

                # Days without any result contribute nothing to the dataset
                if new_tweets is None or new_tweets.empty:
                    continue

                # Save the information for which account the tweet was crawled.
                # assign() returns a new frame, so the object handed back by the
                # crawler is not modified in place.
                new_tweets = new_tweets.assign(source_account=account_name, source_party=target_party)

                # Remove place column (as this column had problems regarding the data type
                # and is irrelevant for our analyses); a missing column is simply ignored
                new_tweets = new_tweets.drop(columns='place', errors='ignore')

                daily_frames.append(new_tweets)

        # Without a single tweet there is no "tweet" column to clean and nothing to save
        if not daily_frames:
            print(f'No tweets found for @{account_name}, no file written.')
            continue

        dataset = pd.concat(daily_frames, axis=0, ignore_index=True)

        # Remove all line breaks from the values in the "tweet" column
        dataset['tweet'] = dataset['tweet'].str.replace(r'\r\n|\r|\n', '', regex=True)

        # Save the dataset to a CSV file inside the party's directory
        party_directory = os.path.join(RAW_DATASET_PATH, target_party)
        os.makedirs(party_directory, exist_ok=True)
        dataset.to_csv(os.path.join(party_directory, f"{account_name}.csv"))

Current Iteration: 1 | Party: AFD | Account: @Alice_Weidel
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.


KeyError: 'place'

In [None]:
# Show the current local time (e.g. to note when the preceding crawl finished).
# `localtime` is imported together with the other packages at the top of the notebook.
localtime()

### 7. Download Tweets Mentioning Party Accounts

In [None]:
for party, col_data in df_parties.items():
    # Iterate over the accounts of the current party.
    # Empty cells are dropped, because only CDU_CSU has a fourth account.
    for itr, account_name in enumerate(col_data.dropna()):
        # Print account name of current iteration
        print(f'Current Iteration: {itr+1} | Party: {party} | Account: @{account_name}')

        # Collect one frame per crawled day and concatenate them once at the end
        # (avoids re-copying the growing dataset on every iteration)
        daily_frames = []

        # Iterate over the months of the year
        for month in range(1, 13):
            # Days already drawn for the current month and account
            random_days = []

            # Iterate over the number of days to crawl per month
            for draw in range(N_DAYS_PER_ACCOUNT):
                # Redraw until a day is found that has not been used before.
                # NOTE: this cannot terminate if N_DAYS_PER_ACCOUNT exceeds the
                # number of days in the month.
                random_day, following_day = get_random_day_and_next(month)
                while random_day in random_days:
                    random_day, following_day = get_random_day_and_next(month)
                random_days.append(random_day)

                # Get tweets for the specified day
                new_tweets = get_tweets_for_specific_day(query=f"@{account_name}", since=random_day, until=following_day)

                # Days without any result contribute nothing to the dataset
                if new_tweets is None or new_tweets.empty:
                    continue

                # Save the information for which account the tweet was crawled.
                # assign() returns a new frame, so the object handed back by the
                # crawler is not modified in place.
                new_tweets = new_tweets.assign(source_account=account_name, source_party=party)

                # Drop the "place" column as it is done for the politician accounts,
                # so both raw datasets share the same columns; a missing column is ignored
                new_tweets = new_tweets.drop(columns='place', errors='ignore')

                daily_frames.append(new_tweets)

        # Without a single tweet there is no "tweet" column to clean and nothing to save
        if not daily_frames:
            print(f'No tweets found for @{account_name}, no file written.')
            continue

        dataset = pd.concat(daily_frames, axis=0, ignore_index=True)

        # Remove all line breaks from the values in the "tweet" column
        dataset['tweet'] = dataset['tweet'].str.replace(r'\r\n|\r|\n', '', regex=True)

        # Save the dataset to a CSV file inside the party's directory
        party_directory = os.path.join(RAW_DATASET_PATH, party)
        os.makedirs(party_directory, exist_ok=True)
        dataset.to_csv(os.path.join(party_directory, f"{account_name}.csv"))