# Notebook: Create Subset

This notebook is used to create a subset of **2000** tweets, which will then be annotated with respect to their sentiment.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [19]:
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import random
import os

## Parameters

In [20]:
# Directory that receives the annotation files written by this notebook.
ANNOTATION_DATASET_PATH = '../Datasets/annotation_dataset'
# Root directory of the tweet CSV files, read as <DATASET_PATH><party>/<username>.csv.
# The trailing slash is required because paths are built by string concatenation.
DATASET_PATH = '../Datasets/dataset/'
# Total number of tweets to draw for annotation.
SUBSET_SIZE = 2000
# Seed used for the random number generators and for DataFrame.sample().
SEED_VALUE = 0
# Party sub-directories of DATASET_PATH that are scanned for account CSV files.
PARTIES = ["CDU_CSU", "SPD", "AFD", "FDP", "GRUENE", "LINKE"]

## Code

### 1. Get Reproducible Results

In [21]:
# Seed Python's and NumPy's global random number generators with SEED_VALUE.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it here
# presumably only affects child processes, not the running kernel - confirm whether it is needed.
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)

### 2. Calculate Number of Tweets

In [22]:
# Number of tweets over all parties; filled by the counting loop in the next cell.
n_tweets_total = 0
# Maps each party name to a per-party number (first the tweet count of that party).
party_statistics = {}

In [23]:
# Count the tweets of every party (and in total) by reading each account CSV.
# The total is reset here so that re-running this cell does not add the counts a second time.
n_tweets_total = 0

for party in PARTIES:
    n_tweets_party = 0
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only CSV files that lie directly inside a party directory are counted.
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue
        for file in files:
            if not file.endswith('.csv'):
                continue

            # Only the number of rows is needed here. low_memory=False parses each file in
            # one pass, which avoids chunk-wise mixed-dtype inference (and the DtypeWarning
            # that presumably produced the warning output of this cell).
            df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0, low_memory=False)
            n_tweets_party += df.shape[0]

    party_statistics[party] = n_tweets_party
    n_tweets_total += n_tweets_party

  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [24]:
n_tweets_total

713733

In [25]:
party_statistics

{'CDU_CSU': 229981,
 'SPD': 230324,
 'AFD': 58045,
 'FDP': 80544,
 'GRUENE': 74004,
 'LINKE': 40835}

### 3. Check which party gets an additional Tweet

In [26]:
def get_key_with_max_value_under_0_5(dictionary):
    """Return the key whose value is the largest among all values strictly below 0.5.

    Args:
        dictionary: Mapping from keys to numeric values.

    Returns:
        The key with the maximum value among the entries with value < 0.5.
        If several entries share that maximum, the first one in iteration order is returned.

    Raises:
        ValueError: If no entry has a value below 0.5 (this includes an empty mapping).
    """
    candidates = {key: value for key, value in dictionary.items() if value < 0.5}
    if not candidates:
        # max() on an empty mapping would raise an unspecific ValueError; fail with a clear message.
        raise ValueError("dictionary contains no value below 0.5")
    return max(candidates, key=candidates.get)

In [27]:
def truncate(x, d):
    """Cut x off after d decimal places (truncation towards zero, no rounding)."""
    scale = 10.0 ** d
    shifted = int(x * scale)
    return shifted / scale

# Replace each party's tweet count by the fractional part of its proportional share of
# SUBSET_SIZE (share minus its truncated integer part). The values are overwritten in place.
for party in party_statistics:
    proportional_share = (SUBSET_SIZE / n_tweets_total) * party_statistics[party]
    whole_tweets = truncate(proportional_share, 0)
    party_statistics[party] = proportional_share - whole_tweets

In [28]:
# Check which Party will get an additional Tweet
party_with_additional_tweet = get_key_with_max_value_under_0_5(party_statistics)
party_with_additional_tweet

'CDU_CSU'

### 4. Get Random Tweets From Each Account

In [29]:
# Number of tweets selected for the annotation subset; accumulated by the sampling cell below.
n_subset_total = 0

In [30]:
annotation_dataset = pd.DataFrame()

# In principle, duplicate data points can occur in the data set, but this is extremely rare.
# Nevertheless, we want to be sure that the performance of the model is not evaluated with a
# tweet that is also among the training data, so the draw is repeated until all ids are unique.
annotation_dataset_unique = False
attempt = 0

while not annotation_dataset_unique:
    # Every attempt starts from scratch. Without this reset a repeated attempt would append
    # to the previous (duplicate-containing) draw and could never become unique.
    n_subset_total = 0
    party_samples = []

    # The first attempt uses SEED_VALUE itself; later attempts use a different but still
    # deterministic seed, because re-sampling with the same seed would return the same rows.
    attempt_seed = SEED_VALUE + attempt

    for party in PARTIES:
        # Collect the tweets of all accounts of this party and concatenate them once.
        # NOTE(review): os.walk yields files in file-system order, so the row order of
        # df_party (and therefore the sample) may differ between machines - consider sorting.
        account_frames = []
        for subdir, _, files in os.walk(DATASET_PATH + party):
            for file in files:
                if file.endswith('.csv') and subdir[len(DATASET_PATH):] in PARTIES:
                    df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
                    account_frames.append(df_account)

        if account_frames:
            df_party = pd.concat(account_frames, axis=0, ignore_index=True)
        else:
            df_party = pd.DataFrame()

        # Proportional share of this party in the subset, rounded to whole tweets.
        n_tweets_party = df_party.shape[0]
        n_tweets_party_for_subset = round((SUBSET_SIZE / n_tweets_total) * n_tweets_party)

        # One party receives the tweet that is otherwise lost through rounding.
        if party_with_additional_tweet == party:
            n_tweets_party_for_subset += 1

        n_subset_total += n_tweets_party_for_subset

        party_samples.append(df_party.sample(n=n_tweets_party_for_subset, random_state=attempt_seed))

    annotation_dataset = pd.concat(party_samples, axis=0, ignore_index=True)
    annotation_dataset_unique = annotation_dataset['id'].nunique() == len(annotation_dataset)
    attempt += 1

  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [31]:
n_subset_total

2000

### 5. Create Sub Datasets for Annotation

Save dataset for annotation

In [32]:
# Shuffle all sampled tweets (frac=1 returns every row in random order) so that the parties are mixed.
# reset_index() is called without drop=True, so the pre-shuffle row position is kept as an 'index' column.
# NOTE(review): this cell is not idempotent - running it again on its own output adds a further index column.
annotation_dataset = annotation_dataset.sample(frac=1, random_state=SEED_VALUE).reset_index()
# Write the shuffled subset (including the 'index' column) to the annotation directory.
annotation_dataset.to_csv(ANNOTATION_DATASET_PATH + "/annotation_dataset.csv")

For the entire dataset, we do not delete duplicates. Duplicates can occur because a tweet can mention several politicians at once, which means that a tweet could be crawled for multiple politicians. However, we want to avoid evaluating the trained BERT model with tweets that were also used for training. Therefore, we make sure that there are no duplicates among the 2000 annotated tweets that we will later use for training and evaluation of our BERT model.

In [33]:
# Check if the 'id' column of the annotation dataset is unique.
# The check must run on annotation_dataset; `df` only holds the last account CSV
# that was read while counting the tweets and says nothing about the sampled subset.
is_unique = annotation_dataset['id'].is_unique
print("Dataset uniqueness: ", is_unique)

Dataset uniqueness:  True


Add a column for the sentiment label and keep only the columns with information that might be helpful for annotators

In [34]:
# Add an empty label column for the annotators and keep only the columns they need, in this order.
annotator_columns = ['id', 'username', 'date', 'sentiment_label', 'tweet', 'link', 'source_account']
annotation_dataset = annotation_dataset.assign(sentiment_label="")[annotator_columns]

In [35]:
# Split the annotation dataset by row position into two halves, one per annotation session.
session_split = int(SUBSET_SIZE/2)
df_session1 = annotation_dataset.iloc[:session_split]
df_session2 = annotation_dataset.iloc[session_split:]

In [36]:
# Write the first annotation session as CSV and as Excel workbook, then display it.
# NOTE(review): the tweet ids have 19 digits; Excel presumably cannot show numbers of that
# size exactly - confirm whether 'id' should be written as text in the .xlsx file.
df_session1.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_1.csv")
df_session1.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_1.xlsx")
df_session1

Unnamed: 0,id,username,date,sentiment_label,tweet,link,source_account
0,1390067537589776384,biwa59,2021-05-05 22:15:01,,@realMaskedDoc @jensspahn Ich habe bei der Anm...,https://twitter.com/biwa59/status/139006753758...,jensspahn
1,1373465664304865288,Thom50521528,2021-03-21 02:45:06,,@t0b1t @1984reload @ViolaPriesemann @Karl_Laut...,https://twitter.com/Thom50521528/status/137346...,Karl_Lauterbach
2,1441262843572068357,SchwarzRalf1,2021-09-24 04:46:53,,@spdbt Wenn Die Linke (ehemals SED) an einer B...,https://twitter.com/SchwarzRalf1/status/144126...,spdbt
3,1410656625162174464,Hosenmaus1,2021-07-01 17:48:42,,@AKS03350356 @In_griiieed @JRatek @Ostwestfale...,https://twitter.com/Hosenmaus1/status/14106566...,spdde
4,1460883235983011842,JohannkuPeter,2021-11-17 08:11:20,,@ITUADE @B_Heisterkamp @KathaSchulze Grüne NAZ...,https://twitter.com/JohannkuPeter/status/14608...,KathaSchulze
...,...,...,...,...,...,...,...
995,1449655103749820417,Elke_Ferner,2021-10-17 08:34:44,,@JA_Allmendinger hat Recht! Nur 2 Frauen aber ...,https://twitter.com/Elke_Ferner/status/1449655...,EskenSaskia
996,1405999592617959430,blauervogelx,2021-06-18 21:23:19,,@docholi95307723 @Karl_Lauterbach Die Beste Lö...,https://twitter.com/blauervogelx/status/140599...,Karl_Lauterbach
997,1424647942393184256,ErbenKongs,2021-08-09 08:25:12,,@MarieLou_Fire @Die_Gruenen Toller Film. Wird ...,https://twitter.com/ErbenKongs/status/14246479...,Die_Gruenen
998,1381579238218797057,joffi_99,2021-04-12 12:05:33,,@Murgpirat @CSU @GrueneBundestag @fdpbt @CDU I...,https://twitter.com/joffi_99/status/1381579238...,fdpbt


In [37]:
# Write the second annotation session as CSV and as Excel workbook, then display it.
# NOTE(review): the tweet ids have 19 digits; Excel presumably cannot show numbers of that
# size exactly - confirm whether 'id' should be written as text in the .xlsx file.
df_session2.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_2.csv")
df_session2.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_2.xlsx")
df_session2

Unnamed: 0,id,username,date,sentiment_label,tweet,link,source_account
1000,1415216929065902085,Economicus,2021-07-14 07:49:44,,@tagesschau Ist Merkel schon 2x geimpft? @Serv...,https://twitter.com/Economicus/status/14152169...,_FriedrichMerz
1001,1459132644831158277,LiffersGert,2021-11-12 12:15:06,,@MarcoBuschmann @spdbt @GrueneBundestag @fdpbt...,https://twitter.com/LiffersGert/status/1459132...,MarcoBuschmann
1002,1399827360996859909,sarfeld,2021-06-01 20:37:05,,@Patrick_Kunkel @tagfuchs @ArminLaschet @SZ Eb...,https://twitter.com/sarfeld/status/13998273609...,ArminLaschet
1003,1464363994278502407,GerdaKozian,2021-11-26 22:42:37,,@christiansagt @Karl_Lauterbach @chrischirp Ho...,https://twitter.com/GerdaKozian/status/1464363...,Karl_Lauterbach
1004,1382658396797685761,jsprondel,2021-04-15 11:33:45,,"Universitäten sind dicht (abgesehen davon, das...",https://twitter.com/jsprondel/status/138265839...,Karl_Lauterbach
...,...,...,...,...,...,...,...
1995,1382603282389024768,systemanalysen,2021-04-15 07:54:44,,@DonKamillo8 @SprichWahrheit @ergroovt @Karl_L...,https://twitter.com/systemanalysen/status/1382...,Karl_Lauterbach
1996,1376432992642564096,equickfinder,2021-03-29 07:16:12,,@zeitonline langsam muss man mehr Angst vor de...,https://twitter.com/equickfinder/status/137643...,Karl_Lauterbach
1997,1393553966273896458,FrankKrey,2021-05-15 13:08:51,,@fdp @Wissing Das sollte die FDP den Profis au...,https://twitter.com/FrankKrey/status/139355396...,fdp
1998,1373562494300618755,DasIbf,2021-03-21 09:09:52,,Jens Spahn: Ehrenrettung bezüglich Homöopathie...,https://twitter.com/DasIbf/status/137356249430...,jensspahn
