# Notebook: Create Subset

This notebook is used to create a subset of **2000** tweets, which will then be annotated with respect to their sentiment.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [35]:
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import random
import os

## Parameters

In [36]:
# Folder that receives the sampled subset and the per-session annotation files.
ANNOTATION_DATASET_PATH = '../Datasets/annotation_dataset'
# Root folder of the crawled tweets; it is combined with a party name via plain
# string concatenation (DATASET_PATH + party), so the trailing slash is required.
DATASET_PATH = '../Datasets/dataset/'
# Number of tweets to draw for annotation.
SUBSET_SIZE = 2000
# Seed used for the random number generators and for every DataFrame.sample call.
SEED_VALUE = 0
# Party folder names expected below DATASET_PATH.
PARTIES = ["CDU_CSU", "SPD", "AFD", "FDP", "GRUENE", "LINKE"]

## Code

### 1. Get Reproducible Results

In [37]:
# Seed the hash seed environment variable, Python's `random` module and NumPy's
# global generator with the same value.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)

### 2. Calculate Number of Tweets

In [38]:
# Total number of tweets over all parties (filled by the counting loop below).
n_tweets_total = 0
# Maps party name -> number of tweets found for that party.
party_statistics = {}

In [39]:
# Count the tweets of every party by reading each account CSV that lies
# directly inside the party's folder (one row per tweet).
# Both counters are (re-)initialised here so that re-running this cell does not
# add the same tweets to the totals a second time.
n_tweets_total = 0
party_statistics = {}

for party in PARTIES:
    n_tweets_party = 0
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only the party folder itself is considered; nested folders are skipped.
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue

        for file in files:
            if not file.endswith('.csv'):
                continue

            # Read dataframe of a single account
            df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)

            # Add the number of rows of this account to the party counter
            n_tweets_party += df.shape[0]

    party_statistics[party] = n_tweets_party
    n_tweets_total += n_tweets_party

  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [40]:
# Display the number of tweets counted over all parties.
n_tweets_total

713742

In [41]:
# Display the number of tweets counted per party.
party_statistics

{'CDU_CSU': 229984,
 'SPD': 230325,
 'AFD': 58046,
 'FDP': 80546,
 'GRUENE': 74005,
 'LINKE': 40836}

### 3. Check which party gets an additional Tweet

In [42]:
def get_key_with_max_value_under_0_5(dictionary):
    """Return the key whose value is the largest among all values below 0.5.

    Entries with a value of 0.5 or more are ignored. If several keys share the
    largest remaining value, the one that comes first in the dictionary wins.
    A ValueError is raised by ``max`` when no value is below 0.5.
    """
    keys_below_half = (key for key, value in dictionary.items() if value < 0.5)
    return max(keys_below_half, key=dictionary.get)

In [43]:
def truncate(x, d):
    """Cut ``x`` off after ``d`` decimal places (towards zero, no rounding).

    The value is scaled by 10**d, converted with ``int`` (which discards the
    fractional part) and scaled back. The result is always a float.
    """
    scale = 10.0 ** d
    scaled_integer_part = int(x * scale)
    return scaled_integer_part / scale

# Replace each party's tweet count by the fractional part of its proportional
# share of the subset (share minus its integer part).
# NOTE: party_statistics is overwritten in place, so this cell must run exactly
# once after the counting cell; a second run would work on the fractions.
for party, n_party_tweets in list(party_statistics.items()):
    proportional_share = (SUBSET_SIZE / n_tweets_total) * n_party_tweets
    party_statistics[party] = proportional_share - truncate(proportional_share, 0)

In [44]:
# Check which Party will get an additional Tweet
party_with_additional_tweet = get_key_with_max_value_under_0_5(party_statistics)
party_with_additional_tweet

'CDU_CSU'

### 4. Get Random Tweets From Each Account

In [45]:
# Running total of tweets selected for the subset (incremented by the sampling cell below).
n_subset_total = 0

In [46]:
# Draw a proportional random sample of tweets from every party.
#
# In principle, duplicate data points can occur in the data set, but this is extremely rare.
# Nevertheless, we want to be sure that the performance of the model is not evaluated
# with a tweet that is also among the training data, so the sampling is repeated
# until every tweet id in the subset is unique.
MAX_SAMPLING_ATTEMPTS = 100  # upper bound so that the cell can never loop forever

annotation_dataset = pd.DataFrame()
annotation_dataset_unique = False
sampling_attempt = 0

while not annotation_dataset_unique:
    if sampling_attempt >= MAX_SAMPLING_ATTEMPTS:
        raise RuntimeError(
            "No duplicate-free subset found after " + str(MAX_SAMPLING_ATTEMPTS) + " sampling attempts"
        )

    # Every attempt starts from scratch. Appending to the result of a failed
    # attempt would keep its duplicates, and re-using the same seed would draw
    # exactly the same tweets again, so the loop could never finish.
    # The first attempt uses SEED_VALUE itself.
    party_samples = []
    n_subset_total = 0
    attempt_seed = SEED_VALUE + sampling_attempt

    for party in PARTIES:
        # Collect the tweets of all accounts of this party
        account_frames = []

        # NOTE(review): os.walk does not guarantee a file order; the seeded sample
        # depends on the row order of df_party. Sorting `files` would make the
        # result independent of the file system but may change the drawn subset.
        for subdir, _, files in os.walk(DATASET_PATH + party):
            for file in files:
                if file.endswith('.csv') and subdir[len(DATASET_PATH):] in PARTIES:
                    # Read dataframe of a single account
                    df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
                    account_frames.append(df_account)

        # Concatenate once instead of growing the frame inside the loop
        if account_frames:
            df_party = pd.concat(account_frames, axis=0, ignore_index=True)
        else:
            df_party = pd.DataFrame()

        n_tweets_party = df_party.shape[0]
        n_tweets_party_for_subset = round((SUBSET_SIZE / n_tweets_total) * n_tweets_party)

        # One party receives an extra tweet to compensate for the rounding
        if party_with_additional_tweet == party:
            n_tweets_party_for_subset += 1

        n_subset_total += n_tweets_party_for_subset

        df_samples_for_party = df_party.sample(n=n_tweets_party_for_subset, random_state=attempt_seed)
        party_samples.append(df_samples_for_party)

    annotation_dataset = pd.concat(party_samples, axis=0, ignore_index=True)
    annotation_dataset_unique = annotation_dataset['id'].nunique() == len(annotation_dataset)
    sampling_attempt += 1

  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [47]:
# Display the number of tweets selected for the subset (expected to equal SUBSET_SIZE).
n_subset_total

2000

### 5. Create Sub Datasets for Annotation

Save dataset for annotation

In [48]:
# Create the output folder for the annotation files. An already existing folder
# is accepted (exist_ok=True), so the cell can be re-run; if the path exists but
# is not a directory, the error is raised instead of being silently ignored.
os.makedirs(ANNOTATION_DATASET_PATH, exist_ok=True)

In [49]:
# Shuffle all sampled rows (seeded); the rows were appended party by party before.
# reset_index() is called without drop=True, so the pre-shuffle row index is kept
# as an additional 'index' column and is written to the CSV as well.
# NOTE(review): the cell rebinds annotation_dataset to its own shuffled copy, so it
# is presumably not safe to run twice without re-running the sampling cell — confirm.
annotation_dataset = annotation_dataset.sample(frac=1, random_state=SEED_VALUE).reset_index()
annotation_dataset.to_csv(ANNOTATION_DATASET_PATH + "/annotation_dataset.csv")

For the entire dataset, we do not delete duplicates. Duplicates can occur because a tweet can mention several politicians at once, which means that a tweet could be crawled for multiple politicians. However, we want to avoid evaluating the trained BERT model with tweets that were also used for training. Therefore, we make sure that there are no duplicates among the 2000 annotated tweets that we will later use for training and evaluation of our BERT model.

In [50]:
# Check if the 'id' column of the sampled annotation subset is unique.
# The check has to run on annotation_dataset; `df` only holds the last account
# CSV that was read while counting the tweets.
is_unique = annotation_dataset['id'].is_unique
print("Dataset uniqueness: ", is_unique)

Dataset uniqueness:  True


Add a column for the sentiment label and keep the columns with information that might be helpful for annotators

In [51]:
# Add an empty 'sentiment' column for the annotators' labels and keep only the
# columns needed for annotation, in the order in which they should be shown.
annotation_columns = ['id', 'username', 'date', 'sentiment', 'tweet', 'link', 'source_account']
annotation_dataset = annotation_dataset.assign(sentiment="").loc[:, annotation_columns]

In [52]:
# Split the shuffled subset into two annotation sessions: the first half of the
# rows goes to session 1, the remaining rows go to session 2.
session_size = int(SUBSET_SIZE / 2)
df_session1 = annotation_dataset.iloc[:session_size]
df_session2 = annotation_dataset.iloc[session_size:]

In [53]:
# Write the first annotation session as CSV and as Excel workbook, then display it.
# NOTE(review): to_excel presumably requires an Excel writer engine (e.g. openpyxl)
# to be installed — confirm.
df_session1.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_1.csv")
df_session1.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_1.xlsx")
df_session1

Unnamed: 0,id,username,date,sentiment,tweet,link,source_account
0,1449809957990371330,CocoRos67338599,2021-10-17 18:50:04,,"@n_roettgen Ja, danke! Bild: Aber verheiratet ...",https://twitter.com/CocoRos67338599/status/144...,n_roettgen
1,1409589750533480449,DenisAndrejevic,2021-06-28 19:09:20,,@Seppuku0815 @jannik_lul @Karl_Lauterbach Was ...,https://twitter.com/DenisAndrejevic/status/140...,Karl_Lauterbach
2,1442206006881099779,MacStarTrader,2021-09-26 19:14:41,,@1Theoden1 @KuehniKev Wer ist denn ihr? Die SP...,https://twitter.com/MacStarTrader/status/14422...,KuehniKev
3,1442362927307177984,amalth3a,2021-09-27 05:38:14,,@PeterPaulert @Dzienus @ABaerbock @OlafScholz ...,https://twitter.com/amalth3a/status/1442362927...,OlafScholz
4,1376909179504443393,va_vassmann,2021-03-30 14:48:24,,@uedio @Ricarda_Lang Und wie bringt man(n) die...,https://twitter.com/va_vassmann/status/1376909...,Ricarda_Lang
...,...,...,...,...,...,...,...
995,1445425897276911620,LDFeyre,2021-10-05 16:29:23,,@grtzmann @InvestigaTV @indysorger11 @einerein...,https://twitter.com/LDFeyre/status/14454258972...,Karl_Lauterbach
996,1376503090447257602,sigisippe,2021-03-29 11:54:45,,@Karl_Lauterbach Ihre Kampfparolen sind skurri...,https://twitter.com/sigisippe/status/137650309...,Karl_Lauterbach
997,1393172918125871104,myfairjulia,2021-05-14 11:54:42,,@DaniFleck1 @maxima_8 @Die_Gruenen Falls du no...,https://twitter.com/myfairjulia/status/1393172...,Die_Gruenen
998,1415275090116239365,EnBremer,2021-07-14 11:40:50,,@Q_Paxxx @MarcoBuschmann ... wobei die angebli...,https://twitter.com/EnBremer/status/1415275090...,MarcoBuschmann


In [54]:
# Write the second annotation session as CSV and as Excel workbook, then display it.
# NOTE(review): to_excel presumably requires an Excel writer engine (e.g. openpyxl)
# to be installed — confirm.
df_session2.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_2.csv")
df_session2.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_2.xlsx")
df_session2

Unnamed: 0,id,username,date,sentiment,tweet,link,source_account
1000,1443817771557662741,ProvNero,2021-10-01 05:59:16,,@joeh6y @Segeltexter @Malahia_M @PeterRNeumann...,https://twitter.com/ProvNero/status/1443817771...,CDU
1001,1459132644831158277,LiffersGert,2021-11-12 12:15:06,,@MarcoBuschmann @spdbt @GrueneBundestag @fdpbt...,https://twitter.com/LiffersGert/status/1459132...,MarcoBuschmann
1002,1436071843472023557,nur_1Welt,2021-09-09 20:59:43,,Deshalb missbraucht #Laschet die #Polizei für ...,https://twitter.com/nur_1Welt/status/143607184...,PaulZiemiak
1003,1376588713715769344,Sophie111078,2021-03-29 17:34:59,,@SenBJF @RegBerlin @regina_kittler @jensspahn ...,https://twitter.com/Sophie111078/status/137658...,Karl_Lauterbach
1004,1389531302513287169,ropietsch,2021-05-04 10:44:13,,@hardyschu @W_Schmidt_ @phsteffen @KuehniKev @...,https://twitter.com/ropietsch/status/138953130...,OlafScholz
...,...,...,...,...,...,...,...
1995,1361601123103240192,SabineHueckmann,2021-02-16 08:59:39,,@SebastianDrols2 @Karl_Lauterbach @ArminLasche...,https://twitter.com/SabineHueckmann/status/136...,Karl_Lauterbach
1996,1441509169664757760,BuchinSwantje,2021-09-24 21:05:42,,@ClausStrunz @BILD @GretaThunberg @FridayForFu...,https://twitter.com/BuchinSwantje/status/14415...,spdbt
1997,1403331685966561287,boehnisch,2021-06-11 12:42:01,,@c_lindner Man hätte ins IfsG mit reinschreibe...,https://twitter.com/boehnisch/status/140333168...,c_lindner
1998,1396735449180348421,Nutzernameneu,2021-05-24 07:50:55,,Römer wird gerne von Lauterbach zitiert. Spinn...,https://twitter.com/Nutzernameneu/status/13967...,cducsubt


## IMPORTANT: NEXT STEPS

1. Create a new folder "annotated_dataset" in /Datasets
2. Add Annotated Datasets in .xlsx format 
3. Name these:
`
['../Datasets/annotated_dataset/tweets_session_1_1.xlsx',
 '../Datasets/annotated_dataset/tweets_session_1_2.xlsx',
 '../Datasets/annotated_dataset/tweets_session_1_3.xlsx',
 '../Datasets/annotated_dataset/tweets_session_2_1.xlsx',
 '../Datasets/annotated_dataset/tweets_session_2_2.xlsx',
 '../Datasets/annotated_dataset/tweets_session_2_3.xlsx']
`

Schema:

`
../Datasets/annotated_dataset/tweets_session_{SESSION_ID}_{ANNOTATOR_ID}.xlsx
`