# Notebook: Create Subset

This notebook is used to create a subset of **2000** tweets, which will then be annotated with respect to their sentiment.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [19]:
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import random
import os

## Parameters

In [20]:
# Directory the annotation files are written to (file names are appended as "/<name>").
ANNOTATION_DATASET_PATH = '../Datasets/annotation_dataset'
# Root directory of the tweet dataset. The party name is appended directly to this
# string and its length is used to cut the party name out of sub-directory paths,
# so the trailing slash is required.
DATASET_PATH = '../Datasets/dataset/'
# Target number of tweets in the annotation subset.
SUBSET_SIZE = 2000
# Seed used for the random number generators and for every DataFrame.sample call.
SEED_VALUE = 0
# Names of the party folders below DATASET_PATH.
PARTIES = ["CDU_CSU", "SPD", "AFD", "FDP", "GRUENE", "LINKE"]

## Code

### 1. Get Reproducible Results

In [21]:
# Seed every random number source with the same fixed value.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)

### 2. Calculate Number of Tweets

In [22]:
# Number of tweets over all parties.
n_tweets_total = 0
# Maps party name -> number of tweets found for that party.
party_statistics = {}

In [23]:
# Count the tweets per party and over all parties.
# The counters are (re-)initialised here so that running this cell a second time
# does not add the same files to the totals again.
n_tweets_total = 0
party_statistics = {}

for party in PARTIES:
    n_tweets_party = 0
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only CSV files located directly in the party folder are counted;
        # nested directories do not pass the `in PARTIES` check.
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue
        for file in files:
            if not file.endswith('.csv'):
                continue
            # One CSV file per account; every row is one tweet.
            df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
            n_tweets_party += df.shape[0]

    party_statistics[party] = n_tweets_party
    n_tweets_total += n_tweets_party

  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [24]:
# Display the number of tweets counted over all parties.
n_tweets_total

713486

In [25]:
# Display the number of tweets counted per party.
party_statistics

{'CDU_CSU': 229899,
 'SPD': 230215,
 'AFD': 58050,
 'FDP': 80517,
 'GRUENE': 73981,
 'LINKE': 40824}

### 3. Check which party gets an additional Tweet

In [26]:
def get_key_with_max_value_under_0_5(dictionary, threshold=0.5):
    """Return the key with the largest value strictly below ``threshold``.

    Args:
        dictionary: Mapping of keys to numeric values.
        threshold: Exclusive upper bound for the values considered (default 0.5).

    Returns:
        The key whose value is the greatest among all values smaller than
        ``threshold``. If several keys share that value, the one that comes
        first in the mapping is returned.

    Raises:
        ValueError: If no value in ``dictionary`` lies below ``threshold``.
    """
    candidates = {key: value for key, value in dictionary.items() if value < threshold}
    if not candidates:
        # max() on an empty mapping would raise a ValueError with an unhelpful message.
        raise ValueError(f"no value below {threshold} in {dictionary!r}")
    return max(candidates, key=candidates.get)

In [27]:
def truncate(x, d):
    """Cut ``x`` off after ``d`` decimal places (towards zero, no rounding)."""
    scale = 10.0 ** d
    return int(x * scale) / scale

# Replace each party's tweet count by the fractional part of its proportional
# share of the subset (exact share minus its integer part).
# This overwrites the counts in party_statistics, so the cell must not be run twice
# without re-running the counting cell first.
for party in list(party_statistics):
    exact_share = (SUBSET_SIZE / n_tweets_total) * party_statistics[party]
    party_statistics[party] = exact_share - truncate(exact_share, 0)

In [28]:
# Pick the party whose fractional share is the largest one still below 0.5.
# The sampling step gives this party one tweet more than its rounded share.
party_with_additional_tweet = get_key_with_max_value_under_0_5(party_statistics)
party_with_additional_tweet

'CDU_CSU'

### 4. Get Random Tweets From Each Account

In [29]:
# Number of tweets drawn into the subset, summed over all parties.
n_subset_total = 0

In [30]:
# Upper bound on re-draws, so that data which cannot yield a duplicate-free sample
# ends in an error instead of an endless loop.
MAX_SAMPLING_ATTEMPTS = 100

# Load all tweets of every party once; the frames are reused if the sample has to be re-drawn.
# NOTE(review): os.walk yields file names in arbitrary, file-system dependent order, and
# the row order of a party frame decides which rows a seeded sample picks. Sorting `files`
# would make the subset independent of the machine, but may change an already annotated
# sample — confirm before changing.
party_frames = {}
for party in PARTIES:
    account_frames = []
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only CSV files located directly in the party folder are used.
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue
        for file in files:
            if file.endswith('.csv'):
                account_frames.append(
                    pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
                )
    # Concatenate once per party instead of growing a frame file by file.
    if account_frames:
        party_frames[party] = pd.concat(account_frames, axis=0, ignore_index=True)
    else:
        party_frames[party] = pd.DataFrame()

# In principle, the same tweet can occur more than once in the data set, but this is
# extremely rare. Nevertheless, the annotation subset must not contain a tweet twice,
# so the sample is re-drawn until all tweet ids are unique.
annotation_dataset = pd.DataFrame()
annotation_dataset_unique = False
sampling_attempt = 0

while not annotation_dataset_unique:
    if sampling_attempt >= MAX_SAMPLING_ATTEMPTS:
        raise RuntimeError(
            f"no duplicate-free subset found after {MAX_SAMPLING_ATTEMPTS} sampling attempts"
        )

    # Every attempt starts from scratch; otherwise a re-draw would append to the
    # rejected sample and keep counting on top of the previous total.
    n_subset_total = 0
    party_samples = []

    for party in PARTIES:
        df_party = party_frames[party]
        n_tweets_party = df_party.shape[0]

        # Proportional share of the subset, rounded to whole tweets.
        n_tweets_party_for_subset = round((SUBSET_SIZE / n_tweets_total) * n_tweets_party)
        if party_with_additional_tweet == party:
            n_tweets_party_for_subset += 1

        n_subset_total += n_tweets_party_for_subset

        # The first attempt uses SEED_VALUE itself; a re-draw shifts the seed, because
        # the same seed would select exactly the same (rejected) rows again.
        df_samples_for_party = df_party.sample(
            n=n_tweets_party_for_subset,
            random_state=SEED_VALUE + sampling_attempt,
        )
        party_samples.append(df_samples_for_party)

    annotation_dataset = pd.concat(party_samples, axis=0, ignore_index=True)
    annotation_dataset_unique = annotation_dataset['id'].nunique() == len(annotation_dataset)
    sampling_attempt += 1

  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [31]:
# Display the number of sampled tweets; it is expected to equal SUBSET_SIZE.
n_subset_total

2000

### 5. Create Sub Datasets for Annotation

Save dataset for annotation

In [32]:
# Shuffle the subset, which up to here is ordered party by party (seeded, hence repeatable).
# reset_index() without drop=True keeps the pre-shuffle row position as an extra
# "index" column, which is written to the CSV file as well.
annotation_dataset = annotation_dataset.sample(frac=1, random_state=SEED_VALUE).reset_index()

# Make sure the output directory exists so that saving also works on a fresh checkout.
os.makedirs(ANNOTATION_DATASET_PATH, exist_ok=True)
annotation_dataset.to_csv(ANNOTATION_DATASET_PATH + "/annotation_dataset.csv")

Add a column for the sentiment label and keep the columns with information that might be helpful for annotators

In [None]:
# Columns handed to the annotators, in the order in which they are exported.
# "sentiment_label" is added as an empty column.
annotator_columns = ['id', 'username', 'date', 'sentiment_label', 'tweet', 'link', 'source_account']
annotation_dataset = annotation_dataset.assign(sentiment_label="")[annotator_columns]

In [34]:
# Split the subset by row position into two parts, one per annotation session.
session_split = int(SUBSET_SIZE / 2)
df_session1 = annotation_dataset.iloc[:session_split]
df_session2 = annotation_dataset.iloc[session_split:]

In [35]:
# Write the first session's tweets as CSV and as an Excel workbook, then display them.
# NOTE(review): to_excel presumably needs an Excel writer engine (e.g. openpyxl)
# in the environment — confirm.
df_session1.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_1.csv")
df_session1.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_1.xlsx")
df_session1

Unnamed: 0,id,username,date,sentiment_label,tweet
0,1394590713048936451,Noelte030,2021-05-18 09:48:30,,@DoroBaer @TeacherTokaryk @republica @LSMuelle...
1,1382657930114256899,eb_ballhausen,2021-04-15 11:31:53,,"@Karl_Lauterbach Wow, Klabauterbach hat die Ka..."
2,1392380357375311875,NilsEpunkt,2021-05-12 07:25:21,,@OliverFleig @Karl_Lauterbach @maithi_nk Wenn ...
3,1393240616973316101,Amend38863146,2021-05-14 16:23:42,,@Karl_Lauterbach Hoppla! Die genbasierten mRNA...
4,1463121751689338884,DerBuschfunker,2021-11-23 12:26:23,,Und nun? @spdde @Die_Gruenen @fdp https://t....
...,...,...,...,...,...
995,1376446316625207297,Kronos272,2021-03-29 08:09:09,,@ebonyplusirony @Karl_Lauterbach Politik und P...
996,1379563476411162628,violissimo,2021-04-06 22:35:38,,@siegfriedsfrom @HeikoMaas War @HeikoMaas nich...
997,1439274126640926729,Iro09908973,2021-09-18 17:04:26,,@berlinerzeitung @hungerstreik21 @Die_Gruenen ...
998,1476912578311897088,HataniDUS,2021-12-31 13:46:13,,@Martina__SchaLu @MarcoBuschmann Die FDP wohl ...


In [36]:
# Write the second session's tweets as CSV and as an Excel workbook, then display them.
# NOTE(review): to_excel presumably needs an Excel writer engine (e.g. openpyxl)
# in the environment — confirm.
df_session2.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_2.csv")
df_session2.to_excel(ANNOTATION_DATASET_PATH + "/tweets_session_2.xlsx")
df_session2

Unnamed: 0,id,username,date,sentiment_label,tweet
1000,1443851640440868864,ODDO60,2021-10-01 08:13:51,,@CDU @ArminLaschet @PaulZiemiak @rbrinkhaus @j...
1001,1393547480319602692,baseman49,2021-05-15 12:43:04,,@fdp @Wissing Ist das so? Warum wird dann dies...
1002,1471860454708367360,ClaudiaGoedeWe1,2021-12-17 15:10:52,,@HugoMuellerVogg @KarstenSchrder @_FriedrichMe...
1003,1442118075143311361,Warrior_242,2021-09-26 13:25:17,,@KuehniKev @RA_Solf Ja. Ist so.
1004,1409455984192409609,RomanU__,2021-06-28 10:17:47,,@pilionsegler @kv_rbk @Karl_Lauterbach https:...
...,...,...,...,...,...
1995,1432979033042604033,MusaJupp,2021-09-01 08:09:59,,@ChristianRamm3 @senneka @Karl_Lauterbach Jetz...
1996,1464197435543236609,MiliH07472419,2021-11-26 11:40:46,,@ViktoriaKeller_ @Karl_Lauterbach Da sitzt bes...
1997,1356881096923824128,thoughtfultom,2021-02-03 08:23:57,,@BergheimJeff @realTomBohn @MarcoBuschmann Es ...
1998,1345444938960527360,BerndPfeiffer3,2021-01-02 19:00:45,,Die GRÜNEN trauen sich das Kanzleramt zu. Mind...
