# Notebook: Create Subset

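This notebook creates a subset of about **2000** tweets, which will then be annotated with respect to their sentiment. The tweets are sampled per party, in proportion to each party's share of all crawled tweets; because the per-party counts are rounded, the run recorded below yields 1999 tweets.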
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [98]:
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import random
import os

## Parameters

In [99]:
# Root folder of the crawled tweets. Keep the trailing slash: later cells
# build file paths from it by plain string concatenation and slice it off again.
DATASET_PATH = "../Datasets/dataset/"
# Folder that receives the CSV files created for the annotation sessions.
ANNOTATION_DATASET_PATH = "../Datasets/annotation_dataset"
# Sub-folders of DATASET_PATH (one per party) that are taken into account.
PARTIES = ["CDU_CSU", "SPD", "AfD", "FDP", "GRUENE", "LINKE"]
# Target number of tweets in the annotation subset.
SUBSET_SIZE = 2000
# Seed passed to every random number generator used in this notebook.
SEED_VALUE = 0

## Code

### 1. Get Reproducible Results

In [100]:
# Seed the random number generators of the standard library and of NumPy
# with the same value so that repeated runs draw the same numbers.
np.random.seed(SEED_VALUE)
random.seed(SEED_VALUE)

# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so this
# assignment presumably only affects processes launched from this kernel,
# not the running kernel itself - confirm whether that is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED_VALUE)

### 2. Calculate Number of Tweets

In [101]:
# Running total of tweets found across all party folders
n_tweets_total: int = 0

In [102]:
# Count the tweets stored in the CSV files of all parties.
# The counter is reset here so that re-running this cell cannot add the
# same files a second time.
n_tweets_total = 0

for party in PARTIES:
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only files that sit directly in a party folder are counted;
        # nested folders found by os.walk are skipped.
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue

        for file in files:
            if not file.endswith('.csv'):
                continue

            # Only the number of rows is needed; the frame itself is discarded
            n_tweets_total += len(
                pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
            )

  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [103]:
n_tweets_total

713486

### 3. Get Random Tweets From Each Party

In [104]:
# Number of tweets drawn so far, summed over all parties
n_subset_total: int = 0

In [105]:
# Starts empty; the sampled tweets of every party are added to it below
annotation_dataset: pd.DataFrame = pd.DataFrame()

In [106]:
# Draw a random sample per party; every party contributes tweets in
# proportion to its share of n_tweets_total.
# Both accumulators are (re)initialised here so that re-running this cell
# starts from scratch instead of appending to the result of an earlier run.
n_subset_total = 0
party_samples = []

for party in PARTIES:
    # Collect the per-account frames and concatenate them once per party:
    # growing a DataFrame with concat inside the loop copies all rows each time.
    account_frames = []

    # NOTE(review): os.walk yields file names in arbitrary, file-system
    # dependent order, and that order determines which rows `sample` picks
    # below. Sorting `files` would make the draw machine-independent, but it
    # would also change the subset shown in this notebook's outputs, so the
    # original order is kept here.
    for subdir, _, files in os.walk(DATASET_PATH + party):
        # Only files that sit directly in a party folder are used
        if subdir[len(DATASET_PATH):] not in PARTIES:
            continue

        for file in files:
            if not file.endswith('.csv'):
                continue

            # File name without the ".csv" suffix
            username = file[:-4]

            df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)

            # Save the information for which account and party the tweet was crawled
            df_account['source_account'] = username
            df_account['source_party'] = party

            account_frames.append(df_account)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if account_frames:
        df_party = pd.concat(account_frames, axis=0, ignore_index=True)
    else:
        df_party = pd.DataFrame()

    n_tweets_party = df_party.shape[0]
    # NOTE(review): the share is rounded per party, so the per-party counts
    # do not have to add up to exactly SUBSET_SIZE.
    n_tweets_party_for_subset = round((SUBSET_SIZE / n_tweets_total) * n_tweets_party)
    n_subset_total += n_tweets_party_for_subset

    df_samples_for_party = df_party.sample(n=n_tweets_party_for_subset, random_state=SEED_VALUE)
    party_samples.append(df_samples_for_party)

# Stack the per-party samples (in PARTIES order) under a fresh 0..n-1 index
if party_samples:
    annotation_dataset = pd.concat(party_samples, axis=0, ignore_index=True)
else:
    annotation_dataset = pd.DataFrame()

  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)
  df_account = pd.read_csv(DATASET_PATH + party + "/" + file, sep=",", index_col=0)


In [107]:
n_subset_total

1999

### 4. Create Sub Datasets for Annotation 

In [108]:
# Shuffle the rows (they are grouped by party up to here), renumber them
# from 0 and keep only the id, username, date and tweet columns.
annotation_dataset = (
    annotation_dataset
    .sample(frac=1, random_state=SEED_VALUE)
    .reset_index(drop=True)
    .loc[:, ['id', 'username', 'date', 'tweet']]
)

In [109]:
# The first int(SUBSET_SIZE / 2) rows form session 1, all remaining rows session 2
session_split = int(SUBSET_SIZE / 2)
df_session1 = annotation_dataset.iloc[:session_split]
df_session2 = annotation_dataset.iloc[session_split:]

In [110]:
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(ANNOTATION_DATASET_PATH, exist_ok=True)

# Write the first annotation session (the row index is written as the first column)
df_session1.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_1.csv")
df_session1

Unnamed: 0,id,username,date,tweet
0,1394590713048936451,Noelte030,2021-05-18 09:48:30,@DoroBaer @TeacherTokaryk @republica @LSMuelle...
1,1382657930114256899,eb_ballhausen,2021-04-15 11:31:53,"@Karl_Lauterbach Wow, Klabauterbach hat die Ka..."
2,1458877998501933062,axchapman,2021-11-11 19:23:14,@IngwerBaum @Karl_Lauterbach @Krawallstein @Af...
3,1408888008745553924,HighAsSappyFuck,2021-06-26 20:40:51,@KonterKarma @Alice_Weidel Ein bisschen durch ...
4,1359558612360835076,TimDemisch,2021-02-10 17:43:27,@josefheynckes @OlafScholz Im Raum steht die F...
...,...,...,...,...
995,1439274126640926729,Iro09908973,2021-09-18 17:04:26,@berlinerzeitung @hungerstreik21 @Die_Gruenen ...
996,1438566205590327300,BeatricevF1,2021-09-16 18:11:25,CDU/CSU ist das Plumpsklo für Laschet mit Maaß...
997,1431373413524942850,GHG_goe,2021-08-27 21:49:50,@FelixSchabasian @spdde @FranziskaGiffey Bitte...
998,1443851640440868864,ODDO60,2021-10-01 08:13:51,@CDU @ArminLaschet @PaulZiemiak @rbrinkhaus @j...


In [111]:
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(ANNOTATION_DATASET_PATH, exist_ok=True)

# Write the second annotation session (the row index is written as the first column)
df_session2.to_csv(ANNOTATION_DATASET_PATH + "/tweets_session_2.csv")
df_session2

Unnamed: 0,id,username,date,tweet
1000,1471860454708367360,ClaudiaGoedeWe1,2021-12-17 15:10:52,@HugoMuellerVogg @KarstenSchrder @_FriedrichMe...
1001,1409596747572297731,torsten_s,2021-06-28 19:37:08,@_axelmeyer @spdde @Die_Gruenen @dieLinke Letz...
1002,1409455984192409609,RomanU__,2021-06-28 10:17:47,@pilionsegler @kv_rbk @Karl_Lauterbach https:...
1003,1426330791760519172,rosiweis,2021-08-13 23:52:15,@Karl_Lauterbach Im Ablenken von Themen war Sc...
1004,1467181556691345409,Q_Paxxx,2021-12-04 17:18:36,@Afelia @OlafScholz Jeder der bei Bild auftrit...
...,...,...,...,...
1994,1382682158817554436,wahrheit_nurdie,2021-04-15 13:08:10,@KarlAdamek @ChanasitJonas @Karl_Lauterbach We...
1995,1431195832880091140,SueviaThinkTank,2021-08-27 10:04:11,@bibliopat81 @Die_Gruenen @spdde #Politbaromet...
1996,1405588501383553025,rodeidei,2021-06-17 18:09:47,@DFoest @c_lindner @fdp 🌈 gibt es dann da auch...
1997,1345444938960527360,BerndPfeiffer3,2021-01-02 19:00:45,Die GRÜNEN trauen sich das Kanzleramt zu. Mind...
