# Thesis data collection

### Imports

In [32]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import random

### COVID-19 Data Collection

In [33]:
# Search query for the first subset: hashtag, language filter (lang:nl) and the since/until date window.
covid_query = '#ikdoenietmeermee lang:nl since:2020-09-22 until:2021-06-01'

# One record per scraped tweet: timestamp, tweet id, text and author handle.
# A comprehension is used so no loop counter or loop variable lingers in the notebook namespace.
covid_list = [
    [tweet.date, tweet.id, tweet.content, tweet.username]
    for tweet in sntwitter.TwitterSearchScraper(covid_query).get_items()
]

covid_df = pd.DataFrame(covid_list, columns=['datetime', 'tweet_id', 'post', 'user'])

*Info on COVID-19 data*

In [34]:
# Print row count, column dtypes and non-null counts for the scraped frame.
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17398 entries, 0 to 17397
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   datetime  17398 non-null  datetime64[ns, UTC]
 1   tweet_id  17398 non-null  int64              
 2   post      17398 non-null  object             
 3   user      17398 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 543.8+ KB


### Benefits Affair Data Collection

In [35]:
# Search query for the second subset: hashtag, language filter (lang:nl) and the since/until date window.
affair_query = '#toeslagenaffaire lang:nl since:2020-09-22 until:2021-06-01'

# One record per scraped tweet: timestamp, tweet id, text and author handle.
# A comprehension is used so no loop counter or loop variable lingers in the notebook namespace.
affair_list = [
    [tweet.date, tweet.id, tweet.content, tweet.username]
    for tweet in sntwitter.TwitterSearchScraper(affair_query).get_items()
]

affair_df = pd.DataFrame(affair_list, columns=['datetime', 'tweet_id', 'post', 'user'])

*Info on benefits affair data*

In [36]:
# Print row count, column dtypes and non-null counts for the scraped frame.
affair_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64978 entries, 0 to 64977
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   datetime  64978 non-null  datetime64[ns, UTC]
 1   tweet_id  64978 non-null  int64              
 2   post      64978 non-null  object             
 3   user      64978 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(2)
memory usage: 2.0+ MB


### Dataset creation

In [37]:
# One source label per row of each scraped frame.
subset_covid = ['covid'] * len(covid_df)
subset_affair = ['affair'] * len(affair_df)

In [38]:
# Attach the source label as a 'set' column; both frames are modified in place.
for frame, labels in ((covid_df, subset_covid), (affair_df, subset_affair)):
    frame['set'] = labels

In [39]:
# Stack both subsets into one frame with a fresh 0..n-1 index.
data = pd.concat([covid_df, affair_df], ignore_index=True)

In [40]:
# Convert the whole timestamp column in a single vectorised call instead of one
# pd.to_datetime call per row, and without creating a global named `datetime`
# (which would shadow the standard-library module of the same name).
data['datetime'] = pd.to_datetime(data['datetime'])

In [41]:
# Print row count, column dtypes and non-null counts for the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82376 entries, 0 to 82375
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   datetime  82376 non-null  datetime64[ns, UTC]
 1   tweet_id  82376 non-null  int64              
 2   post      82376 non-null  object             
 3   user      82376 non-null  object             
 4   set       82376 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(3)
memory usage: 3.1+ MB


### Remove rows

*Empty rows*

In [42]:
# Discard every row that has a missing value in any column, then report the remaining row count.
data = data.dropna(axis=0, how='any')
print(data.shape[0])

82376


*Rows that contain both search query hashtags*

In [43]:
# Flag posts that contain both search hashtags, since they cannot be assigned to a single subset.
# A boolean mask is used instead of collecting enumerate() positions for DataFrame.drop():
# drop() works on index labels, and those only coincide with positions while the index is
# still a clean 0..n-1 range (dropna() does not reset it).
# NOTE(review): matching is case-sensitive, as before; confirm whether differently-cased
# variants of the hashtags should also be caught.
has_both_hashtags = (
    data['post'].str.contains('#ikdoenietmeermee', regex=False, na=False)
    & data['post'].str.contains('#toeslagenaffaire', regex=False, na=False)
)
data = data[~has_both_hashtags].reset_index(drop=True)
print(len(data))

82354


*Rows that contain no hashtags*

This can happen when the account is blocked by Twitter.

In [44]:
# Keep only posts that contain at least one '#'.
# A boolean mask selects rows directly, so the result does not depend on the index
# being a clean 0..n-1 range (enumerate() positions passed to drop() would).
has_hashtag = data['post'].str.contains('#', regex=False, na=False)
data = data[has_hashtag].reset_index(drop=True)
print(len(data))

82349


### Pseudonymization

*Pseudonymize usernames*

In [45]:
# Distinct usernames in random order.
names = [*set(data['user'])]
random.shuffle(names)

In [46]:
# Map each original username to 'Persoon<position in the shuffled list>'.
username_dict = {name: f'Persoon{position}' for position, name in enumerate(names)}

In [47]:
# Number of distinct usernames that received a pseudonym.
len(username_dict)

19608

In [48]:
# Look up the pseudonym for every row's author; an unmapped name raises KeyError.
new_usernames = list(map(username_dict.__getitem__, data['user']))

In [49]:
# Overwrite the original usernames with their pseudonyms.
data['user'] = new_usernames

*Pseudonymize tags*

In [50]:
# Collect every distinct whitespace-delimited token that contains '@', in random order.
# NOTE(review): tokens are taken verbatim, so '@name' and '@name,' count as different tags,
# and any other token containing '@' is included as well; confirm this is intended.
tags = list({word for text in data['post'] for word in text.split() if '@' in word})
random.shuffle(tags)

In [51]:
# Map each collected tag token to '@gebruiker<position in the shuffled list>'.
tag_dict = {tag: f'@gebruiker{position}' for position, tag in enumerate(tags)}

In [52]:
# Number of distinct tag tokens that received a pseudonym.
len(tag_dict)

13344

In [53]:
# Replace every tag token by its pseudonym and leave all other tokens unchanged.
# Splitting on whitespace and re-joining with single spaces also collapses any
# runs of whitespace (including newlines) in the post.
new_texts = [
    ' '.join(tag_dict.get(word, word) for word in text.split())
    for text in data['post']
]

In [54]:
# Overwrite the original post texts with the versions whose tags were replaced.
data['post'] = new_texts

In [55]:
# Show the first five rows to check the pseudonymized result.
data.head()

Unnamed: 0,datetime,tweet_id,post,user,set
0,2021-05-31 13:02:37+00:00,1399350603969613833,Wat een gezeik; tot eind september mondkapjes....,Persoon2795,covid
1,2021-05-31 10:21:26+00:00,1399310042717958149,Waarom zijn mensen bang voor vaccinatie? Het n...,Persoon19344,covid
2,2021-05-30 20:08:25+00:00,1399095371398324226,Ik krijg buikpijn van de steeds terugkerende v...,Persoon14610,covid
3,2021-05-30 17:05:06+00:00,1399049239607578625,Je zou maar #AstraZeneca hebben laten prikken....,Persoon1779,covid
4,2021-05-30 17:03:17+00:00,1399048780729622528,#samenleving #ikdoenietmeermee #nederlands Hal...,Persoon1410,covid


### Save datafiles

*Tweet IDs + labels for replication*

In [56]:
# Keep only the tweet id and the subset label.
replication_data = data.loc[:, ['tweet_id', 'set']]

In [57]:
# Write tweet ids and labels to CSV in the working directory, without the index column.
replication_data.to_csv('replication_data_12_09_2.csv', index=False)

*Whole set*

In [58]:
# Write the full pseudonymized dataset to CSV in the working directory, without the index column.
data.to_csv('complete_dataset_12_09_2.csv', index=False)