In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
import json
from datetime import datetime, timezone
from twarc import Twarc2

# parallelisation functionality
from multiprocess import Pool
import psutil
from tqdm import tqdm

import sys
sys.path.insert(0, "/home/jana/Projects/utilities/twitter_functions")
import twitter_functions as tf

In [2]:
# Input and output directories (currently the same folder).
src = "../../data/twitter/"
dst = "../../data/twitter/"

# Only the columns needed to select conversations are loaded; the three
# ID columns are parsed as strings.
cols = ["id", "conversation_id", "author_id", "reply_count"]
id_dtypes = {col: str for col in ("id", "conversation_id", "author_id")}
timeline_path = join(
    src,
    "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip",
)
tweets = pd.read_csv(
    timeline_path,
    usecols=cols,
    dtype=id_dtypes,
    compression="gzip",
)

# Get conversations

In [13]:
# Candidate tweets: more than 0 and at most 500 replies, ordered by
# ascending reply count.
in_reply_range = (tweets["reply_count"] <= 500) & (tweets["reply_count"] > 0)
subset = (
    tweets[in_reply_range]
    .sort_values(by="reply_count", ascending=True)
    .reset_index(drop=True)
)

# get only tweets that started a conversation
subset = subset[subset["id"] == subset["conversation_id"]]
# select every third conversation; .copy() makes the result an independent
# frame so the column assignment below does not write into a slice
# (which triggers pandas' SettingWithCopyWarning)
subset = subset.iloc[0::3].copy()
# strip literal double quotes from the IDs (plain-text replace, no regex)
subset["id"] = subset["id"].str.replace('"', '', regex=False)
subset = subset.drop_duplicates(subset=["id"])
# one conversation ID per line
np.savetxt(join(dst, "conversation_id_sample_01.txt"), subset["id"].values, fmt='%s')

In [21]:
# Read the three conversation-ID sample files; each becomes a set of ID strings.
sample_files = (
    "conversation_id_sample_01.txt",
    "conversation_id_sample_02.txt",
    "conversation_id_sample_03.txt",
)
id_sample_1, id_sample_2, id_sample_3 = (
    set(np.loadtxt(join("../../data/twitter", name), dtype=str))
    for name in sample_files
)

In [22]:
# Keep in sample 2 only the IDs that occur in neither sample 1 nor sample 3.
id_sample_2 = id_sample_2.difference(id_sample_1, id_sample_3)

In [25]:
# Keep in sample 3 only the IDs that occur in neither sample 1 nor the
# current sample 2.
id_sample_3 = id_sample_3.difference(id_sample_1, id_sample_2)

In [33]:
# Write the de-duplicated samples 2 and 3 back to disk, one ID per line.
for sample_file, sample_ids in (
    ("conversation_id_sample_02.txt", id_sample_2),
    ("conversation_id_sample_03.txt", id_sample_3),
):
    np.savetxt(join(dst, sample_file), list(sample_ids), fmt='%s')

In [38]:
# Copy the sample-02 ID list to /data/honesty/ on remotejana via rsync over ssh.
! rsync -avze ssh ../../data/twitter/conversation_id_sample_02.txt jana@remotejana:/data/honesty/ 

sending incremental file list
conversation_id_sample_02.txt

sent 4,228,164 bytes  received 35 bytes  768,763.45 bytes/sec
total size is 8,691,904  speedup is 2.06


In [16]:
# split batch into smaller batches
sample = "03"
# ndmin=1 keeps the result one-dimensional even if the file holds a single ID
id_sample = np.loadtxt(join("../../data/twitter",
                            f"conversation_id_sample_{sample}.txt"),
                       dtype=str, ndmin=1)
N_keys = 8
# Nominal batch size, derived from the sample that was just loaded (not from
# a variable left over from another cell).
batch_size = len(id_sample) // N_keys

# np.array_split returns exactly N_keys contiguous chunks that together cover
# every ID; chunk lengths differ by at most one. Slicing with the 1-based
# batch number as a 0-based start would skip the first chunk and leave only
# the remainder for the last one.
id_batches = np.array_split(id_sample, N_keys)
assert sum(len(id_batch) for id_batch in id_batches) == len(id_sample)

# batch files are numbered 01 .. N_keys
for i, id_batch in enumerate(id_batches, start=1):
    np.savetxt(join(dst, "conversation_id_sample_{}_batch_{:02d}.txt"\
                    .format(sample, i)), id_batch, fmt='%s')

In [18]:
# Copy all sample-03 batch files to the project data folder on medea via rsync over ssh.
! rsync -avze ssh ../../data/twitter/conversation_id_sample_03_batch_* jlasser@medea:/home/jlasser/Honesty-project/data/twitter/ 

sending incremental file list
conversation_id_sample_03_batch_01.txt
conversation_id_sample_03_batch_02.txt
conversation_id_sample_03_batch_03.txt
conversation_id_sample_03_batch_04.txt
conversation_id_sample_03_batch_05.txt
conversation_id_sample_03_batch_06.txt
conversation_id_sample_03_batch_07.txt
conversation_id_sample_03_batch_08.txt

sent 3,608,531 bytes  received 168 bytes  656,127.09 bytes/sec
total size is 7,690,733  speedup is 2.13


Conversation collection:

| key | server | batch | id sample | status |
| --- | ------ | ----- | --------- | ------ |
| Johannes W | medea | 01 | 1 | collected |
| Katharina | medea | 02 | 1 | collected |
| Malte | medea | 03 | 1 | collected |
| Seve | medea | 04 | 1 | collected |
| Almog | medea | 05 | 1 | collected |
| William | medea | 06 | 1 | collected |
| Michi | medea | 07 | 1 | collected |
| Simon | medea | 08 | 1 | collected |
| Flo | remotejana | full | 2 | running |
| Johannes W | medea | 01 | 3 | running |
| Katharina | medea | 02 | 3 | running |
| Malte | medea | 03 | 3 | running |
| Seve | medea | 04 | 3 | running |
| Almog | medea | 05 | 3 | running |
| William | medea | 06 | 3 | running |
| Michi | medea | 07 | 3 | running |

Started collection of id sample 3 on medea: 2022-04-06

In [None]:
# For every ID listed in conversation_id_sample_03_batch_08.txt, xargs substitutes
# the ID for each {} and runs twarc2, writing to <ID>.jsonl in the corpus folder.
# NOTE(review): `--bearer-token` is followed directly by `conversation`, so no
# token value is present on this line (presumably removed before sharing) and
# `conversation` would be parsed as the token — supply the token before running.
cat conversation_id_sample_03_batch_08.txt | xargs -i sh -c "twarc2 --bearer-token conversation --archive {} /data/honesty/corpora/Twitter/US_politician_twitter_conversations/{}.jsonl"

In [92]:
# For every ID listed in conversation_id_sample.txt, xargs substitutes the ID for
# each {} and runs `twarc2 conversation --archive <ID>`, writing the result to
# US_politician_twitter_conversations/<ID>.jsonl (paths relative to the working directory).
cat conversation_id_sample.txt | xargs -i sh -c "twarc2 conversation --archive {} US_politician_twitter_conversations/{}.jsonl"

cat: '!': No such file or directory
cat: ls: No such file or directory
100%|███████████████| Processed 15 years/15 years [00:01<00:00, 1 tweets total ]
100%|███████████████| Processed 15 years/15 years [00:01<00:00, 1 tweets total ]
100%|███████████████| Processed 15 years/15 years [00:01<00:00, 1 tweets total ]
100%|███████████████| Processed 15 years/15 years [00:01<00:00, 1 tweets total ]
100%|███████████████| Processed 15 years/15 years [00:01<00:00, 1 tweets total ]


### Get missing conversation IDs

In [8]:
# All sampled conversation IDs, as a set of strings.
sample_path = join("../../data/twitter", "conversation_id_sample.txt")
id_sample = set(np.loadtxt(sample_path, dtype=str))

In [10]:
# IDs of conversations already on disk: the file name up to its first dot.
existing_conversations = {
    filename.split(".")[0]
    for filename in os.listdir("../../data/twitter/US_politician_twitter_conversations/")
}

In [12]:
# Sampled IDs for which no conversation file exists yet.
missing_conversations = list(id_sample - existing_conversations)
len(missing_conversations)

236108

In [14]:
# Distribute the missing IDs round-robin over N_SPLITS files: split i takes
# every N_SPLITS-th ID starting at offset i - 1, so each ID ends up in exactly
# one file. (A stride of 3 with 8 files produced overlapping splits and never
# wrote the ID at index 0.)
N_SPLITS = 8
for i in range(1, N_SPLITS + 1):
    np.savetxt(join("../../data/twitter", "missing_conversation_ids_sample_split_{:02d}.txt".format(i)),
               missing_conversations[i - 1::N_SPLITS], fmt='%s')

# Collect conversations

In [3]:
# Fetch a single converted conversation CSV from medea into the current directory.
! rsync -avze ssh jlasser@medea:/data/honesty/corpora/Twitter/US_politician_twitter_conversations_csv/1059564601501720577.csv .

receiving incremental file list
1059564601501720577.csv

sent 43 bytes  received 2,724 bytes  5,534.00 bytes/sec
total size is 13,912  speedup is 5.03


In [4]:
# Read the tweet/user ID columns as strings, as is done for the timeline
# file: an ID column with missing values would otherwise be parsed as
# float64, which cannot represent 19-digit IDs exactly.
id_cols = [
    "id", "conversation_id",
    "referenced_tweets.replied_to.id", "referenced_tweets.retweeted.id",
    "referenced_tweets.quoted.id",
    "author_id", "in_reply_to_user_id", "retweeted_user_id", "quoted_user_id",
]
test = pd.read_csv("1059564601501720577.csv", dtype={col: str for col in id_cols})

In [7]:
# Display the frame loaded from the downloaded CSV.
test

Unnamed: 0,id,conversation_id,referenced_tweets.replied_to.id,referenced_tweets.retweeted.id,referenced_tweets.quoted.id,author_id,in_reply_to_user_id,retweeted_user_id,quoted_user_id,created_at,...,geo.geo.bbox,geo.geo.type,geo.id,geo.name,geo.place_id,geo.place_type,__twarc.retrieved_at,__twarc.url,__twarc.version,Unnamed: 73
0,1059819140989493253,1059564601501720577,1059564601501720577,,,3397423293,2253968388,,,2018-11-06T14:45:48.000Z,...,,,,,,,2022-04-08T06:25:05+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.10.1,
1,1059603492707467264,1059564601501720577,1059564601501720577,,,917023309664968704,2253968388,,,2018-11-06T00:28:53.000Z,...,,,,,,,2022-04-08T06:25:05+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.10.1,
2,1059588002748469248,1059564601501720577,1059564601501720577,,,827287651296354304,2253968388,,,2018-11-05T23:27:20.000Z,...,,,,,,,2022-04-08T06:25:05+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.10.1,
3,1059565037428400128,1059564601501720577,1059564601501720577,,,799113571481358336,2253968388,,,2018-11-05T21:56:05.000Z,...,,,,,,,2022-04-08T06:25:05+00:00,https://api.twitter.com/2/tweets/search/all?ex...,2.10.1,


In [6]:
# List the column names of the loaded CSV.
test.columns

Index(['id', 'conversation_id', 'referenced_tweets.replied_to.id',
       'referenced_tweets.retweeted.id', 'referenced_tweets.quoted.id',
       'author_id', 'in_reply_to_user_id', 'retweeted_user_id',
       'quoted_user_id', 'created_at', 'text', 'lang', 'source',
       'public_metrics.like_count', 'public_metrics.quote_count',
       'public_metrics.reply_count', 'public_metrics.retweet_count',
       'reply_settings', 'possibly_sensitive', 'withheld.scope',
       'withheld.copyright', 'withheld.country_codes', 'entities.annotations',
       'entities.cashtags', 'entities.hashtags', 'entities.mentions',
       'entities.urls', 'context_annotations', 'attachments.media',
       'attachments.media_keys', 'attachments.poll.duration_minutes',
       'attachments.poll.end_datetime', 'attachments.poll.id',
       'attachments.poll.options', 'attachments.poll.voting_status',
       'attachments.poll_ids', 'author.id', 'author.created_at',
       'author.username', 'author.name', 'author

In [None]:
# Convert every conversation listed in conversations_to_convert.txt from JSONL to CSV.
# Each `!` line runs in its own subshell, so a stand-alone `! cd` does not affect
# the next line; the cd is chained with && so the conversion runs in that directory.
! cd /home/jana/Projects/CSS_honesty/analysis/data/twitter && cat conversations_to_convert.txt | xargs -i sh -c "twarc2 csv US_politician_twitter_conversations/{}.jsonl US_politician_twitter_conversations_csv/{}.csv"

In [245]:
# Load every converted conversation CSV and stack them into one frame.
src = "../../data/twitter/US_politician_twitter_conversations_csv"
files = os.listdir(src)
# NOTE(review): the column listing of the sample CSV shown in this notebook has
# "referenced_tweets.replied_to.id" etc. but is truncated; confirm that a plain
# "referenced_tweets" column exists, otherwise usecols raises a ValueError.
cols = ["id", "author.id",
        "conversation_id", "created_at", "lang", "text",
        "public_metrics.retweet_count", "public_metrics.like_count",
        "public_metrics.quote_count","public_metrics.reply_count",
        "entities.urls", "geo.place_id", "referenced_tweets"]

# Collect the per-file frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies all previously loaded rows for every file.
# NOTE(review): process_csv is not defined in the visible part of the notebook —
# it must be defined or imported before this cell runs.
frames = []
for f in files:
    tmp = pd.read_csv(join(src, f), usecols=cols)
    tmp = process_csv(tmp)
    frames.append(tmp)

# pd.concat raises on an empty list, so keep the empty-frame result when the
# directory contains no files.
conversations = pd.concat(frames) if frames else pd.DataFrame()