In [1]:
from jorts_utils import *
# Root directory of the input JSON files; later cells build every data path from it.
DATA_PREFIX = './data'

In [2]:
import json
# Load Jorts' follower list. The JSON object maps a key (presumably a timestamp,
# given the loop variable name -- confirm against the data) to the list of hashed
# follower IDs recorded under that key.
# The context manager closes the file handle as soon as parsing finishes instead
# of leaving it open until garbage collection.
with open(
    '{}/HASHED_jorts_follower_data_by_cursor_all.json'.format(DATA_PREFIX), 'r'
) as followers_file:
    jorts_followers_time_bounded = json.load(followers_file)

# Invert the mapping: hashed follower ID -> key under which it was listed.
# If an ID is listed under several keys, the last one iterated wins.
jorts_followers_ts = {}
for ts, followers_gained in jorts_followers_time_bounded.items():
    for f in followers_gained:
        jorts_followers_ts[f] = ts


In [34]:
import pandas as pd
# Mapping of retweeted account (hashed ID) -> retweet date string; both values are
# substituted into the per-account file names read further down in this cell.
# The context manager closes the file handle as soon as parsing finishes.
with open('{}/HASHED_jorts_rt_authors_to_ts.json'.format(DATA_PREFIX), 'r') as rt_file:
    rt_to_ts = json.load(rt_file)
# Directory prefix for the per-account follower files.
prefix = '{}/jorts/'.format(DATA_PREFIX)

"""
These data structures map each followed account to a dict with the following structure:
{
    True: {
        ts1: estimated_jorts_followers_gained_on_ts1,
        ts2: estimated_jorts_followers_gained_on_ts2,
        ...
        ts14: estimated_jorts_followers_gained_on_ts14,
    },
    False: {
        ts1: estimated_non_jorts_followers_gained_on_ts1,
        ts2: estimated_non_jorts_followers_gained_on_ts2,
        ...
        ts14: estimated_non_jorts_followers_gained_on_ts14,
    },
}
For followers_gained_post_jorts, ts1 through ts14 run from the day of the retweet until 14 days after it.
For followers_gained_pre_jorts, ts1 through ts14 run from 14 days before the retweet until the day before it.
"""

"""
Here we pull full sets of followers who followed within the two-week period, 
with the aim of producing a mapping of dates the attention broker retweeted
to the set of hashed user IDs who followed a retweeted account.

With these "samples" in hand, we can simulate a mark/recapture experiment. 
Each day of retweets is one capturing/marking run. 
Each time we see a user's hashed ID, we "mark" them; 
if we encounter them again, it is a "recapture" event.

We can then use Program MARK (free software) and its R interface, RMark, to estimate the total population
of eligible followers.
"""

# Both dicts map a retweet date (the values of rt_to_ts) to a list of hashed IDs
# of users who followed an account retweeted on that date:
#   ab_followers_ts     -- users who also appear in jorts_followers_ts
#   non_ab_followers_ts -- users who do not
non_ab_followers_ts = {}
ab_followers_ts = {}


def _load_json_or_none(path):
    """Parse the JSON file at ``path``; on failure print the error and return None.

    Catches OSError (e.g. a missing file) and ValueError (e.g. malformed JSON).
    The ``with`` block closes the file handle as soon as parsing finishes.
    """
    try:
        with open(path, 'r') as fh:
            return json.load(fh)
    except (OSError, ValueError) as e:
        print(e)
        return None


for rt_acct, rt_ts in rt_to_ts.items():
    # All three window files go through the same guarded loader, so a missing or
    # unreadable file for ANY window skips this account rather than aborting the
    # whole loop and leaving the result dicts half-populated.
    follows_after = _load_json_or_none(
        prefix + 'HASHED_{}_following_data_2_weeks_after_{}_all.json'.format(
            rt_acct, rt_ts
        )
    )
    if follows_after is None:
        continue

    follows_on = _load_json_or_none(
        prefix + 'HASHED_{}_following_data_post_{}_all.json'.format(
            rt_acct, rt_ts
        )
    )
    if follows_on is None:
        continue

    follows_before = _load_json_or_none(
        prefix + 'HASHED_{}_following_data_2_weeks_pre_{}_all.json'.format(
            rt_acct, rt_ts
        )
    )
    if follows_before is None:
        continue

    # Each file maps some key to a list of hashed follower IDs; flatten every
    # file into a single set of IDs.
    follows_after_set = set().union(*follows_after.values())
    follows_on_set = set().union(*follows_on.values())
    follows_before_set = set().union(*follows_before.values())

    # Everyone seen following this account in any of the three windows.
    tot_followers = list(
        set().union(follows_on_set, follows_before_set, follows_after_set)
    )
    ab_followers = [f for f in tot_followers if f in jorts_followers_ts]
    not_ab_followers = [f for f in tot_followers if f not in jorts_followers_ts]

    # Several accounts can share a retweet date, so append to that date's list.
    non_ab_followers_ts.setdefault(rt_ts, []).extend(not_ab_followers)
    ab_followers_ts.setdefault(rt_ts, []).extend(ab_followers)

[Errno 2] No such file or directory: './data/jorts/HASHED_20d9e4cae225fb9b1c0774dd5c72f6e848b89594_following_data_2_weeks_after_20220115_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_53b53a14d8d76abac1134a11853ece2a8b9efdad_following_data_2_weeks_after_20220301_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_e9efd0343979c4c8a6028bd232189bfb5e3e0309_following_data_2_weeks_after_20220224_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_2b9769fd1303dd095c1880420cfcb7ae769b1727_following_data_2_weeks_after_20220220_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_f7766c1a6617cb542922b1570f49aec461235af2_following_data_2_weeks_after_20220209_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_fed00522b7bb4fd15df5632173a0c7d1a67b654d_following_data_2_weeks_after_20220214_all.json'
[Errno 2] No such file or directory: './data/jorts/HASHED_3680d5ebbb74e32b9c78b18dcd331b1cc80dcdff_following_data_2_we

In [35]:
# Build one capture history per user who does NOT follow the attention broker.
# A history holds one entry per retweet date, in ascending date-key order:
# 1 if the user is in that date's follower list ("captured"), otherwise 0.

# Every user that shows up on at least one date.
tot_non_ab_followers = set().union(*non_ab_followers_ts.values())

tot_non_ab_followers_to_seqs = {user: [] for user in tot_non_ab_followers}

for day in sorted(non_ab_followers_ts):
    captured_today = set(non_ab_followers_ts[day])
    for user, history in tot_non_ab_followers_to_seqs.items():
        history.append(1 if user in captured_today else 0)
            
    

In [36]:
from collections import Counter
# Mostly a sanity check that some recapture events exist: count how many users
# share each capture history, with the history rendered as a string of digits.
c = Counter(
    ''.join(map(str, history))
    for history in tot_non_ab_followers_to_seqs.values()
)

In [37]:
# Write every individual capture history to disk: one history per line, each
# terminated by ';'. A history shared by N users is written N times.

with open('non_f_outf_indiv.txt', 'w') as f:
    # Counter.elements() repeats each history as many times as its count.
    for seq in c.elements():
        f.write(seq + ';\n')
            


In [46]:
# Build one capture history per user who follows the attention broker.
# A history holds one entry per retweet date, in ascending date-key order:
# 1 if the user is in that date's follower list ("captured"), otherwise 0.

# Every user that shows up on at least one date.
tot_ab_followers = set().union(*ab_followers_ts.values())

tot_ab_followers_to_seqs = {user: [] for user in tot_ab_followers}

for day in sorted(ab_followers_ts):
    captured_today = set(ab_followers_ts[day])
    for user, history in tot_ab_followers_to_seqs.items():
        history.append(1 if user in captured_today else 0)
            
    

In [47]:
# sanity check
from collections import Counter
# Count how many attention-broker followers share each capture history
# (rendered as a string of digits).
c = Counter(
    ''.join(map(str, history))
    for history in tot_ab_followers_to_seqs.values()
)

In [48]:
# Write every individual capture history to disk: one history per line, each
# terminated by ';'. A history shared by N users is written N times.
with open('ab_foll_outf_indiv.txt', 'w') as f:
    # Counter.elements() repeats each history as many times as its count.
    for seq in c.elements():
        f.write(seq + ';\n')

In [None]:
# Population estimates using RMark and a POPAN model:
# Non-Followers: 17890822 
# Followers: 163987.4 