In [1]:
from jorts_utils import *
import json

# Loading JKR's partial follower list
DATA_PREFIX = './data'

jkr_followers_time_bounded = json.load(
    open(
        '{}/HASHED_jkr_followers_full_past_20180615.json'.format(DATA_PREFIX), 'rb'
    )
)

# Map each follower user ID to the time at which they followed JKR. 
jkr_followers_ts = {}
for ts, followers_gained in jkr_followers_time_bounded.items():
    for f in followers_gained:
        jkr_followers_ts[f] = ts

In [3]:
import pandas as pd

df_post_2018 = pd.read_csv('{}/HASHED_jkr_rts_past_20180615.tsv'.format(DATA_PREFIX), sep='\t')
rts_post_2018 = df_post_2018['hashed_user_id'].tolist() # retweeted handles
tss_post_2018 = df_post_2018['created_at'].apply(ts_hyphen_transform) # timestamps of RTs
prefix = '{}/jkr_json/'.format(DATA_PREFIX)

"""
These data structures map each followed account to a dict with the following structure:
{
    True: {
        ts1: estimated_jkr_followers_gained_on_ts1,
        ts2: estimated_jkr_followers_gained_on_ts2,
        ...
        ts14: estimated_jkr_followers_gained_on_ts14,
    },
    False: {
        ts1: estimated_non_jkr_followers_gained_on_ts1,
        ts2: estimated_non_jkr_followers_gained_on_ts2,
        ...
        ts14: estimated_non_jkr_followers_gained_on_ts14,
    },
}
For followers_gained_post_jkr, ts1 through ts14 are the days of the retweet until 14 days after it.
For followers_gained_pre_jkr, ts1 is 14 days before the retweet up until the day before the retweet.
"""

"""
Here we pull full sets of followers who followed within the two-week period, 
with the aim of producing a mapping of dates the attention broker retweeted
to the set of hashed user IDs who followed a retweeted account.

With these "samples" in hand, we can simulate a mark/recapture experiment. 
Each day of retweets is one capturing/marking run. 
Each time we see a user's hashed ID, we "mark" them; 
if we encounter them again, it is a "recapture" event.

We can then use Project MARK's free software and R package to estimate the total population
of eligible followers. 
"""

non_ab_followers_ts = {}
ab_followers_ts = {}
for rt_acct, rt_ts in zip(rts_post_2018, tss_post_2018):
    amplify_ts_dt = dt.datetime.strptime(rt_ts, '%Y%m%d') # time of amplification
    after_ts_dt = amplify_ts_dt + dt.timedelta(weeks=2) # 2 weeks after; cutoff
    before_ts_dt = amplify_ts_dt - dt.timedelta(weeks=2) # 2 weeks prior; cutoff

    after_ts = dt.datetime.strftime(after_ts_dt, '%Y%m%d')
    before_ts = dt.datetime.strftime(before_ts_dt, '%Y%m%d')

    follows_on = json.load(
        open(prefix + 'HASHED_{}_following_data_pre_{}_all.json'.format(
            rt_acct, rt_ts
        ), 'r',
    )) 


    follows_before = json.load(
        open(prefix + 'HASHED_{}_following_data_pre_{}_all.json'.format(
            rt_acct, before_ts
        ), 'r',
    ))
    


    follows_after = json.load(
        open(prefix + 'HASHED_{}_following_data_pre_{}_all.json'.format(
            rt_acct, after_ts
        ), 'r',
    ))
    follows_on_set = set()
    for v in follows_on.values():
        follows_on_set = follows_on_set.union(set(v))
        
    follows_before_set = set()
    for v in follows_before.values():
        follows_before_set = follows_before_set.union(set(v))
    
    
    follows_after_set = set()
    for v in follows_after.values():
        follows_after_set = follows_after_set.union(set(v))
        
    tot_followers = list(set().union(*(follows_on_set, follows_before_set, follows_after_set)))
    ab_followers = [f for f in tot_followers if f in jkr_followers_ts]
    not_ab_followers = [f for f in tot_followers if f not in jkr_followers_ts]
    if rt_ts in non_ab_followers_ts:
        non_ab_followers_ts[rt_ts] += not_ab_followers
    else:
        non_ab_followers_ts[rt_ts] = not_ab_followers
    if rt_ts in ab_followers_ts:
        ab_followers_ts[rt_ts]+= ab_followers
    else:
        ab_followers_ts[rt_ts] = ab_followers


In [5]:
tot_non_ab_followers = set()
for v in non_ab_followers_ts.values():
    tot_non_ab_followers = tot_non_ab_followers.union(set(v))
    
tot_non_ab_followers_to_seqs = {k: [] for k in list(tot_non_ab_followers)}

# construct sequences of captures/non-captures for non-followers of the attention broker;
# a 1 means the user was "captured" (i.e. followed an account on that day)
# and a zero means the user was not "captured" on that particular day.

for k, v in sorted(non_ab_followers_ts.items(), key=lambda b: b[0]):
    in_list = set(v)
    for t in tot_non_ab_followers_to_seqs.keys():
        if t in in_list:
            tot_non_ab_followers_to_seqs[t] += [1]
        else:
            tot_non_ab_followers_to_seqs[t] += [0]
            
    

In [6]:
from collections import Counter
# this is mostly a sanity check to make sure we have some recapture events.
c = Counter([''.join([str(vv) for vv in v]) for v in tot_non_ab_followers_to_seqs.values()])

In [None]:
# write all capture histories to disk
with open('jkr_non_f_outf_indiv.txt', 'w') as f:
    for seq, ct in c.items():
        for s in range(ct):
            f.write(seq + ';\n')

In [7]:
# take a 10% random sample of individuals captured
# MARK can't handle a gigabyte of data, which is what the full dataset requires, 
# so we take a 10% random sample of capture histories. 
# we then multiply the population number MARK returns by 10 to get our final result.
import random
with open('./jkr_non_f_outf_indiv.txt', 'r') as f:
    with open('ten_pct_jkr_non_outf_indiv.txt', 'w') as outf:
        for line in f.readlines():
            if random.randint(0, 10) == 5:
                outf.write(line)

In [8]:
# construct capture histories for followers of the attention broker
tot_ab_followers = set()
for v in ab_followers_ts.values():
    tot_ab_followers = tot_ab_followers.union(set(v))
    
tot_ab_followers_to_seqs = {k: [] for k in list(tot_ab_followers)}

for k, v in sorted(ab_followers_ts.items(), key=lambda b: b[0]):
    in_list = set(v)
    for t in tot_ab_followers_to_seqs.keys():
        if t in in_list:
            tot_ab_followers_to_seqs[t] += [1]
        else:
            tot_ab_followers_to_seqs[t] += [0]
            
    

In [9]:
c = Counter([''.join([str(vv) for vv in v]) for v in tot_ab_followers_to_seqs.values()])

In [10]:
# write capture histories to disk. 
with open('jkr_foll_outf_indiv.txt', 'w') as f:
    for seq, ct in c.items():
        for s in range(ct):
            f.write(seq + ';\n')

In [3]:
# Results, using RMark and the POPAN model.
# followers: 841164.4
# non-followers: 26758530