In [109]:
import pandas as pd
from dacite import from_dict
import json
from dataclasses import dataclass, field
from typing import Optional
import os
# Dataset switch. When truthy, the cells below read the "ia-control" user dump
# and prefix the lookup and output filenames with "control_"; otherwise the
# plain "ia" files are used.
CONTROL = True

In [110]:
@dataclass
class Tweet:
    """Typed view of one tweet record nested inside a user's JSON object."""

    id: int            # tweet id; the notebook keys its tweet lookup on this
    text: str          # tweet body
    created_at: str    # creation timestamp, kept as a string
    lang: str          # language code (the displayed data shows "en", "ja", "nl")
    source: str        # posting client; appears as an HTML anchor in the displayed data
    retweeted: bool    # "retweeted" flag copied from the record

@dataclass
class User:
    """Typed view of one user record (one JSON object per line of the users dump)."""

    id: int                       # user id; the notebook keys its user lookup on this
    screen_name: str              # account handle
    name: str                     # display name
    description: Optional[str]    # profile text; may be missing
    location: str                 # free-text profile location
    # Tweets attached to the user. default_factory gives every instance its own
    # empty list instead of sharing one mutable default.
    tweets: list[Tweet] = field(default_factory=list)

In [111]:
# Pick the input file: the "ia" user dump by default, or the control-group
# dump when CONTROL is set.
directory = "../datasets/users/ia"
filename = "users-ia.json"
if CONTROL:
    directory = "../datasets/users/ia-control"
    filename = "users-ia-control.json"
filepath = os.path.join(directory, filename)

# data:   user id  -> user record, whose "tweets" entry is reduced to a list of tweet ids
# tweets: tweet id -> tweet record, annotated with the owning "user_id"
data = {}
tweets = {}

# The file is parsed as JSON Lines (one user object per line). The encoding is
# pinned to UTF-8 so non-ASCII text (emoji, CJK) decodes identically on every
# platform instead of depending on the locale's default codec.
with open(filepath, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than letting json.loads fail on an empty string.
            continue
        datum = json.loads(line)
        data[datum["id"]] = datum
        for tweet in datum["tweets"]:
            tweet["user_id"] = datum["id"]
            # A tweet id seen again (under any user) overwrites the earlier entry.
            tweets[tweet["id"]] = tweet
        # Keep only the ids on the user record; the full tweets live in `tweets`.
        datum["tweets"] = [tweet["id"] for tweet in datum["tweets"]]

In [112]:
# Sanity check: number of distinct users, then number of distinct tweets.
print(len(data), len(tweets), sep="\n")

11610
51617


In [113]:
# Users table: one row per user record, plus bookkeeping columns that start at
# their "not yet queried" defaults and are filled in by the lookup cells.
users = [*data.values()]
users_df = pd.DataFrame(users).assign(
    from_ia=True,
    deleted=False,
    protected=False,
    last_query_date=None,
)

display(users_df)

# Tweets table: one row per tweet record, with the same kind of bookkeeping
# columns (both for the tweet itself and for its owning user).
tweet_list = [*tweets.values()]
tweets_df = pd.DataFrame(tweet_list).assign(
    from_ia=True,
    user_deleted=False,
    deleted=False,
    user_protected=False,
    last_query_date=None,
)

display(tweets_df)

Unnamed: 0,id,screen_name,name,description,location,tweets,from_ia,deleted,protected,last_query_date
0,2351009236,AKokolli,Arbnor Kokolli,,New York City,"[1134983435523694592, 1134944327829004295]",True,False,False,
1,593232251,nic_mar98,Nicole Marino 🌙,"""Art is not what I create. What I create is Ch...",New York City,"[1134983448119205898, 1163533794416181248, 111...",True,False,False,
2,78455847,typhanieluv,ObamaCrat,"LOVES LIFE, Politics, Music, Movies & Broadway...",NEW YORK CITY,"[1134977685162201089, 1134909141850513411, 116...",True,False,False,
3,3313384270,Laurettamylove,laurettalove,I AM LOVE-A Health-Nutrition Vegan & Fitness E...,NEW YORK CITY,"[1134977907435196417, 1134799980877811712, 113...",True,False,False,
4,29094983,Beauty_TheBeat,⚜️Creole Seasoning⚜️,Your Destani⚜️Private Pilot | First In Flight✈...,New York City,"[1134985943713140736, 1135001009682436096, 116...",True,False,False,
...,...,...,...,...,...,...,...,...,...,...
11605,467874137,PhilLiubicich,Phil Liubicich,NYC Fireman (ret) E Pluribus Unum 🇺🇸 Non Imped...,New York City,[1138923866691166213],True,False,False,
11606,199606358,WhtvrJeffSaid,Jeffrey Skinner-Perkins,nobody.,New York City,[1138922998470070272],True,False,False,
11607,30258547,Starcode,"Dave O'Connor, PhD",,New York City,[1138924953032626177],True,False,False,
11608,854918461,NikiHatzidis,Niki Hatzidis,Award Nominated Playwright/Features Writer @ O...,"New York City, New York",[1138927670941691904],True,False,False,


Unnamed: 0,id,text,created_at,lang,source,retweeted,user_id,from_ia,user_deleted,deleted,user_protected,last_query_date
0,1134983435523694592,RT @FootballFactly: No opposition player has c...,2019-06-02 00:41:52+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,2351009236,True,False,False,False,
1,1134944327829004295,RT @LFC: A moment we'll never forget.\n\n#SixT...,2019-06-01 22:06:28+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,2351009236,True,False,False,False,
2,1134983448119205898,RT @_justmateo_: They made their reservation j...,2019-06-02 00:41:55+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,593232251,True,False,False,False,
3,1163533794416181248,RT @Pathryn34: Remember when Sam Winchester to...,2019-08-19 19:30:48+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,593232251,True,False,False,False,
4,1117763116501278722,@nicosicko I just checked mine and it said the...,2019-04-15 12:14:28+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,593232251,True,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
51612,1138923866691166213,@FreakinLibs The end of porn,2019-06-12 21:39:44+00:00,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,467874137,True,False,False,False,
51613,1138922998470070272,The U.S. Department of State should issue a Tr...,2019-06-12 21:36:17+00:00,en,"<a href=""http://www.facebook.com/twitter"" rel=...",False,199606358,True,False,False,False,
51614,1138924953032626177,Had a great time at Pier 17 last night seeing ...,2019-06-12 21:44:03+00:00,en,"<a href=""https://tapbots.com/software/tweetbot...",False,30258547,True,False,False,False,
51615,1138927670941691904,Fillmore is helping me read scripts for @athen...,2019-06-12 21:54:51+00:00,en,"<a href=""http://instagram.com"" rel=""nofollow"">...",False,854918461,True,False,False,False,


In [115]:
# Names of the JSON Lines files holding the re-query results for users and
# for tweets; the control run uses the "control_"-prefixed variants.
user_lookup_filename = "queried_users_ia.jsonl"
tweet_lookup_filename = "queried_users_tweets_ia.jsonl"
if CONTROL:
    user_lookup_filename = "control_" + user_lookup_filename
    tweet_lookup_filename = "control_" + tweet_lookup_filename

# Fold each user lookup result into both tables. The per-line record is bound
# to `record` (not `data`) so the users dict built by the load cell is not
# overwritten, which keeps the earlier cells re-runnable.
with open(user_lookup_filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing in json.loads.
            continue
        record = json.loads(line)

        # Compute each row selection once per record instead of once per assignment.
        user_rows = users_df["id"] == record["id"]
        user_tweet_rows = tweets_df["user_id"] == record["id"]

        # A user that was not found is treated as deleted.
        users_df.loc[user_rows, "deleted"] = not record["found"]
        users_df.loc[user_rows, "last_query_date"] = record["queried_time"]
        if record["found"]:
            # "protected" is only read for found users; propagate it to the
            # user's row and to every tweet owned by that user.
            users_df.loc[user_rows, "protected"] = record["protected"]
            tweets_df.loc[user_tweet_rows, "user_protected"] = record["protected"]
        else:
            tweets_df.loc[user_tweet_rows, "user_deleted"] = True

        

In [116]:
# NOTE(review): `queried_users` is never populated in this cell; it is kept
# only in case a later cell relies on the name existing.
queried_users = {}

# Fold each tweet lookup result into the tweets table. The per-line record is
# bound to `record` (not `data`) so the users dict built by the load cell is
# not overwritten by this loop.
with open(tweet_lookup_filename, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing in json.loads.
            continue
        record = json.loads(line)

        # One scan of the id column per record instead of two.
        tweet_rows = tweets_df["id"] == record["id"]

        # A tweet that was not found is treated as deleted.
        tweets_df.loc[tweet_rows, "deleted"] = not record["found"]
        tweets_df.loc[tweet_rows, "last_query_date"] = record["queried_time"]



In [117]:

# Fraction of users that are either deleted or protected.
unavailable = users_df["deleted"] | users_df["protected"]
int(unavailable.sum()) / len(users_df)

0.13755383290267012

In [119]:
# Persist both annotated tables as pickles; the control run writes to
# "control_"-prefixed files so it does not overwrite the main run's output.
out_prefix = "control_" if CONTROL else ""
users_out_filename = out_prefix + "users_ia_hk.pkl"
tweets_out_filename = out_prefix + "tweets_ia_hk.pkl"

users_df.to_pickle(users_out_filename)
tweets_df.to_pickle(tweets_out_filename)

Unnamed: 0,id,text,created_at,lang,source,retweeted,user_id,from_ia,user_deleted,deleted,user_protected,last_query_date
0,1101628887279128576,I just hope the size chart was right! https://...,2019-03-01 23:42:48+00:00,en,"<a href=""https://www.shopperapproved.com/"" rel...",False,20838690,True,False,False,False,2021-12-27 11:44:26.230899
1,1101631420659724288,@sakurakharel @BuffordIanah @irmaroxas3 @grabe...,2019-03-01 23:52:52+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,734509989855961089,True,False,False,False,2021-12-27 11:42:01.525895
2,1101455255668088832,RT @irmaroxas3: @sakurakharel @BuffordIanah @g...,2019-03-01 12:12:51+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,734509989855961089,True,False,False,False,2021-12-27 11:49:01.190461
3,1101457453516873729,RT @LiesesMarryHK: @Elj1329 @aldubeth02 @pinky...,2019-03-01 12:21:35+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,734509989855961089,True,False,False,False,2021-12-27 11:45:08.141071
4,1101415388833701889,RT @grabesila: @sakurakharel @BuffordIanah @ir...,2019-03-01 09:34:26+00:00,nl,"<a href=""http://twitter.com/download/android"" ...",False,734509989855961089,True,False,False,False,2021-12-27 11:44:48.986565
...,...,...,...,...,...,...,...,...,...,...,...,...
86882,1095572048082944001,RT @vboykis: Confused about all the different ...,2019-02-13 06:35:05+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,205276657,True,False,False,False,2021-12-27 11:46:06.634750
86883,1095564561237737472,【ドラゴンボールZ ドッカンバトル】\nドッカンバトルはついに4周年！\n「力属性ガシャ」開...,2019-02-13 06:05:20+00:00,ja,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,4859048519,True,False,False,False,2021-12-27 11:49:36.685321
86884,1095396839430311936,ごちそうさま♡ https://t.co/uB8LusQUqS,2019-02-12 18:58:52+00:00,ja,"<a href=""http://twitter.com/download/android"" ...",False,774330398,True,False,False,False,2021-12-27 11:44:07.721638
86885,1095391588136542208,RT @WANIMAL912: 收工 https://t.co/grL9n19Haj,2019-02-12 18:38:00+00:00,ja,"<a href=""http://twitter.com/download/iphone"" r...",False,973162772547727360,True,False,False,False,2021-12-27 11:50:06.616330
