In [41]:
import pandas as pd
from dacite import from_dict
import json
from dataclasses import dataclass, field
from typing import Optional
import os
CONTROL = False

In [42]:
@dataclass
class Tweet:
    id: int
    text: str
    created_at: str
    lang: str
    source: str
    retweeted: bool

@dataclass
class User:
    id: int
    screen_name: str
    name: str
    description: Optional[str]
    location: str
    tweets: list[Tweet] = field(default_factory=list)

In [43]:
USERS_DIR = "../datasets/users/ia"
USERS_FILENAME = "users.json"
if CONTROL:
    USERS_DIR += "/control"
USERS_FILEPATH = os.path.join(USERS_DIR, USERS_FILENAME)


QUERY_DIR = "../datasets/queried"
QUERIED_USERS_FILENAME = "users.jsonl"
QUERIED_TWEETS_FILENAME = "tweets.jsonl"

if CONTROL:
    QUERIED_USERS_FILENAME = "control_" + QUERIED_USERS_FILENAME
    QUERIED_TWEETS_FILENAME = "control_" + QUERIED_TWEETS_FILENAME
    
QUERIED_USERS_FILEPATH = os.path.join(QUERY_DIR, QUERIED_USERS_FILENAME)
QUERIED_TWEETS_FILEPATH = os.path.join(QUERY_DIR, QUERIED_TWEETS_FILENAME)


OUTDIR = "../datasets/pandas"
USER_DB_FILENAME = "users_ia.pkl"
TWEET_DB_FILENAME = "tweets_ia.pkl"
if CONTROL:
    USER_DB_FILENAME = "control_" + USER_DB_FILENAME
    TWEET_DB_FILENAME = "control_" + TWEET_DB_FILENAME

In [44]:
# Load all users and tweets.
data = {}
tweets = {}
with open(USERS_FILEPATH, "r") as f:
    for line in f:
        datum = json.loads(line.strip()) #from_dict(data_class=User, data=json.loads(line.strip()))
        data[datum["id"]] = datum
        for tweet in datum["tweets"]:
            tweet["user_id"] = datum["id"]
            tweets[tweet["id"]] = tweet
        datum["tweets"] = [tweet["id"] for tweet in datum["tweets"]]

In [45]:
print(len(data))
print(len(tweets))
print(USERS_FILEPATH)

35858
264988
../datasets/users/ia/users.json


In [35]:
users = list(data.values())
users_df = pd.DataFrame.from_dict(users)
users_df["from_ia"] = True
users_df["deleted"] = False
users_df["protected"] = False
users_df["last_query_date"] = None

display(users_df)

tweet_list = list(tweets.values())
tweets_df = pd.DataFrame.from_dict(tweet_list)
tweets_df["from_ia"] = True
tweets_df["user_deleted"] = False
tweets_df["deleted"] = False
tweets_df["user_protected"] = False

tweets_df["last_query_date"] = None
display(tweets_df)

Unnamed: 0,id,screen_name,name,description,location,tweets,from_ia,deleted,protected,last_query_date
0,995057353169997825,RCnycLI,RC,🚇⚓️💾⚾️🏒,MTA New York City Transit,[1123324188968783876],True,False,False,
1,271563785,michaelkasdan,Michael Kasdan,"IP attny, Adjunct Prof @NYULaw, Dir. Special P...",New York City,"[1123330027452551169, 1123211089548840961, 112...",True,False,False,
2,2785094659,nyc_tykes,New York City Tykes,barnsley fans of new york city!!,New York City,"[1123319315170828289, 1096157568714637317, 111...",True,False,False,
3,815587149051494401,aleks_lmly,^aleks loves jooheon^,"kpop? 🤠 multi stan (I LOVE, SUPPORT, AND ACCEP...",New York City,"[1123326701369548802, 1122292918667051008, 113...",True,False,False,
4,24848469,Yankeefan2975,Deidre,I love the Yankees. I like meeting new people....,New York City,"[1123326726510137344, 1123251296180559874, 112...",True,False,False,
...,...,...,...,...,...,...,...,...,...,...
18030,55354567,ErickGordon,Erick Gordon,Teacher. Writer. Maker. Project Builder. Excel...,New York City,[1080253405480636416],True,False,False,
18031,1076440976732737537,ayyzazu,zazu,hey im zazu n this is my art twit. main is @st...,"Stark Tower, New York City",[1080255708153532416],True,False,False,
18032,994146246032998401,caIienteo,Teo,#WestWorld #WSQ OCRP 18+ :|| Mateo Elías — req...,New York City,[1080057384687423488],True,False,False,
18033,1077470326878027776,kvrlasroses,𝖗𝖔𝖗𝖆,"man, i feel like cleopatra, joan of arc, queen...","new york city, babe",[1080064045259153408],True,False,False,


Unnamed: 0,id,text,created_at,lang,source,retweeted,user_id,from_ia,user_deleted,deleted,user_protected,last_query_date
0,1123324188968783876,@LIRR Why is it a black subway station sign ra...,2019-04-30 20:32:11+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,995057353169997825,True,False,False,False,
1,1123330027452551169,RT @krassenstein: BREAKING: New Details Emerg...,2019-04-30 20:55:23+00:00,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,271563785,True,False,False,False,
2,1123211089548840961,Totally normal behavior for innocent people. \...,2019-04-30 13:02:46+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,271563785,True,False,False,False,
3,1123433307998433280,RT @GoodMenProject: Whether in the pros or in ...,2019-05-01 03:45:47+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,271563785,True,False,False,False,
4,1123450605274640388,RT @JoCornell4: Thoughtful article on football...,2019-05-01 04:54:31+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,271563785,True,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
132854,1080253405480636416,RT @parisreview: “The fact is that as times ch...,2019-01-02 00:04:16+00:00,en,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,55354567,True,False,False,False,
132855,1080255708153532416,summer of heat~ https://t.co/RIMc613ZCd,2019-01-02 00:13:25+00:00,en,"<a href=""http://twitter.com/download/iphone"" r...",False,1076440976732737537,True,False,False,False,
132856,1080057384687423488,@burntveins dm me!,2019-01-01 11:05:21+00:00,en,"<a href=""http://twitter.com/download/android"" ...",False,994146246032998401,True,False,False,False,
132857,1080064045259153408,@Ificouldflyxhaz è una delle persone più impor...,2019-01-01 11:31:49+00:00,it,"<a href=""http://twitter.com/download/iphone"" r...",False,1077470326878027776,True,False,False,False,


In [36]:
# Populate user & tweet DB with user query status
# (is user deleted? protected? when was this query made?)


with open(QUERIED_USERS_FILEPATH, "r") as f:
    for line in f:
        line = line.strip()
        data = json.loads(line)
        #print(data["protected"])
        #print(data.keys())
        users_df.loc[users_df["id"] == data["id"], "deleted"] = not data["found"]
        users_df.loc[users_df["id"] == data["id"], "last_query_date"] = data["queried_time"]
        if data["found"]:
            users_df.loc[users_df["id"] == data["id"], "protected"] = data["protected"]
            tweets_df.loc[tweets_df["user_id"] == data["id"], "user_protected"] = data["protected"]
        else:
            tweets_df.loc[tweets_df["user_id"] == data["id"], "user_deleted"] = not data["found"]

        

In [37]:
# Populate tweet DB with query status (was Tweet deleted?)
i = 0
with open(QUERIED_TWEETS_FILEPATH, "r") as f:
    for line in f:
        i += 1
        if i % 10000 == 0:
            print(f"{int(10000*i/len(tweet_list))/float(100)}% tweets processed")
        line = line.strip()
        data = json.loads(line)
        #print(data["protected"])
        tweets_df.loc[tweets_df["id"] == data["id"], "deleted"] = not data["found"]
        tweets_df.loc[tweets_df["id"] == data["id"], "last_query_date"] = data["queried_time"]

7.52% tweets processed
15.05% tweets processed
22.58% tweets processed
30.1% tweets processed
37.63% tweets processed
45.16% tweets processed
52.68% tweets processed
60.21% tweets processed
67.74% tweets processed
75.26% tweets processed
82.79% tweets processed
90.32% tweets processed
97.84% tweets processed


In [38]:
sum(list(users_df["deleted"] | users_df["protected"]))/len(users_df)

0.13800942611588576

In [39]:
# Write pickle files of pandas DB


USER_DB_FILEPATH = os.path.join(OUTDIR, USER_DB_FILENAME)
TWEET_DB_FILEPATH = os.path.join(OUTDIR, TWEET_DB_FILENAME)
users_df.to_pickle(USER_DB_FILEPATH)
tweets_df.to_pickle(TWEET_DB_FILEPATH)

In [40]:
USER_DB_FILEPATH


'../datasets/pandas/control_users_ia.pkl'