In [55]:
import pandas as pd
from dacite import from_dict
import json
from dataclasses import dataclass, field
from typing import Optional
import os


In [56]:
@dataclass
class Tweet:
    id: int
    text: str
    created_at: str
    lang: str
    source: str
    retweeted: bool

@dataclass
class User:
    id: int
    screen_name: str
    name: str
    description: Optional[str]
    location: str
    tweets: list[Tweet] = field(default_factory=list)

In [57]:
EXP = "tokyo"

USERS_DIR = "../datasets/users/ia"
USERS_FILENAME = f"{EXP}.json"
USERS_FILEPATH = os.path.join(USERS_DIR, USERS_FILENAME)


QUERY_DIR = f"../datasets/queried/{EXP}"
QUERIED_USERS_FILENAME = "users.jsonl"
QUERIED_TWEETS_FILENAME = "tweets.jsonl"

    
QUERIED_USERS_FILEPATH = os.path.join(QUERY_DIR, QUERIED_USERS_FILENAME)
QUERIED_TWEETS_FILEPATH = os.path.join(QUERY_DIR, QUERIED_TWEETS_FILENAME)


OUTDIR = f"../datasets/pandas/{EXP}"
USER_DB_FILENAME = "users.pkl"
TWEET_DB_FILENAME = "tweets.pkl"

In [58]:
# Load all users and tweets.
data = {}
tweets = {}
with open(USERS_FILEPATH, "r") as f:
    for line in f:
        datum = json.loads(line.strip()) #from_dict(data_class=User, data=json.loads(line.strip()))
        data[datum["id"]] = datum
        for tweet in datum["tweets"]:
            tweet["user_id"] = datum["id"]
            tweets[tweet["id"]] = tweet
        datum["tweets"] = [tweet["id"] for tweet in datum["tweets"]]

In [59]:
print(len(data))
print(len(tweets))
print(USERS_FILEPATH)

215219
675785
../datasets/users/ia/tokyo.json


In [60]:
users = list(data.values())
users_df = pd.DataFrame.from_dict(users)
users_df["from_ia"] = True
users_df["deleted"] = False
users_df["protected"] = False
users_df["last_query_date"] = None

display(users_df)

tweet_list = list(tweets.values())
tweets_df = pd.DataFrame.from_dict(tweet_list)
tweets_df["from_ia"] = True
tweets_df["user_deleted"] = False
tweets_df["deleted"] = False
tweets_df["user_protected"] = False

tweets_df["last_query_date"] = None
display(tweets_df)

Unnamed: 0,id,screen_name,name,description,location,tweets,from_ia,deleted,protected,last_query_date
0,69282959,mos_burger,モスバーガー,株式会社モスフードサービスの公式アカウントです。 日頃よりモスバーガーをご利用いただき、あり...,東京都品川区,"[1133717124810252288, 1133717301008773120, 113...",True,False,False,
1,571799741,miki_cloverZ,miki_cloverZ,ももクロが好きな主婦兼会社員。ももクロとの出会いは2010年のイオン レイクタウンのイベント...,東京と神奈川の隙間,"[1133717137430798336, 1155101045594263552, 115...",True,False,False,
2,1104299935493550080,jr25663998,シュガーjr,荒らしは空の旅,東京 千代田区,[1133717137393037312],True,False,False,
3,1046011069993373697,w__phalaenopsis,可奈子,98'心理学生,東京,"[1133717145789947905, 1133591920674914304, 113...",True,False,False,
4,305034754,suuutiii,岩﨑豊∝夏期テキストの締め切りが迫ってく～る～,Ｚ会で中学生に理科を教えてます。いかなる戦争も軍隊も認めず。Peace & Love! :D...,東京都目黒区,"[1133717158368690176, 1095659063122190337, 115...",True,False,False,
...,...,...,...,...,...,...,...,...,...,...
215214,134748101,solsticemusic,SOLSTICE MUSIC,Solstice Music is an electronic music producti...,Tokyo Japan,[1132907594794688512],True,False,False,
215215,2800147584,Ainy615,西園寺まる,はっちゃんのママです。座右の銘｢エロを科学する｣です。 アホなエロ話してると幸せだー！ReL...,日本 東京,[1132907758360023041],True,False,False,
215216,36065155,audrey501,ひるどら,開発してないサーバサイドエンジニア。日本酒、クラフトビール、音ゲー。スプラトゥーンX2300...,Tokyo⇄さいたま,[1132910136542892032],True,False,False,
215217,404096821,rinko_r06,なおたんﾟ*｡®︎1y6m,25歳面倒くさがり主婦 娘大好き ブログ書いてます。よかったら読んでみてね👇 #なおたんの懸賞当選,東京,[1132910216234688512],True,False,False,


Unnamed: 0,id,text,created_at,lang,source,retweeted,user_id,from_ia,user_deleted,deleted,user_protected,last_query_date
0,1133717124810252288,@unicco24結果は動画をチェック！\n\nフォロー＆RTキャンペーンは明日も挑戦可能！...,2019-05-29 12:50:00+00:00,ja,"<a href=""https://shuttlerock.co.jp"" rel=""nofol...",False,69282959,True,False,False,False,
1,1133717301008773120,@718_melody結果は動画をチェック！\n\nフォロー＆RTキャンペーンは明日も挑戦可...,2019-05-29 12:50:42+00:00,ja,"<a href=""https://shuttlerock.co.jp"" rel=""nofol...",False,69282959,True,False,False,False,
2,1133717330356244480,@yamamon_iomon結果は動画をチェック！\n\nフォロー＆RTキャンペーンは明日も...,2019-05-29 12:50:49+00:00,ja,"<a href=""https://shuttlerock.co.jp"" rel=""nofol...",False,69282959,True,False,False,False,
3,1133706504832462848,@akinao66結果は動画をチェック！\n\nフォロー＆RTキャンペーンは明日も挑戦可能！...,2019-05-29 12:07:48+00:00,ja,"<a href=""https://shuttlerock.co.jp"" rel=""nofol...",False,69282959,True,False,False,False,
4,1133706559383646208,@bandoakao結果は動画をチェック！\n\nフォロー＆RTキャンペーンは明日も挑戦可能...,2019-05-29 12:08:01+00:00,ja,"<a href=""https://shuttlerock.co.jp"" rel=""nofol...",False,69282959,True,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...
675780,1132907594794688512,6月22日(土)23:00~ #サマーソルスティス #セレブレーション #ソルスティス #夏...,2019-05-27 07:13:13+00:00,ja,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,134748101,True,False,False,False,
675781,1132907758360023041,RT @wolffthepoo2: あるアイドルから「星の数ほどいる地下アイドルの中から自分...,2019-05-27 07:13:52+00:00,ja,"<a href=""http://twitter.com/download/iphone"" r...",False,2800147584,True,False,False,False,
675782,1132910136542892032,ファミマの外人店員さんの名前が｢ぷりてい｣だった......プリティ......,2019-05-27 07:23:19+00:00,ja,"<a href=""https://www.wakamesoba98.net/sobacha/...",False,36065155,True,False,False,False,
675783,1132910216234688512,RT @Ta_Za_N_Te: 洗車しようとフロントガラスに水ぶっかけたら温度差でヒビ入った...,2019-05-27 07:23:38+00:00,ja,"<a href=""http://twitter.com/download/iphone"" r...",False,404096821,True,False,False,False,


In [None]:
# Populate user & tweet DB with user query status
# (is user deleted? protected? when was this query made?)


with open(QUERIED_USERS_FILEPATH, "r") as f:
    for line in f:
        line = line.strip()
        data = json.loads(line)
        #print(data["protected"])
        #print(data.keys())
        users_df.loc[users_df["id"] == data["id"], "deleted"] = not data["found"]
        users_df.loc[users_df["id"] == data["id"], "last_query_date"] = data["queried_time"]
        if data["found"]:
            users_df.loc[users_df["id"] == data["id"], "protected"] = data["protected"]
            tweets_df.loc[tweets_df["user_id"] == data["id"], "user_protected"] = data["protected"]
        else:
            tweets_df.loc[tweets_df["user_id"] == data["id"], "user_deleted"] = not data["found"]

        

In [51]:
# Populate tweet DB with query status (was Tweet deleted?)
i = 0
with open(QUERIED_TWEETS_FILEPATH, "r") as f:
    for line in f:
        i += 1
        if i % 10000 == 0:
            print(f"{int(10000*i/len(tweet_list))/float(100)}% tweets processed")
        line = line.strip()
        data = json.loads(line)
        tweets_df.loc[tweets_df["id"] == data["id"], "deleted"] = not data["found"]
        tweets_df.loc[tweets_df["id"] == data["id"], "last_query_date"] = data["queried_time"]

7.52% tweets processed
15.05% tweets processed
22.58% tweets processed
30.1% tweets processed
37.63% tweets processed
45.16% tweets processed
52.68% tweets processed
60.21% tweets processed
67.74% tweets processed
75.26% tweets processed
82.79% tweets processed
90.32% tweets processed
97.84% tweets processed


In [53]:
# Write pickle files of pandas DB


USER_DB_FILEPATH = os.path.join(OUTDIR, USER_DB_FILENAME)
TWEET_DB_FILEPATH = os.path.join(OUTDIR, TWEET_DB_FILENAME)
users_df.to_pickle(USER_DB_FILEPATH)
tweets_df.to_pickle(TWEET_DB_FILEPATH)

In [52]:
def deleted_tweets_stats(df):
    total = len(df)
    n_deleted = sum(df["deleted"] == True)
    n_user_deleted = sum(df["user_deleted"] == True)
    n_user_protected = sum(df["user_protected"] == True)

    print(f"{(n_deleted)/total} deleted, user public")
    print(f"{(n_deleted+n_user_deleted+n_user_protected)/total} inaccessible")

deleted_tweets_stats(tweets_df)
# users2_df = users_df[~users_df["location"].isnull()]
# #display(users2_df)

# #sum(list(users2_df["deleted"] | users2_df["protected"]))/len(users2_df)
# def check_lang(lang):
#     lang_df = tweets_df[tweets_df["lang"] == lang]
#     print(sum(list(lang_df["deleted"]))/len(lang_df))

# check_lang("en")
# check_lang("zh")
# check_lang("ja")
# print(sum(list(tweets_df["deleted"]))/len(tweets_df))


0.28618309636531963 deleted, user public
0.4513130461617203 inaccessible


In [54]:
USER_DB_FILEPATH


'../datasets/pandas/nyc/users.pkl'