In [30]:
import os
import json
import datetime
from dateutil import parser
from dataclasses import dataclass, field
from TwitterAPI import TwitterAPI
from dacite import from_dict, Config

# --- Run configuration -------------------------------------------------------
CREDENTIALS_FILE = "./creds.txt"
PUSHSHIFT_DIR = "datasets/pushshift"

# When True, results go to a separate "-control" output directory.
CONTROL = True
OUTDIR = "datasets/users" + ("-control" if CONTROL else "")

# Input dump provenance (original note: "TT_hongkong.nd json"):
#   first tweet retrieved Friday, August 9, 2019 8:33:46 PM EST
#   result_type: recent
FILENAME = "hongkong.ndjson"

# Output file name is the input file name prefixed with "users.".
OUTFILE = os.path.join(OUTDIR, "users." + FILENAME)

In [31]:
@dataclass
class Tweet:
    """The subset of a tweet record that this notebook keeps.

    Instances are built with dacite's ``from_dict`` from the parsed JSON
    object of one input line, so the field names match the keys read from
    that object.
    """
    id: int
    # The loading cell copies "full_text" into "text" when the record has it.
    text: str
    # The loading cell parses the raw value with dateutil before construction.
    created_at: datetime.datetime
    # Language code; the traditional-Chinese check compares it against "zh".
    lang: str
    # NOTE(review): appears to be the posting client as an HTML anchor,
    # judging by the saved output - confirm against the API docs.
    source: str
    retweeted: bool

@dataclass
class User:
    """A Twitter account together with the tweets collected for it.

    Built with dacite's ``from_dict`` from the "user" object embedded in a
    tweet record; ``tweets`` starts empty and is appended to by the loader.
    """
    id: int
    screen_name: str
    name: str
    description: str
    # Free-text profile location; the *_in_profile filters search this field.
    location: str
    # default_factory gives every User its own list (no shared mutable default).
    tweets: list[Tweet] = field(default_factory=list)

In [32]:
config = Config()

# Maps user id -> User; each User accumulates every tweet seen for that id.
users = dict()

# Read the newline-delimited JSON dump one record per line. The encoding is
# pinned to UTF-8 because the tweets contain CJK text and emoji, and the
# platform default encoding is not UTF-8 everywhere.
with open(os.path.join(PUSHSHIFT_DIR, FILENAME), "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        tweet = json.loads(line)
        user = tweet["user"]
        # Prefer the untruncated body when the record carries one.
        if "full_text" in tweet:
            tweet["text"] = tweet["full_text"]
        # Tweet.created_at is typed as datetime, so parse the raw string first.
        tweet["created_at"] = parser.parse(tweet["created_at"])
        # First sighting of this account: create its User entry.
        if user["id"] not in users:
            users[user["id"]] = from_dict(data_class=User, data=user)
        users[user["id"]].tweets.append(
            from_dict(data_class=Tweet, data=tweet))

In [33]:
def hk_in_profile(user):
    """Tell whether the user's profile location mentions Hong Kong.

    Only ``user.location`` is inspected. The match is a case-insensitive
    substring search, so a marker embedded in a longer word also counts.

    Args:
        user: object exposing a ``location`` attribute.

    Returns:
        True if any Hong Kong marker occurs in the location; False if none
        does or if the location is empty/None.
    """
    location = user.location
    if not location:
        return False
    haystack = location.lower()
    markers = ("hongkong", "hong kong", "hk", "🇭🇰", "香港")
    return any(marker in haystack for marker in markers)

def newyork_in_profile(user):
    """Tell whether the user's profile location mentions New York.

    Only ``user.location`` is inspected. The match is a case-insensitive
    substring search, so a marker embedded in a longer word also counts.

    Args:
        user: object exposing a ``location`` attribute.

    Returns:
        True if "new york" or "nyc" occurs in the location; False if neither
        does or if the location is empty/None.
    """
    location = user.location
    if not location:
        return False
    haystack = location.lower()
    markers = ("new york", "nyc")
    return any(marker in haystack for marker in markers)

import hanzidentifier
def contains_trad_chinese(s):
    """Return True when hanzidentifier classifies ``s`` as TRADITIONAL.

    NOTE(review): any other classification returned by
    ``hanzidentifier.identify`` yields False - confirm that this is the
    intended meaning of "contains" for text that mixes scripts.
    """
    classification = hanzidentifier.identify(s)
    return classification == hanzidentifier.TRADITIONAL
    
def tweets_in_trad_chinese(user):
    """Tell whether the user has at least one traditional-Chinese tweet.

    A tweet counts when its ``lang`` is "zh" and ``contains_trad_chinese``
    accepts its ``text``.

    Args:
        user: object exposing a ``tweets`` iterable whose items have
            ``lang`` and ``text`` attributes.

    Returns:
        True if any tweet qualifies, otherwise False (including when the
        user has no tweets).
    """
    # Feed any() the predicate results themselves. any(filter(...)) instead
    # tests the truthiness of the yielded tweet objects, which only gives the
    # right answer as long as those objects happen to be truthy.
    return any(
        tw.lang == "zh" and contains_trad_chinese(tw.text)
        for tw in user.tweets
    )

In [28]:
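# NOTE: stale scratch cell - its execution count is out of order and the
# output below it comes from an earlier session. The selection is now done by
# the `filter_fn` cell that follows. If `users` is still the id -> User dict
# at this point, the line below would filter its keys rather than User objects.
# users = list(filter(newyork_in_profile, users))
# len(users)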

1953

In [34]:
import dataclasses
import json
def control_filter_fn(x):
    """Alternative control filter: non-empty location that does not match Hong Kong."""
    return not hk_in_profile(x) and len(x.location) > 0


# Experimental group: users whose location mentions Hong Kong.
exp_filter_fn = hk_in_profile

# The control run currently selects New York users; control_filter_fn is kept
# available but is not the filter applied here.
filter_fn = newyork_in_profile if CONTROL else exp_filter_fn

# `users` is the id -> User dict on the first run and a list afterwards.
# Accepting both keeps this cell re-runnable instead of failing on
# `.values()` the second time it is executed.
_candidate_users = users.values() if isinstance(users, dict) else users
users = list(filter(filter_fn, _candidate_users))


In [35]:
# Make sure the target directory exists: it changes with CONTROL, and open()
# does not create missing parent directories.
_out_dir = os.path.dirname(OUTFILE)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)

# Write one JSON object per line (NDJSON), one line per selected user.
with open(OUTFILE, "w", encoding="utf-8") as f:
    for user in users:
        # default=str stringifies values json cannot encode natively,
        # such as the datetime stored in Tweet.created_at.
        json.dump(dataclasses.asdict(user), f, default=str)
        f.write("\n")

In [37]:
# Report the selection size and a short preview only: printing the whole list
# writes every tweet of every selected user into the notebook output.
print(f"{len(users)} users selected")
print(users[:3])

[User(id=335455570, screen_name='ReutersWorld', name='Reuters World', description='Your source for top international news and analysis.', location='NYC', tweets=[Tweet(id=1155049352659910656, text='Several thousand protesters defied a police ban to converge on a rural Hong Kong town where suspected triad gang me… https://t.co/zw7zP7HGCU', created_at=datetime.datetime(2019, 7, 27, 9, 36, 40, tzinfo=tzutc()), lang='en', source='<a href="http://www.socialflow.com" rel="nofollow">SocialFlow</a>', retweeted=False)]), User(id=17938103, screen_name='wilfredchan', name='wilfred chan', description='', location='new york / hong kong', tweets=[Tweet(id=1155045444159852544, text='tear gas in the distance. #YuenLong https://t.co/67CiF8TkwK', created_at=datetime.datetime(2019, 7, 27, 9, 21, 8, tzinfo=tzutc()), lang='en', source='<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', retweeted=False)]), User(id=2430055920, screen_name='fear_of_god18', name='LUCIFER.😈', d

[User(id=994785137190735877, screen_name='Sassarella2', name='Shannon', description='Truth seeker. 🔎 Knowledge junkie.💡Lover of humor. 🤣 #MAGA #KAG 🇺🇸🐘📜 #TRUMP2020', location='Manhattan, NY', tweets=[Tweet(id=1156549414116384769, text='RT @alessabocchi: Hong Kong protestors are on another level. Here they’re using lasers to avoid facial recognition cameras. A cyber war aga…', created_at=datetime.datetime(2019, 7, 31, 12, 57, 22, tzinfo=tzutc()), lang='en', source='<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', retweeted=False)]),
 User(id=71099184, screen_name='teresa472002', name='Teresa Laubinger', description='"Many people would be scared if they saw in the mirror, not their own faces, but their CHARACTER."  #TRUMP Politics, Criminal Psych, Criminology poetry', location='Wisconsin', tweets=[Tweet(id=1156549420873334784, text='RT @PoliticalKathy: I support peace everywhere, whether that is in black communities, white communities, Hong Kong, or an