In [1]:
# Singaporean Twitter
# Twitter is a social media company that allows its users to broadcast short messages (tweets) to all their followers. People have used it to assess public sentiment in the past.

# In "singapore_twitter_copy.json" we have some data collected from the Twitter API. The file is almost 15 MB and Ed can only support 20 MB, so please download this notebook and the data and run them in your local environment.

# Task 1 - Explore the data
# First examine 2 records
# What fields are there
# What are the data types of the different fields?
# Are there repeated authors?
# What percentage of these are "re-tweets"?
# What time span are these tweets covering?

In [2]:
import json

In [6]:
# Load the raw tweet records. The encoding is given explicitly because JSON is
# UTF-8 by specification, whereas open() otherwise falls back to the platform's
# locale encoding and can fail on, or garble, non-ASCII tweet text.
with open('singapore_twitter_copy.json', 'r', encoding='utf-8') as f:
    dat = json.load(f)

In [14]:
# types of dat
type(dat)

list

In [8]:
len(dat)

13740

In [10]:
type(dat[0])

dict

In [11]:
len(dat[0])

10

In [12]:
dat[0].keys()

dict_keys(['referenced_tweets', 'id', 'in_reply_to_user_id', 'reply_settings', 'entities', 'text', 'source', 'created_at', 'public_metrics', 'author_id'])

In [13]:
dat[0]

{'referenced_tweets': [{'type': 'replied_to', 'id': '1491816184240480284'}],
 'id': '1491816347117916162',
 'in_reply_to_user_id': '204970988',
 'reply_settings': 'everyone',
 'entities': {'mentions': [{'start': 0,
    'end': 12,
    'username': 'QuantoQuant',
    'id': '4332053537'}],
  'annotations': [{'start': 241,
    'end': 249,
    'probability': 0.8172,
    'type': 'Place',
    'normalized_text': 'singapore'}]},
 'text': "@QuantoQuant I don't think we should have death penalty for dealers, but at the very least I think they should be deported if they are here illegally. Alternatively, I think they should go to prison for involuntary manslaughter. \n\nI believe singapore has only had a few dozen put to death.",
 'source': 'Twitter Web App',
 'created_at': '2022-02-10T16:48:28.000Z',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 1,
  'like_count': 2,
  'quote_count': 0},
 'author_id': '204970988'}

In [15]:
dat[1]

{'referenced_tweets': [{'type': 'replied_to', 'id': '1491780697106763784'}],
 'id': '1491813901842984962',
 'in_reply_to_user_id': '1382974668',
 'reply_settings': 'everyone',
 'entities': {'mentions': [{'start': 0,
    'end': 11,
    'username': 'IonaItalia',
    'id': '1382974668'}],
  'annotations': [{'start': 99,
    'end': 107,
    'probability': 0.9643,
    'type': 'Place',
    'normalized_text': 'Singapore'}]},
 'text': '@IonaItalia It’s also a question of enforcing laws at a level of harshness that drives compliance. Singapore does not have a drug problem. Instead, it has very strict drug laws...that it enforces...with a cane or death penalty. \n\nBut if one won’t be that harsh, then consider legalization.',
 'source': 'Twitter for iPad',
 'created_at': '2022-02-10T16:38:45.000Z',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 1,
  'like_count': 1,
  'quote_count': 0},
 'author_id': '1459920226385027086'}

In [20]:
# Two methods are used here. dict.get(key) looks up the value stored under a key, like dat[0][key], except that it returns None instead of raising KeyError when the key is missing.
# str.startswith(prefix) tells us whether the string begins with the given prefix.
dat[0].get('text').startswith('RT')

False

In [21]:
dat[0]['text'].startswith('RT')

False

In [22]:
# Keep the tweets whose text carries the retweet prefix. Matching on 'RT @'
# (the marker followed by the start of the retweeted user's handle) instead of
# the bare letters 'RT' avoids counting original tweets that merely happen to
# begin with those two letters.
rt_tweet = [tweet for tweet in dat if tweet['text'].startswith('RT @')]

In [23]:
# Share of the dataset flagged as retweets, as a fraction between 0 and 1.
n_retweets, n_tweets = len(rt_tweet), len(dat)
rt_rate = n_retweets / n_tweets

In [24]:
rt_rate

0.6558224163027656

In [35]:
# get the time span
import datetime as dt

In [28]:
print(dat[0]['created_at'])

2022-02-10T16:48:28.000Z


In [30]:
dt.datetime.strptime(dat[0]['created_at'], '%Y-%m-%dT%H:%M:%S.000Z')

datetime.datetime(2022, 2, 10, 16, 48, 28)

In [32]:
# Parse every tweet's 'created_at' stamp into a naive datetime object.
created_stamps = (tweet['created_at'] for tweet in dat)
dates = [dt.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.000Z') for stamp in created_stamps]

In [33]:
min(dates)

datetime.datetime(2022, 2, 10, 1, 28, 26)

In [34]:
max(dates)

datetime.datetime(2022, 2, 10, 17, 14, 16)

In [36]:
# Entities in Twitter
# Please wrangle the data so we can look at the number of tweets/retweets for each entity over the different hours in the day.

In [37]:
dat[0].keys()

dict_keys(['referenced_tweets', 'id', 'in_reply_to_user_id', 'reply_settings', 'entities', 'text', 'source', 'created_at', 'public_metrics', 'author_id'])

In [39]:
dat[0]['entities'].keys()

dict_keys(['mentions', 'annotations'])

In [50]:
dat[0]['entities']['mentions']

[{'start': 0, 'end': 12, 'username': 'QuantoQuant', 'id': '4332053537'}]

In [43]:
# Gather the distinct entity kinds (the keys of each tweet's 'entities' dict)
# that occur anywhere in the dataset; tweets without entities contribute nothing.
entity_kinds = set()
for tweet in dat:
    entity_kinds.update(tweet.get('entities') or {})
entity_kinds

{'annotations', 'cashtags', 'hashtags', 'mentions', 'urls'}

In [42]:
# Same query as the cell above (only the string quoting differs): the set of
# entity kinds that appear in at least one tweet.
{k for tweet in dat if tweet.get('entities') for k in tweet.get("entities")}

{'annotations', 'cashtags', 'hashtags', 'mentions', 'urls'}

In [47]:
dat[0].get('entities')

{'mentions': [{'start': 0,
   'end': 12,
   'username': 'QuantoQuant',
   'id': '4332053537'}],
 'annotations': [{'start': 241,
   'end': 249,
   'probability': 0.8172,
   'type': 'Place',
   'normalized_text': 'singapore'}]}

In [49]:
not dat[0].get('entities')

False

In [60]:
# Collect every distinct entity value seen in the dataset.
# ent_map gives, for each entity kind, the field of an entity record that
# holds the value we want to keep.
ent_map = {
    "annotations": "normalized_text",
    "cashtags": "tag",
    "hashtags": "tag",
    "mentions": "username",
    "urls": "url",
}
ents = set()
for tweet in dat:
    tweet_entities = tweet.get('entities')
    if not tweet_entities:
        continue  # this tweet carries no entities at all
    # NOTE: the loop variable keeps the name `ent` because it stays bound after
    # the loop and a later scratch cell reads it.
    for ent in ent_map:
        if ent in tweet_entities:
            value_key = ent_map[ent]
            ents.update(record.get(value_key) for record in tweet_entities[ent])

In [62]:
# One separate, initially empty dict per hour of the day (0-23).
hours = {hour_of_day: {} for hour_of_day in range(24)}

In [63]:
hours

{0: {},
 1: {},
 2: {},
 3: {},
 4: {},
 5: {},
 6: {},
 7: {},
 8: {},
 9: {},
 10: {},
 11: {},
 12: {},
 13: {},
 14: {},
 15: {},
 16: {},
 17: {},
 18: {},
 19: {},
 20: {},
 21: {},
 22: {},
 23: {}}

In [64]:
dat[0]['created_at']

'2022-02-10T16:48:28.000Z'

In [71]:
import datetime as dt
# Parse the creation hour of each tweet. Nothing is stored per tweet, so once
# the loop finishes `hour` simply holds the hour of the last tweet in `dat`.
for tweet in dat:
    stamp = dt.datetime.strptime(tweet.get('created_at'), '%Y-%m-%dT%H:%M:%S.000Z')
    hour = stamp.hour

In [72]:
hour

1

In [76]:
# Scratch check: `hour` and `ent` are not set in this cell; they hold whatever
# values earlier loops left bound, so the result depends on execution order.
print(hours.get(hour).get(ent))

None


In [82]:
# Count, for each hour of the day, how many times each entity value occurs.
# `hours` is rebuilt here so that re-running this cell does not add the same
# tweets to the counts a second time.
hours = {h: {} for h in range(24)}
for tweet in dat:
    hour = dt.datetime.strptime(tweet.get('created_at'), '%Y-%m-%dT%H:%M:%S.000Z').hour
    hour_ents = hours[hour]
    tweet_entities = tweet.get('entities')
    if not tweet_entities:
        continue  # nothing to count for this tweet
    for ent_type, value_key in ent_map.items():
        # The () default skips entity kinds this tweet does not have.
        for i in tweet_entities.get(ent_type, ()):
            ent = i.get(value_key)
            # The first sighting counts as 1 and each later one adds 1. The
            # previous version started a new entity at 0 and then added the
            # count to itself, so every total stayed at 0.
            hour_ents[ent] = hour_ents.get(ent, 0) + 1

In [77]:
True or False

True

In [86]:
[len(hours[h]) for h in hours]

[0,
 3,
 483,
 1167,
 1118,
 1273,
 1203,
 1251,
 1506,
 1372,
 1279,
 1201,
 1259,
 1280,
 1171,
 1240,
 1081,
 375,
 0,
 0,
 0,
 0,
 0,
 0]

In [87]:
# Connectivity on Twitter
# How connected are the Twitter users in our dataset? A connection is defined as one user retweeting or mentioning another user.

In [89]:
# Map each author id to the set of user ids mentioned in any of their tweets.
connects = {}
for tweet in dat:
    tweet_entities = tweet.get('entities')
    if not tweet_entities or 'mentions' not in tweet_entities:
        continue  # this tweet mentions nobody
    mentioned_ids = {user.get('id') for user in tweet_entities['mentions']}
    # setdefault creates the author's set on first sight, then we merge into it.
    connects.setdefault(tweet.get('author_id'), set()).update(mentioned_ids)

In [91]:
# Every author that mentioned at least one user (the keys of `connects`).
authors = list(connects)

In [92]:
# Every distinct user id that was mentioned by any author.
references = list({user_id for mentioned in connects.values() for user_id in mentioned})

In [96]:
len(authors)

8525

In [97]:
len(references)

3028

In [98]:
import pandas as pd
# Author-by-mentioned-user matrix, initialised to all zeros: one row per
# author, one column per mentioned user id.
df = pd.DataFrame(data=0, index=authors, columns=references)

In [100]:
df.head()

Unnamed: 0,1389927529335730177,1079310252,1082669328,102614399,1409477129117573121,1452686076267483139,2976128340,2595597240,946322930031403008,1224599167697162241,...,154573698,86390214,28138682,711913,3169916217,1327584022063616000,323133826,1168187333197234177,1309693154312953856,307658441
204970988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1459920226385027086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
840441380652556288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1388529837833392132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1197328734417588224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
# Mark a 1 in the matrix for every (author, mentioned user) pair.
for author_id, mentioned_ids in connects.items():
    for mentioned_id in mentioned_ids:
        df.loc[author_id, mentioned_id] = 1

In [105]:
df.head()

Unnamed: 0,1389927529335730177,1079310252,1082669328,102614399,1409477129117573121,1452686076267483139,2976128340,2595597240,946322930031403008,1224599167697162241,...,154573698,86390214,28138682,711913,3169916217,1327584022063616000,323133826,1168187333197234177,1309693154312953856,307658441
204970988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1459920226385027086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
840441380652556288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1388529837833392132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1197328734417588224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
# Fraction of author/mentioned-user cells that are still zero, i.e. the share
# of possible connections that do not occur in the data.
n_rows, n_cols = df.shape
zero_cells = (df == 0).sum().sum()
zero_cells / n_rows / n_cols

np.float64(0.9995231214432646)