# Olympus Metadata

In [1]:
import os
import pandas as pd

## What are we working with?

In [2]:
# Where is your data stored? The default is the shared HPC location. If you are
# not running this on HPC, either edit the default below or set the
# OLYMPUS_METADATA_DIR environment variable before starting the kernel.
data_dir = os.environ.get('OLYMPUS_METADATA_DIR', '/scratch/olympus/filter_metadata/')

In [3]:
# list everything in the metadata directory; IPython substitutes the Python
# variable data_dir for $data_dir before handing the command to the shell.
!ls -ltha $data_dir

total 2.0M
drwxrws---+   2 ly501  smapp 4.0K Jan 19 05:01 archive
-rwxrwx---+   1 ly501  smapp  21K Jan 19 05:01 collection_meta.csv
-rwxrwx---+   1 ly501  smapp 358K Jan 19 05:01 following_users.csv
-rwxrwx---+   1 ly501  smapp  65K Jan 19 05:01 tracking_terms.csv
-rwxrwx---+   1 ly501  smapp 1.5M Jan 19 05:01 user_lookup.json
drwxrwx---+ 222 yns207 smapp  16K Jan 18 12:58 ..
drwxrws---+   3 ly501  smapp 4.0K Jan  5 16:12 .


In [4]:
# build the full path of each metadata file listed above, all rooted at data_dir.
f_user, f_term, f_meta = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'following_users.csv',
        'tracking_terms.csv',
        'collection_meta.csv',
    )
)

## Users

In [5]:
# load the followed-users CSV into a Pandas dataframe and preview its top rows.
df_user = pd.read_csv(filepath_or_buffer=f_user)
df_user.head(n=3)

Unnamed: 0,collection,date_added,user.id,user.name
0,britain_broadcast_journalists_2016,2016-10-19,107807497,Sarah Vaughan-Brown
1,britain_broadcast_journalists_2016,2016-10-19,104483362,Morwenna Grills
2,britain_broadcast_journalists_2016,2016-10-19,102769529,Elaine Ly


In [6]:
# number of rows (followed accounts) recorded for the US media collection.
df_user.loc[df_user['collection'] == 'us_media_accounts_2016'].shape[0]

1644

In [7]:
# names of the first five accounts listed for the US media collection.
in_us_media = df_user['collection'] == 'us_media_accounts_2016'
df_user.loc[in_us_media, 'user.name'].head(5).tolist()

['ed silverman',
 'ed lavandera',
 'Elizabeth Holmes',
 'Ali Ehrlich',
 'Edward Barsamian']

In [8]:
# the ten user names that occur most often across all rows of the table.
df_user['user.name'].value_counts().head(10)

user suspended        373
not found             117
Marco Rubio             4
Новости Украины         3
Senator Ted Cruz        3
Bernie Sanders          3
Светлана                3
Dana Rohrabacher        3
Robin Kelly             3
Rep. Jared Huffman      3
Name: user.name, dtype: int64

Note: "user suspended" and "not found" are not real account names; they are placeholders taken from the [response code](https://developer.twitter.com/en/docs/basics/response-codes) returned when the username was requested via Tweepy.<br>
-> Can that be used for some aspect of your research?

## We have a lot of Marco Rubios

In [9]:
# exact (case-sensitive) match on the full name.
df_user.loc[df_user['user.name'].eq('Marco Rubio')]

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3610,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


In [10]:
# regular expression (pattern) match, ignoring case.
# na=False treats a missing user.name as "no match"; without it a single NaN
# in the column makes the boolean mask unusable for indexing and raises.
df_user[df_user['user.name'].str.contains('marco rubio', case=False, na=False)]

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3610,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


## Terms

In [11]:
# load the tracked-keywords CSV and preview the first ten rows.
df_term = pd.read_csv(filepath_or_buffer=f_term)
df_term.head(n=10)

Unnamed: 0,collection,date_added,keyword
0,Jerusalem,2017-12-08,palestine
1,Jerusalem,2017-12-08,فلسطين
2,Jerusalem,2017-12-08,القدس
3,Jerusalem,2017-12-08,jerusalem
4,arab_events_3_2016,2016-10-18,#خليك_نسر
5,arab_events_3_2016,2016-10-18,سني
6,arab_events_3_2016,2016-10-18,الشيعة
7,arab_events_3_2016,2016-10-18,حزبالله
8,arab_events_3_2016,2016-10-18,الجهاد
9,arab_events_3_2016,2016-10-18,القاعدة


## Looking at the Size of Collections

In [12]:
# load the per-collection metadata CSV and preview the first three rows.
df_meta = pd.read_csv(filepath_or_buffer=f_meta)
df_meta.head(n=3)

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
0,us_politics_pro_trump,353,46.63 GB,50071473759,2018-01-19 04:01:01,/scratch/olympus/us_politics_pro_trump/data/us...,236.94 MB
1,us_election_marcorubio_2016,463,17.38 GB,18657937962,2018-01-19 04:01:01,/scratch/olympus/us_election_marcorubio_2016/d...,8.35 MB
2,turkey_referendum_2017,357,2.88 GB,3093438342,2018-01-19 04:01:02,/scratch/olympus/turkey_referendum_2017/data/t...,33.42 KB


In [13]:
import math

def convert_size(size_bytes):
    '''
    Bytes to a human-readable format.

    Uses binary units (1 KB = 1024 B) and rounds to two decimals,
    e.g. 3906407432268 -> "3.55 TB".

    Parameters
    ----------
    size_bytes : int or float
        Non-negative number of bytes.

    Returns
    -------
    str
        "0B" for zero, otherwise "<value> <unit>" with the unit taken from
        B, KB, MB, ..., YB. Values beyond the largest unit stay in YB.

    Raises
    ------
    ValueError
        If size_bytes is negative.
    '''
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    last = len(size_name) - 1

    # Step through the units by dividing by 1024 instead of taking a
    # floating-point logarithm: this cannot land on a negative index (values
    # below 1 byte), cannot run past the last unit, and is exact at the unit
    # boundaries because dividing a float by a power of two loses no precision.
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < last:
        value /= 1024
        i += 1

    s = round(value, 2)
    # Rounding can push a value just under a boundary up to 1024.0; report it
    # in the next unit ("1.0 MB") rather than as "1024.0 KB".
    if s >= 1024 and i < last:
        s = round(s / 1024, 2)
        i += 1

    return "%s %s" % (s, size_name[i])

In [14]:
# total size, in bytes, of every collection listed in the metadata table.
olympus_btyes = df_meta.loc[:, 'collection_size_bytes'].sum()
olympus_btyes

3906407432268

In [15]:
# express the total byte count in human-readable units.
convert_size(olympus_btyes)

'3.55 TB'

In [16]:
# how many distinct collections are there? (dropna=False keeps a missing
# name counted as one value, matching len(unique()).)
df_meta['collection'].nunique(dropna=False)

117

In [17]:
# size in bytes of the single biggest collection.
largest_collection_size = df_meta.loc[:, 'collection_size_bytes'].max()
largest_collection_size

384424700854

In [18]:
# which collection is it? keep the row(s) whose size equals that maximum.
is_largest = df_meta['collection_size_bytes'].eq(largest_collection_size)
largest_collection = df_meta.loc[is_largest]
largest_collection

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
87,arab_events_3_2016,458,358.02 GB,384424700854,2018-01-19 02:04:49,/scratch/olympus/arab_events_3_2016/data/arab_...,8.91 GB


## Let's Look at this File

In [21]:
# %pip installs into the environment of the running kernel; `!pip` runs
# whichever pip is first on the shell PATH, which may be a different one.
%pip install --upgrade smappdragon

Requirement already up-to-date: smappdragon in /home/ly501/anaconda3/lib/python3.6/site-packages
Requirement already up-to-date: pymongo>=3.2.1 in /home/ly501/anaconda3/lib/python3.6/site-packages (from smappdragon)


In [22]:
import json
from smappdragon import JsonCollection

In [23]:
# show the docstring for JsonCollection; IPython's `?` help takes the bare
# object name, not a call expression with parentheses.
JsonCollection?

In [25]:
# one data file of the womens_march_2017 collection; per its name it covers
# 2018-01-18 00:00:00-23:59:59 and is bz2-compressed JSON.
f = '/scratch/olympus/womens_march_2017/data/womens_march_2017_data__01_18_2018__00_00_00__23_59_59.json.bz2'
# NOTE(review): throw_error=0 presumably makes smappdragon skip records it
# cannot parse instead of raising -- confirm against the smappdragon docs.
collect = JsonCollection(f, compression='bz2', throw_error=0)

Collections are read through "generators": lazy iterators that hand back one tweet at a time instead of loading the whole file into memory.

In [26]:
# get_iterator() hands back a generator object (see the output below);
# nothing is produced until we actually loop over it.
collect.get_iterator()

<generator object JsonCollection.get_iterator at 0x2b2f879764c0>

Let's count the tweets: we produce a 1 for each tweet and then take the sum of those 1s.

In [27]:
# count the tweets by adding 1 for every item the iterator yields.
sum(1 for _ in collect.get_iterator())

25230

In [28]:
# re-create the collection for the same file, this time with verbose=1.
# NOTE(review): presumably this turns on progress/log output -- confirm.
collect = JsonCollection(f, compression='bz2', throw_error=0, verbose=1)

Let's look at the first tweet:

In [29]:
# pretty-print the first tweet as indented JSON, then stop iterating at once.
for tweet in collect.get_iterator():
    print(json.dumps(tweet, indent=4))
    break

{
    "created_at": "Thu Jan 18 05:01:18 +0000 2018",
    "filter_level": "low",
    "id_str": "953854789292568577",
    "in_reply_to_user_id_str": null,
    "retweeted": false,
    "reply_count": 0,
    "favorited": false,
    "truncated": false,
    "contributors": null,
    "coordinates": null,
    "id": 953854789292568577,
    "in_reply_to_screen_name": null,
    "retweet_count": 0,
    "in_reply_to_status_id": null,
    "place": null,
    "in_reply_to_user_id": null,
    "retweeted_status": {
        "extended_tweet": {
            "entities": {
                "hashtags": [
                    {
                        "indices": [
                            14,
                            24
                        ],
                        "text": "anchorage"
                    },
                    {
                        "indices": [
                            145,
                            154
                        ],
                        "text": "Williwaw"
   

Let's look at the first `limit` tweets.

In [31]:
# print only the first `limit` tweets, then stop reading: without the break
# the loop keeps pulling every remaining tweet from the file just to discard it.
limit = 2
for i, tweet in enumerate(collect.get_iterator()):
    if i >= limit:
        break
    print(json.dumps(tweet, indent=4))

{
    "created_at": "Thu Jan 18 05:01:18 +0000 2018",
    "filter_level": "low",
    "id_str": "953854789292568577",
    "in_reply_to_user_id_str": null,
    "retweeted": false,
    "reply_count": 0,
    "favorited": false,
    "truncated": false,
    "contributors": null,
    "coordinates": null,
    "id": 953854789292568577,
    "in_reply_to_screen_name": null,
    "retweet_count": 0,
    "in_reply_to_status_id": null,
    "place": null,
    "in_reply_to_user_id": null,
    "retweeted_status": {
        "extended_tweet": {
            "entities": {
                "hashtags": [
                    {
                        "indices": [
                            14,
                            24
                        ],
                        "text": "anchorage"
                    },
                    {
                        "indices": [
                            145,
                            154
                        ],
                        "text": "Williwaw"
   

## Let's Filter Geolocated Tweets:

In [32]:
def is_getlocated(tweet):
    '''
    Tell whether a tweet carries place information.

    Parameters
    ----------
    tweet : object
        Normally a dict parsed from one line of the collection.

    Returns
    -------
    bool
        True when `tweet` is a dict whose 'place' entry is present and
        non-empty; False for anything else, including dicts that have no
        'place' key at all.
    '''
    # .get() rather than ['place']: a dict without that key should simply be
    # filtered out, not abort the whole iteration with a KeyError.
    return isinstance(tweet, dict) and bool(tweet.get('place'))

In [33]:
# register is_getlocated as a filter on the collection.
# NOTE(review): the next cell iterates `collect` and still gets a tweet with a
# place, so set_custom_filter appears to modify `collect` itself -- confirm
# whether geo_collect is a separate object or the same one.
geo_collect = collect.set_custom_filter(is_getlocated)

In [35]:
# print the first `limit` geolocated tweets from the filtered collection made
# in the previous cell, then stop so the rest of the file is not read.
limit = 1
for i, tweet in enumerate(geo_collect.get_iterator()):
    if i >= limit:
        break
    print(json.dumps(tweet, indent=4))

{
    "created_at": "Thu Jan 18 05:17:03 +0000 2018",
    "filter_level": "low",
    "id_str": "953858754952466432",
    "in_reply_to_user_id_str": null,
    "retweeted": false,
    "reply_count": 0,
    "favorited": false,
    "truncated": false,
    "contributors": null,
    "coordinates": null,
    "id": 953858754952466432,
    "in_reply_to_screen_name": null,
    "retweet_count": 0,
    "in_reply_to_status_id": null,
    "place": {
        "place_type": "city",
        "country": "United States",
        "name": "Brooklyn",
        "url": "https://api.twitter.com/1.1/geo/id/011add077f4d2da3.json",
        "attributes": {},
        "full_name": "Brooklyn, NY",
        "id": "011add077f4d2da3",
        "country_code": "US",
        "bounding_box": {
            "type": "Polygon",
            "coordinates": [
                [
                    [
                        -74.041878,
                        40.570842
                    ],
                    [
                       