# Olympus Metadata

In [1]:
import os
import pandas as pd

## What are we working with?

In [2]:
# where is your data stored? Change this if you're not running this on HPC!
data_dir = '/scratch/olympus/filter_metadata/'

In [3]:
!ls -ltha $data_dir

total 2.4M
drwxrwsr-x    2 ly501  smapp 4.0K Jan  5 16:26 archive
-rw-rwxr--+   1 ly501  smapp  21K Jan  5 16:26 collection_meta.csv
-rwxrwx---+   1 ly501  smapp 755K Jan  5 16:26 following_users.csv
-rwxrwx---+   1 ly501  smapp  64K Jan  5 16:26 tracking_terms.csv
-rwxrwx---+   1 ly501  smapp 1.5M Jan  5 16:26 user_lookup.json
drwxrws---+   3 ly501  smapp 4.0K Jan  5 16:12 .
drwxrws---+ 220 yns207 smapp  16K Jan  3 14:34 ..


In [4]:
# bind the full path of each metadata CSV listed above to its own variable.
f_user, f_term, f_meta = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('following_users.csv', 'tracking_terms.csv', 'collection_meta.csv')
)

## Users

In [15]:
# we'll read the user file into a Pandas dataframe.
df_user = pd.read_csv(f_user)
df_user.head(3)

Unnamed: 0,collection,date_added,user.id,user.name
0,britain_broadcast_journalists_2016,2016-10-19,107807497,Sarah Vaughan-Brown
1,britain_broadcast_journalists_2016,2016-10-19,104483362,Morwenna Grills
2,britain_broadcast_journalists_2016,2016-10-19,102769529,Elaine Ly


In [16]:
# the ten names that appear most often across all of our followed-user lists.
df_user['user.name'].value_counts().head(10)

user suspended        373
not found             117
Marco Rubio             4
Rep. Jared Huffman      3
Jim McGovern            3
Dana Rohrabacher        3
Новости Украины         3
Bernie Sanders          3
Dina Titus              3
Elizabeth Warren        3
Name: user.name, dtype: int64

Note: "user suspended" and "not found" are from a [response code](https://developer.twitter.com/en/docs/basics/response-codes) returned after requesting a username via Tweepy.<br>
-> Can that be used for some aspect of your research?

### Marcos!

<img src="http://i0.kym-cdn.com/entries/icons/facebook/000/019/754/marco-rubio-robot-memes-2.jpg" width="300" alt="Marco Rubio robot meme">

In [17]:
# exact match
df_user[df_user['user.name'] == 'Marco Rubio']

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3605,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


In [18]:
# regular expression (aka wildcard pattern) match, ignoring upper/lower case.
# na=False makes rows with a missing name count as "no match"; without it a
# missing name puts NaN in the mask and the boolean indexing raises an error.
df_user[df_user['user.name'].str.contains('marco rubio', case=False, na=False)]

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3605,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


## Terms

In [19]:
# read the tracked-keywords file into a dataframe and peek at the first 10 rows.
df_term = pd.read_csv(f_term)
df_term.head(10)

Unnamed: 0,collection,date_added,keyword
0,Jerusalem,2017-12-08,palestine
1,Jerusalem,2017-12-08,فلسطين
2,Jerusalem,2017-12-08,القدس
3,Jerusalem,2017-12-08,jerusalem
4,arab_events_3_2016,2016-10-18,#خليك_نسر
5,arab_events_3_2016,2016-10-18,سني
6,arab_events_3_2016,2016-10-18,الشيعة
7,arab_events_3_2016,2016-10-18,حزبالله
8,arab_events_3_2016,2016-10-18,الجهاد
9,arab_events_3_2016,2016-10-18,القاعدة


In [20]:
# keep only the rows of one collection, then pull its keywords out as a plain list.
in_arab_events = df_term['collection'] == 'arab_events_3_2016'
arab_event_terms = df_term.loc[in_arab_events, 'keyword'].tolist()
arab_event_terms[-5:] # last 5 terms

['أقباط', 'الأردن', 'سقوط', 'لاجئ', 'مجزرة']

## Looking at the Size of Collections

In [22]:
# read the per-collection statistics (file counts, sizes, latest file) into a
# dataframe and peek at the first 3 rows.
df_meta = pd.read_csv(f_meta)
df_meta.head(3)

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
0,us_politics_pro_trump,339,47.8 GB,51319712186,2018-01-05 06:37:09,/scratch/olympus/us_politics_pro_trump/data/us...,137.17 MB
1,us_election_marcorubio_2016,449,17.36 GB,18642360066,2018-01-05 06:37:10,/scratch/olympus/us_election_marcorubio_2016/d...,5.2 MB
2,turkey_referendum_2017,345,2.88 GB,3095326292,2018-01-05 06:37:10,/scratch/olympus/turkey_referendum_2017/data/t...,80.49 KB


In [23]:
import math

def convert_size(size_bytes):
    '''
    Bytes to a human-readable format.

    Divides by 1024 until the value drops below 1024 (or the largest known
    unit is reached) and returns e.g. "3.73 TB".

    size_bytes: a non-negative number of bytes (int, float or numpy integer).
    Returns "0B" for zero, otherwise "<value rounded to 2 places> <unit>".
    Raises ValueError for a negative size.
    '''
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative, got %r" % (size_bytes,))
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")

    # Repeated division instead of math.log: the floating-point logarithm can
    # land just below a whole number at exact powers of 1024 and pick the
    # wrong unit, and it yields a negative index for values between 0 and 1.
    value = float(size_bytes)
    i = 0
    # Stop at the last unit so that huge values stay in "YB" instead of
    # indexing past the end of size_name.
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    s = round(value, 2)

    return "%s %s" % (s, size_name[i])

In [24]:
# add up the byte counts of every row in df_meta to get the combined size.
olympus_btyes = df_meta['collection_size_bytes'].sum()
olympus_btyes

4106391005076

In [25]:
# the same total, expressed in human-readable units.
convert_size(olympus_btyes)

'3.73 TB'

In [28]:
# how many distinct collection names are there? (dropna=False: a missing name still counts once)
df_meta['collection'].nunique(dropna=False)

116

In [29]:
# how big is the largest collection?
largest_collection_size = df_meta['collection_size_bytes'].max()
largest_collection_size

423205150185

In [30]:
# how do we find the largest collection? select the row(s) whose byte count equals that maximum.
is_largest = df_meta['collection_size_bytes'] == largest_collection_size
largest_collection = df_meta.loc[is_largest]
largest_collection

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
86,arab_events_3_2016,446,394.14 GB,423205150185,2018-01-05 07:08:49,/scratch/olympus/arab_events_3_2016/data/arab_...,946.52 MB


In [32]:
# how to get the latest_filedate?
largest_collection['latest_filedate'].iloc[0]

'2018-01-05 07:08:49'

In [36]:
# grab the `latest_filename` value from the first row of the largest collection.
largest_collection_file = largest_collection['latest_filename'].iloc[0]
largest_collection_file

'/scratch/olympus/arab_events_3_2016/data/arab_events_3_2016_data__11_24_2017__00_00_00__23_59_59.json.bz2'

## Let's Look at this File

In [33]:
import json
from smappdragon import JsonCollection

In [34]:
?JsonCollection()

In [40]:
# open the bz2-compressed JSON file as a smappdragon JsonCollection.
# NOTE(review): throw_error=0 / verbose=1 presumably mean "skip corrupt rows and
# report how many rows were ok/corrupt" — confirm against the smappdragon docs.
collect = JsonCollection(largest_collection_file, compression='bz2', throw_error=0, verbose=1)

In [41]:
# count the rows by walking the iterator to the end (this visits every row, so it can take a while).
sum(1 for _ in collect.get_iterator())

1699837 rows are ok.
1 rows are corrupt.


1699839

In [41]:
# pretty-print only the first tweet, then stop iterating.
for i, tweet in enumerate(collect.get_iterator()):
    if i > 0:
        break
    print(json.dumps(tweet, indent=4))

{
    "id": 933923407397257216,
    "smapp_timestamp": "Fri Nov 24 00:01:06 +0000 2017",
    "in_reply_to_screen_name": null,
    "filter_level": "low",
    "truncated": false,
    "place": null,
    "user": {
        "id": 327146497,
        "listed_count": 0,
        "location": null,
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "favourites_count": 4934,
        "is_translator": false,
        "profile_image_url": "http://pbs.twimg.com/profile_images/813385734497431554/hsc59alj_normal.jpg",
        "name": "GOODLUCK",
        "statuses_count": 3370,
        "description": null,
        "profile_link_color": "1DA1F2",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/813385734497431554/hsc59alj_normal.jpg",
        "following": null,
        "utc_offset": null,
        "profile_background_color": "C0DEED",
        "geo_enabled": true,
        "profile_backgroun

In [53]:
def is_getlocated(tweet):
    '''
    Filter predicate: does this tweet carry coordinates?

    tweet: one item from the collection iterator, expected to be a dict.
    Returns True only when `tweet` is a dict with a non-empty 'coordinates'
    value; False for anything else, including dicts that have no
    'coordinates' key at all.
    '''
    # .get() rather than ['coordinates'] so that a dict without the key is
    # simply filtered out instead of raising KeyError mid-iteration.
    return isinstance(tweet, dict) and bool(tweet.get('coordinates'))

In [54]:
# register is_getlocated as a custom filter on the collection.
# NOTE(review): presumably this returns a collection that yields only the tweets
# for which the filter returns True — confirm against the smappdragon docs.
geo_collect = collect.set_custom_filter(is_getlocated)