# Olympus Metadata

In [1]:
import os
import pandas as pd

## What are we working with?

In [2]:
# where is your data stored? Change this if you're not running this on HPC!
data_dir = '/scratch/olympus/filter_metadata/'

In [3]:
!ls -ltha $data_dir

total 2.0M
drwxrws---+   2 ly501  smapp 4.0K Jan  8 05:01 archive
-rwxrwx---+   1 ly501  smapp  21K Jan  8 05:01 collection_meta.csv
-rwxrwx---+   1 ly501  smapp 358K Jan  8 05:01 following_users.csv
-rwxrwx---+   1 ly501  smapp  64K Jan  8 05:00 tracking_terms.csv
-rwxrwx---+   1 ly501  smapp 1.5M Jan  8 05:00 user_lookup.json
drwxrws---+   3 ly501  smapp 4.0K Jan  5 16:12 .
drwxrws---+ 220 yns207 smapp  16K Jan  3 14:34 ..


In [4]:
# let's declare those files above as variables in Python.
f_user = os.path.join(data_dir, 'following_users.csv')
f_term = os.path.join(data_dir, 'tracking_terms.csv')
f_meta = os.path.join(data_dir, 'collection_meta.csv')

## Users

In [5]:
# we'll read the user file into a Pandas dataframe.
df_user = pd.read_csv(f_user)
df_user.head(3)

Unnamed: 0,collection,date_added,user.id,user.name
0,britain_broadcast_journalists_2016,2016-10-19,107807497,Sarah Vaughan-Brown
1,britain_broadcast_journalists_2016,2016-10-19,104483362,Morwenna Grills
2,britain_broadcast_journalists_2016,2016-10-19,102769529,Elaine Ly


In [6]:
# we can see ~our~ most followed users,
df_user['user.name'].value_counts()[:10]

user suspended        373
not found             117
Marco Rubio             4
Rep. Jared Huffman      3
Elizabeth Warren        3
Новости Украины         3
Светлана                3
Dina Titus              3
Jim McGovern            3
Bernie Sanders          3
Name: user.name, dtype: int64

Note: "user suspended" and "not found" are from a [response code](https://developer.twitter.com/en/docs/basics/response-codes) returned after requesting a username via Tweepy.<br>
-> Can that be used for some aspect of your research?

### Marcos!

<img src="http://i0.kym-cdn.com/entries/icons/facebook/000/019/754/marco-rubio-robot-memes-2.jpg" width="300"></img>

In [7]:
# exact match
df_user[df_user['user.name'] == 'Marco Rubio']

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3605,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


In [8]:
# is us_legislators_2016 still running??? should we make a us_legislators_2018 and stop the 2017 collection?
# and the marcorubio_2016 collection???

In [9]:
# regular expression aka wildcard patter
df_user[df_user['user.name'].str.contains('marco rubio', case=False)]

Unnamed: 0,collection,date_added,user.id,user.name
3356,us_election_marcorubio_2016,2016-10-18,15745368,Marco Rubio
3369,us_election_others_2016,2016-10-18,15745368,Marco Rubio
3605,us_legislators_2016,2016-10-18,15745368,Marco Rubio
4180,us_legislators_2017,2017-02-14,15745368,Marco Rubio


## Terms

In [10]:
df_term = pd.read_csv(f_term)
df_term.head(10)

# how is this sorted?

Unnamed: 0,collection,date_added,keyword
0,Jerusalem,2017-12-08,palestine
1,Jerusalem,2017-12-08,فلسطين
2,Jerusalem,2017-12-08,القدس
3,Jerusalem,2017-12-08,jerusalem
4,arab_events_3_2016,2016-10-18,#خليك_نسر
5,arab_events_3_2016,2016-10-18,سني
6,arab_events_3_2016,2016-10-18,الشيعة
7,arab_events_3_2016,2016-10-18,حزبالله
8,arab_events_3_2016,2016-10-18,الجهاد
9,arab_events_3_2016,2016-10-18,القاعدة


In [11]:
arab_event_terms = df_term[df_term['collection'] == 'arab_events_3_2016']['keyword'].tolist()
arab_event_terms[-5:] # last 5 terms

['أقباط', 'الأردن', 'سقوط', 'لاجئ', 'مجزرة']

In [72]:
df_term.dtypes

collection            object
date_added    datetime64[ns]
keyword               object
dtype: object

In [60]:
import datetime

In [71]:
df_term['date_added'] = pd.to_datetime(df_term['date_added'])

In [73]:
df_term['date_added'][0]

Timestamp('2017-12-08 00:00:00')

In [69]:
df_term[df_term['date_added'] < datetime.datetime(2016, 6, 6)]

Unnamed: 0,collection,date_added,keyword


# Looking at The Size of Collections

In [12]:
df_meta = pd.read_csv(f_meta)
df_meta.head(3)

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
0,us_politics_pro_trump,342,50.18 GB,53875396596,2018-01-08 04:01:01,/scratch/olympus/us_politics_pro_trump/data/us...,182.58 MB
1,us_election_marcorubio_2016,452,17.48 GB,18766707310,2018-01-08 04:01:01,/scratch/olympus/us_election_marcorubio_2016/d...,6.12 MB
2,turkey_referendum_2017,348,2.88 GB,3095740629,2018-01-08 04:01:01,/scratch/olympus/turkey_referendum_2017/data/t...,19.42 KB


In [13]:
import math

def convert_size(size_bytes):
    '''
    Bytes to a human-readable format.
    '''
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)

    return "%s %s" % (s, size_name[i])

In [14]:
olympus_btyes = df_meta['collection_size_bytes'].sum()
olympus_btyes

4334735510197

In [15]:
convert_size(olympus_btyes)

'3.94 TB'

In [16]:
len(df_meta['collection'].unique())

116

In [36]:
# how big is the largest collection?
largest_collection_size = df_meta['collection_size_bytes'][16]
largest_collection_size

13509920546

In [37]:
# how do we find the largest collection?
largest_collection = df_meta[df_meta['collection_size_bytes'] == largest_collection_size]
largest_collection

Unnamed: 0,collection,collection_number_of_files,collection_size,collection_size_bytes,latest_filedate,latest_filename,latest_filesize
16,us_politics_oppose_ban,342,12.58 GB,13509920546,2018-01-08 04:30:31,/scratch/olympus/us_politics_oppose_ban/data/u...,27.77 MB


In [38]:
# how to get the latest_filedate?
largest_collection['latest_filedate'].iloc[0]

'2018-01-08 04:30:31'

In [39]:
largest_collection_file = largest_collection['latest_filename'].iloc[0]
largest_collection_file

'/scratch/olympus/us_politics_oppose_ban/data/us_politics_oppose_ban_data__01_07_2018__00_00_00__23_59_59.json.bz2'

## Let's Look at this File

In [40]:
!pip install smappdragon --upgrade

Requirement already up-to-date: smappdragon in /home/nhb228/anaconda3/lib/python3.6/site-packages
Requirement already up-to-date: pymongo>=3.2.1 in /home/nhb228/anaconda3/lib/python3.6/site-packages (from smappdragon)


In [41]:
import json
from smappdragon import JsonCollection

In [42]:
?JsonCollection()

In [43]:
collect = JsonCollection(largest_collection_file, compression='bz2', throw_error=0, verbose=1)

In [44]:
sum(1 for _ in collect.get_iterator())

48184 rows are ok.
0 rows are corrupt.


48185

In [None]:
# put in explanation of this particular for loop ^
# explanation of what collection.get_iterator() returns
# explain _ (anonymous returned variable) and what 1 does
# maybe also what sum does

TypeError: 'generator' object is not subscriptable

In [50]:
for tweet in collect.get_iterator():
    print(json.dumps(tweet, indent=4))
    break

{
    "created_at": "Sun Jan 07 05:01:08 +0000 2018",
    "place": null,
    "lang": "en",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "entities": {
        "urls": [
            {
                "expanded_url": "http://hill.cm/ZA1kTJb",
                "display_url": "hill.cm/ZA1kTJb",
                "url": "https://t.co/DC6ppgCWG3",
                "indices": [
                    115,
                    138
                ]
            }
        ],
        "hashtags": [
            {
                "text": "TheResistance",
                "indices": [
                    100,
                    114
                ]
            }
        ],
        "user_mentions": [
            {
                "id": 14247236,
                "id_str": "14247236",
                "name": "Scott Dworkin",
                "indices": [
                    3,
                    10
                ],
                "screen_name": "

In [51]:
def is_getlocated(tweet):
    if isinstance(tweet, dict) and tweet['coordinates']:
        return True
    return False

In [52]:
geo_collect = collect.set_custom_filter(is_getlocated)