In [26]:
import urllib2 
import shutil
import os.path
import json

# Remote location of the raw dataset: file host plus the dataset-specific path.
url_base = 'http://files.figshare.com/'
url_path = '1114090/usagov_bitly_data2012_11_06_total.json'
url = url_base + url_path

In [20]:
# Local cache of the dataset. The download is skipped when the file is already
# present, so re-running this cell is cheap.
file = 'usagov_data.json'

if os.path.isfile(file):
    print('file ' + file + ' already exists')
else:
    print('\n Downloading ' + file + ' from ' + url)
    try:
        request = urllib2.urlopen(url)
        try:
            # Stream the response body to disk without loading it all in memory.
            with open(file, 'wb') as f:
                shutil.copyfileobj(request, f)
        finally:
            # Release the network connection whether or not the copy succeeded.
            request.close()
    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError that carries a status code, so
        # test for `code` first; otherwise an HTTP error could be reported as
        # a connection failure.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request')
            # e.code is an int; convert before concatenating.
            print('Error code: ' + str(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ' + str(e.reason))
    else:
        # Only report success when no exception was raised.
        print('\n File downloaded successfully.')


 Downloading usagov_data.json from http://files.figshare.com/1114090/usagov_bitly_data2012_11_06_total.json

 File downloaded successfully.


In [23]:
# Load the whole file into memory as a list of raw lines.
with open(file, 'rb') as f:
    data = list(f)

Since all the data fits into memory, we can transform the list of strings into a list of dictionaries using `map()`.

In [59]:
# Parse every raw line with json.loads. The result is sliced below, so this
# relies on Python 2's map() returning a list.
records = map(json.loads, data)
# Show only the first parsed record instead of dumping the whole list.
records[:1]

[{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4',
  u'al': u'vi-VN,vi;q=0.8,fr-FR;q=0.6,fr;q=0.4,en-US;q=0.2,en;q=0.2',
  u'c': None,
  u'g': u'Ut1gpJ',
  u'h': u'Ut1gpJ',
  u'hc': 1348583821,
  u'hh': u'1.usa.gov',
  u'l': u'bitly',
  u'nk': 0,
  u'r': u'http://www.facebook.com/groups/133327633404542/394748863929083/?ref=notif&notif_t=group_activity',
  u't': 1352163133,
  u'tz': u'',
  u'u': u'http://www.ncbi.nlm.nih.gov/Structure/cdd/docs/cdd_news.html?campaign=facebook-092542012a'}]

If we're dealing with a larger dataset, we may want to perform online analytics, which means manipulating only one record at a time using `imap`, the iterator version of `map`, from the `itertools` module.

In [56]:
from itertools import imap
# imap applies json.loads lazily: a line is parsed only when the iterator is
# advanced, so no list of parsed records is built here.
irecords = imap(json.loads, data)
# Displaying the name shows the iterator object, not the records.
irecords

<itertools.imap at 0x109900e50>

`imap()` returns an iterator which yields one item at a time, so indexing is **not** possible. Instead, we call its `next()` method.

In [39]:
# Pull the next parsed record from the iterator. Each call consumes one item,
# so re-running this cell yields a different record.
record = irecords.next()
record

{u'a': u'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
 u'al': u'en-US',
 u'c': u'US',
 u'cy': u'Terry',
 u'g': u'bREk8i',
 u'gr': u'MS',
 u'h': u'dUvbu5',
 u'hc': 1303210724,
 u'hh': u'1.usa.gov',
 u'l': u'milt5658',
 u'll': [32.130001, -90.338799],
 u'nk': 0,
 u'r': u'http://www.facebook.com/l.php?u=http%3A%2F%2F1.usa.gov%2FdUvbu5&h=RAQFJXWOuAQE_mRuIlvCEGZornSmI_ZQBw0xOBw5nvl1qfA&s=1',
 u't': 1352163133,
 u'tz': u'America/Chicago',
 u'u': u'http://www.whitehouse.gov/the_press_office/economy_in_government_contracting'}

In [41]:

# Serialize the record back to JSON text with 4-space indentation and print it.
pretty = json.dumps(record, indent=4)
print(pretty)


{
    "a": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", 
    "c": "US", 
    "nk": 0, 
    "tz": "America/Chicago", 
    "gr": "MS", 
    "g": "bREk8i", 
    "h": "dUvbu5", 
    "cy": "Terry", 
    "l": "milt5658", 
    "al": "en-US", 
    "hh": "1.usa.gov", 
    "r": "http://www.facebook.com/l.php?u=http%3A%2F%2F1.usa.gov%2FdUvbu5&h=RAQFJXWOuAQE_mRuIlvCEGZornSmI_ZQBw0xOBw5nvl1qfA&s=1", 
    "u": "http://www.whitehouse.gov/the_press_office/economy_in_government_contracting", 
    "t": 1352163133, 
    "hc": 1303210724, 
    "ll": [
        32.130001, 
        -90.338799
    ]
}


In [42]:
# To make the record more readable, we can store the value of the terse 'a'
# key under a descriptive key as well. The original 'a' key is kept here.

record['User Agent'] = record['a']

In [43]:
# Look the value up through the new, descriptive key.
record['User Agent']

u'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'

We can define a function that does header conversion on data load.

In [44]:
def process_record(record):
    """Rename the terse 'a' key of a record to 'User Agent'.

    The record is modified in place and the same dict object is returned, so
    the function can be passed directly to map()/imap().

    Records do not all carry the same keys, so a record without an 'a' key is
    returned unchanged instead of raising KeyError.
    """
    if 'a' in record:
        # pop() reads the value and removes the old key in one step.
        record['User Agent'] = record.pop('a')
    return record

Load the dataset one record at a time using `imap()`. Process each record and change the header.


In [48]:
# Build a lazy two-stage pipeline: parse each line, then rename its key.
# Nothing is parsed or renamed until the iterator is advanced.
irecords = imap(json.loads, data)
irecords = imap(process_record, irecords)

# Call next and observe that the key has changed.
irecords.next()

{'User Agent': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4',
 u'al': u'vi-VN,vi;q=0.8,fr-FR;q=0.6,fr;q=0.4,en-US;q=0.2,en;q=0.2',
 u'c': None,
 u'g': u'Ut1gpJ',
 u'h': u'Ut1gpJ',
 u'hc': 1348583821,
 u'hh': u'1.usa.gov',
 u'l': u'bitly',
 u'nk': 0,
 u'r': u'http://www.facebook.com/groups/133327633404542/394748863929083/?ref=notif&notif_t=group_activity',
 u't': 1352163133,
 u'tz': u'',
 u'u': u'http://www.ncbi.nlm.nih.gov/Structure/cdd/docs/cdd_news.html?campaign=facebook-092542012a'}

### Use a generator

In [51]:
# Same parse-then-rename pipeline written as a generator expression; lines are
# processed only as the generator is iterated.
# Note: this rebinds `records`, replacing whatever it referred to before.
records = (process_record(json.loads(line)) for line in data)
records

<generator object <genexpr> at 0x10f141f50>

In [50]:
# Evaluating the name only displays the generator object; no items are consumed.
records

<generator object <genexpr> at 0x1107463c0>

## Data analysis

We wish to extract the timezones but some records do not have timezones.

In [63]:
# Re-parse every line into a list of dicts (Python 2's map() returns a list).
records = map(json.loads, data)
# Use .get() so a record without a 'tz' key contributes '' and is left
# untouched; setdefault() would also write 'tz': '' into that record.
tz = [record.get('tz', '') for record in records]
# Preview the first nine timezones.
tz[:9]

[u'',
 u'Europe/Prague',
 u'America/Chicago',
 u'America/New_York',
 u'America/Rainy_River',
 u'America/New_York',
 u'America/Los_Angeles',
 u'America/New_York',
 u'America/New_York']

Find most popular `URL` by counting the frequency of clicks.

In [81]:
from collections import Counter

# Collect the target URL of every record that has one.
urls = []
for record in records:
    if 'u' in record:
        urls.append(record['u'])

# Tally the clicks per URL and rank the URLs from most to least frequent.
count_url = Counter(urls)
most_common = count_url.most_common()

# The first entry is a (URL, count) tuple for the most-clicked URL.
most_common[0]

(u'http://www.nasa.gov/mission_pages/asteroids/news/asteroid20121105.html',
 1479)