In [20]:
import pandas as pd
import json
import re
from collections import defaultdict as dd 

Load file

In [2]:
with open('../data/sal.json', 'rb') as f:
    sa1 = json.load(f)

with open('../data/twitter-data-small.json', 'rb') as f:
    twitter_data_small = json.load(f)

### Initial analysis on sa1

In [4]:
len(sa1)

15340

In [8]:
sa1_keys_sorted = list(sa1.keys())
sa1_keys_sorted.sort()

In [9]:
sa1_keys_sorted

['aarons pass',
 'abba river',
 'abbey',
 'abbeyard',
 'abbeywood',
 'abbotsbury',
 'abbotsford (nsw)',
 'abbotsford (qld)',
 'abbotsford (vic.)',
 'abbotsham',
 'abels bay',
 'abercorn',
 'abercrombie',
 'abercrombie river',
 'aberdare',
 'aberdeen (nsw)',
 'aberdeen (tas.)',
 'aberfeldie',
 'aberfeldy',
 'aberfoyle',
 'aberfoyle park',
 'aberglasslyn',
 'abergowrie',
 'abermain',
 'abernethy',
 'abingdon downs',
 'abington (nsw)',
 'abington (qld)',
 'acacia creek',
 'acacia gardens',
 'acacia hills (nt)',
 'acacia hills (tas.)',
 'acacia ridge',
 'acheron',
 'acland',
 'acton (act)',
 'acton (tas.)',
 'acton park (tas.)',
 'acton park (wa)',
 'ada',
 'adaminaby',
 'adams estate',
 'adamstown',
 'adamstown heights',
 'adamsvale',
 'adare',
 'adavale',
 'addington',
 'adelaide',
 'adelaide airport',
 'adelaide lead',
 'adelaide park',
 'adelaide river',
 'adelong',
 'adjungbilly',
 'advancetown',
 'adventure bay',
 'aeroglen',
 'afterlee',
 'agery',
 'agnes',
 'agnes banks',
 'agnes w

### duplicate suburbs

We want to create a duplicate sal dictonary ("dup_sal_dict") which stores all suburb names that occurs several times in different states. 
For example, Abbotsford in New South Wales, Victoria, Queensland etc.

Sal format:  
"""  
abbotsford (nsw)  
abbotsford (qld)  
abbotsford (vic.)     
"""  

convert it to:  
"""  
dup_sal_dict = {'abbotsford': {'nsw': 'abbotsford (nsw)', 'qld': 'abbotsford (qld)', 'vic': 'abbotsford (vic.)'}}  
"""  



In [79]:
dup_sal_dict = {}
for sal in sa1_keys_sorted:
    # for a duplicate suburb, there will be brackets indicating the state
    if "(" in sal:
        dic = {}
        suburb = ""
        state = ""
        for word in sal.split():
            # the word included in brackets implys that it is a state name
            if word.startswith("("):
                # use slicing to extract the state from the bracket
                state = word[word.find('(')+1:word.find(')')]
                # remove punctuations (especially period) if there is any
                state = re.sub(r'[^\w\s]', '', state)
            # not included in () means that it is a suburb name or part of the suburb name
            else:
                suburb += word
        dic[state] = sal

        # if this suburb already exist in dictionary, we update more state
        if suburb in dup_sal_dict.keys():
            dup_sal_dict[suburb][state] = sal
        # if this suburb has not exist, we add it into dictionary.
        else:
            dup_sal_dict[suburb]=dic


In [80]:
# check result
print(dup_sal_dict)
print(dup_sal_dict['richmond'])

{'abbotsford': {'nsw': 'abbotsford (nsw)', 'qld': 'abbotsford (qld)', 'vic': 'abbotsford (vic.)'}, 'aberdeen': {'nsw': 'aberdeen (nsw)', 'tas': 'aberdeen (tas.)'}, 'abington': {'nsw': 'abington (nsw)', 'qld': 'abington (qld)'}, 'acaciahills': {'nt': 'acacia hills (nt)', 'tas': 'acacia hills (tas.)'}, 'acton': {'act': 'acton (act)', 'tas': 'acton (tas.)'}, 'actonpark': {'tas': 'acton park (tas.)', 'wa': 'acton park (wa)'}, 'albertpark': {'sa': 'albert park (sa)', 'vic': 'albert park (vic.)'}, 'alberton': {'qld': 'alberton (qld)', 'sa': 'alberton (sa)', 'tas': 'alberton (tas.)', 'vic': 'alberton (vic.)'}, 'albion-qld)': {'brisban': 'albion (brisbane - qld)', 'richmon': 'albion (richmond - qld)'}, 'albion': {'vic': 'albion (vic.)'}, 'alexandra': {'qld': 'alexandra (qld)', 'vic': 'alexandra (vic.)'}, 'alisoncoast-nsw)': {'centra': 'alison (central coast - nsw)'}, 'alison-nsw)': {'dungo': 'alison (dungog - nsw)'}, 'allandale': {'nsw': 'allandale (nsw)', 'qld': 'allandale (qld)'}, 'alligator

In [53]:
dup_sal_dist = []
for sal in sa1_keys_sorted:
    # for a duplicate suburb, there will be brackets indicating the  
    if "(" in sal:
        for word in sal.split():
            suburb = ""
            state = ""
            # the word included in brackets implys that it is a state name
            if word.startswith("("):
                state += word
            # not included in () means that it is a suburb name or part of the suburb name
            else:
                suburb += word


### duplicate suburbs end

In [7]:
for key in sa1_keys_sorted:
    print(key)

aarons pass
abba river
abbey
abbeyard
abbeywood
abbotsbury
abbotsford (nsw)
abbotsford (qld)
abbotsford (vic.)
abbotsham
abels bay
abercorn
abercrombie
abercrombie river
aberdare
aberdeen (nsw)
aberdeen (tas.)
aberfeldie
aberfeldy
aberfoyle
aberfoyle park
aberglasslyn
abergowrie
abermain
abernethy
abingdon downs
abington (nsw)
abington (qld)
acacia creek
acacia gardens
acacia hills (nt)
acacia hills (tas.)
acacia ridge
acheron
acland
acton (act)
acton (tas.)
acton park (tas.)
acton park (wa)
ada
adaminaby
adams estate
adamstown
adamstown heights
adamsvale
adare
adavale
addington
adelaide
adelaide airport
adelaide lead
adelaide park
adelaide river
adelong
adjungbilly
advancetown
adventure bay
aeroglen
afterlee
agery
agnes
agnes banks
agnes water
ainslie
aintree
airdmillan
airds
aire valley
aireys inlet
airlie beach
airly
airport west
airville
aitkenvale
ajana
akaroa
akolele
alabama hill
alawa
alawoona
albacutya
albanvale
albany
albany creek
albert
albert park (sa)
albert park (vic.)
alb

In [None]:
sa1['melbourne']

{'ste': '2', 'gcc': '2gmel', 'sal': '21640'}

In [None]:
gcc_tally = dd(int)
ste_tally = dd(int)

for key in sa1:
    gcc_tally[sa1[key]['gcc']] += 1
    ste_tally[sa1[key]['ste']] += 1

gcc_tally
# gcc distribution and set

# 1 nsw 2 vic 3 qld 4 sa 5 wa 6 tas 7 nt 8 act 9 other

# 基本上第二个是 g 的就能算；除非第二个是a（act算吗？）

defaultdict(int,
            {'1gsyd': 920,
             '1rnsw': 3623,
             '2gmel': 572,
             '2rvic': 2373,
             '3gbri': 647,
             '3rqld': 2587,
             '4gade': 494,
             '4rsau': 1202,
             '5gper': 397,
             '5rwau': 1303,
             '6ghob': 120,
             '6rtas': 656,
             '7gdar': 102,
             '7rnte': 202,
             '8acte': 137,
             '9oter': 5})

删掉r的sal？ （利用error)

In [None]:
ste_tally
# state distribution and set

defaultdict(int,
            {'1': 4543,
             '2': 2945,
             '3': 3234,
             '4': 1696,
             '5': 1700,
             '6': 776,
             '7': 304,
             '8': 137,
             '9': 5})

In [None]:
sa1_list = [x for x in sa1]

In [None]:
sa1_list[:2]
# 需要去除掉 state (但同时也是一个很好的窗口去试图match it back to correct place)

['abbotsbury', 'abbotsford (nsw)']

In [None]:
sa1_list_clean = [x.split('(')[0] for x in sa1]

In [None]:
sa1_list_clean[:2]

['abbotsbury', 'abbotsford ']

Initial analysis on twitter

In [None]:
len(twitter_data_small)

715

In [None]:
twitter_data_small[0].keys()

dict_keys(['_id', '_rev', 'data', 'includes', 'matching_rules'])

In [10]:
places_tally = dd(int)

for tweet in twitter_data_small:
    places_tally[tweet['includes']['places'][0]['full_name']] += 1

In [11]:
places_tally_list = list(places_tally)
places_tally_list.sort()
places_tally

defaultdict(int,
            {'Australia': 2,
             'New South Wales, Australia': 15,
             'Central Coast, New South Wales': 6,
             'Nelson Bay - Corlette, New South Wales': 1,
             'Eveleigh, Sydney': 1,
             'Kirribilli, Sydney': 1,
             'Macquarie Park, Sydney': 1,
             'Picton, New South Wales': 3,
             'Sydney, New South Wales': 213,
             'Bathurst, New South Wales': 1,
             'Byron Bay, New South Wales': 1,
             'Cambewarra Village, New South Wales': 2,
             'Coffs Harbour, New South Wales': 3,
             'Gloucester, New South Wales': 1,
             'Helensburgh, New South Wales': 4,
             'Kempsey, New South Wales': 6,
             'Manilla, New South Wales': 2,
             'Murwillumbah': 1,
             'Newcastle, New South Wales': 17,
             'Queanbeyan, New South Wales': 3,
             'Tamworth, New South Wales': 1,
             'Toronto, Ontario': 3,
         

In [12]:
data_sa1_list = list()
for sa1_unfiltered in places_tally:
    try:
        data_sa1_list.append(sa1_unfiltered.split(',')[0])
    except:
        data_sa1_list.append(sa1_unfiltered)

In [13]:
count1 = 0
for tweet_sa1 in data_sa1_list:
    if tweet_sa1 not in sa1_list:
        print(tweet_sa1)
        count1 += 1
print(count1)

NameError: name 'sa1_list' is not defined

In [None]:
count2 = 0
for tweet_sa1 in data_sa1_list:
    if tweet_sa1 not in sa1_list_clean:
        print(tweet_sa1)
        count2 += 1
print(count2)

Australia
New South Wales
Central Coast
Nelson Bay - Corlette
Eveleigh
Kirribilli
Macquarie Park
Picton
Sydney
Bathurst
Byron Bay
Cambewarra Village
Coffs Harbour
Gloucester
Helensburgh
Kempsey
Manilla
Murwillumbah
Newcastle
Queanbeyan
Tamworth
Toronto
Tweed Heads
Whitton
Wollongong
Torquay - Jan Juc
Victoria
Shepparton - Mooroopna
Gisborne
Melbourne
Melton
Pakenham
Scoresby
Sunbury
Sunshine
Windsor
Anglesea
Ballarat
Bendigo
Geelong
Kilmore
Mansfield
Wangaratta
Warrnambool
Gold Coast
Sunshine Coast
Queensland
Rockhampton
Cairns
Airlie Beach - Cannonvale
Toowoomba
Townsville
Doonan - Tinbeerwah
Brisbane
Ayr
Clermont
Mossman Gorge
Tamborine Mountain
Westbrook
Crafers - Bridgewater
South Australia
Adelaide
Auburn
Carrickalinga
Western Australia
Perth
Serpentine
Broome
Margaret River
Tasmania
Hobart
Launceston
Darwin
Ghan
Canberra
Braddon
Fyshwick
77
