In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

In [4]:
# load twitter data
with open('data/twitter-data-small.json', encoding='utf-8') as f:
    tweets = json.load(f)

# load location file
with open('data/sal.json', encoding='utf-8') as f:
    locations = json.load(f)

## Task 1- Count the number of tweets in each capital city and rank by most to least

In [5]:
capital_cities = {
    '1gsyd': 0,
    '2gmel': 0,
    '3gbri': 0,
    '4gade': 0,
    '5gper': 0,
    '6ghob': 0,
    '7gdar': 0,
    '8acte': 0,        
    }

irrelevant_areas = ['1rnsw', '2rvic', '3rqld', '4rsau', '5rwau', '6rtas', '7rnte', '9oter']

# take the city before the comma in full_name, iterate thru the locations and see if can match
for tweet in tweets:    
    # regex to extract the relevant precise location
    place = tweet['includes']['places'][0]['full_name'].lower()     
    matches = re.findall(r'\S+(?:\s+\S+)*(?=,)', place)    
    place = ''.join(matches)
    
    for city in locations:
        # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
        if place == city and locations[city]['gcc'] not in irrelevant_areas:
            capital_cities[locations[place]['gcc']] += 1

# sort by highest to lowest
top_cities = dict(sorted(capital_cities.items(), key=lambda item: item[1], reverse=True))
top_cities

{'1gsyd': 216,
 '2gmel': 129,
 '5gper': 71,
 '3gbri': 46,
 '4gade': 20,
 '8acte': 14,
 '6ghob': 7,
 '7gdar': 4}

## Task 2- Count the number of tweets made by each twitter user and rank by most to least

In [6]:
# NOTE: even tho spec doesnt specify that rural areas will be ignored, ED discussion forum says we ignore it
tweeters = {}

for tweet in tweets:
    if tweet['_id'] not in tweeters:
        tweeters[tweet['_id']] = 1
    else:
        tweeters[tweet['_id']] += 1
# TODO: we don't need to covert to dict??
top_tweeters = dict(sorted(tweeters.items(), key=lambda item: item[1], reverse=True))

# all tweet ids are unique in dictionary
len(top_tweeters)
len(set(top_tweeters.keys()))

715

## Task 3- Count the number of tweets made by each twitter user in certain cities and rank by number of most distinct cities

In [7]:
def get_place(tweet):
    # regex to extract the relevant precise location
    place = tweet['includes']['places'][0]['full_name'].lower()     
    matches = re.findall(r'\S+(?:\s+\S+)*(?=,)', place)    
    return ''.join(matches)

tweeters = {}
for tweet in tweets:
    if tweet['_id'] not in tweeters:
        place = get_place(tweet)
        for city in locations:
            # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
            if place == city and locations[city]['gcc'] not in irrelevant_areas:
                tweeters[tweet['_id']] = (1,{})
                tweeters[tweet['_id']][1][locations[place]['gcc']] = 1

    else:
        place = get_place(tweet)
        for city in locations:
            # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
            if place == city and locations[city]['gcc'] not in irrelevant_areas:
                tweeters[tweet['_id']][0] += 1
                if tweeters[tweet['_id']][1][locations[place]['gcc']]:
                    tweeters[tweet['_id']][1][locations[place]['gcc']] += 1
                else:
                    tweeters[tweet['_id']][1][locations[place]['gcc']] = 1

top_tweeters = sorted(tweeters.items(), key=lambda item: len(item[1][1]), reverse=True)
top_tweeters[9:]
# TODO: create tweet class and location class

[('1412185211400425476', (1, {'1gsyd': 1})),
 ('1412185228487979012', (1, {'1gsyd': 1})),
 ('1412185306057510912', (1, {'1gsyd': 1})),
 ('1412185361799782400', (1, {'1gsyd': 1})),
 ('1412185463838830592', (1, {'1gsyd': 1})),
 ('1412185507832860679', (1, {'1gsyd': 1})),
 ('1412185554955866122', (1, {'1gsyd': 1})),
 ('1412185754629906436', (1, {'1gsyd': 1})),
 ('1412185785072119809', (1, {'1gsyd': 1})),
 ('1412185889086705666', (1, {'1gsyd': 1})),
 ('1412185918614622209', (1, {'1gsyd': 1})),
 ('1412185948041846785', (1, {'1gsyd': 1})),
 ('1412185971181846528', (1, {'1gsyd': 1})),
 ('1412186219086110722', (1, {'1gsyd': 1})),
 ('1412186318482771968', (1, {'1gsyd': 1})),
 ('1412186459537231878', (1, {'1gsyd': 1})),
 ('1412189315329716227', (1, {'1gsyd': 1})),
 ('1412189417272315904', (1, {'1gsyd': 1})),
 ('1412189432711503874', (1, {'1gsyd': 1})),
 ('1412189492958556162', (1, {'1gsyd': 1})),
 ('1412189530292031490', (1, {'1gsyd': 1})),
 ('1412189660470669314', (1, {'1gsyd': 1})),
 ('1412189