In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

In [2]:
# load twitter data
with open('data/twitter-data-small.json', encoding='utf-8') as f:
    tweets = json.load(f)

# load location file
with open('data/sal.json', encoding='utf-8') as f:
    locations = json.load(f)

In [3]:
class Tweet():
    def __init__(self, tweet) -> None:
        self.set_tweet_data(tweet)
        
    def set_tweet_data(self,tweet):
        # extract place and author id from tweet json
        place = tweet['includes']['places'][0]['full_name'].lower()     
        matches = re.findall(r'\S+(?:\s+\S+)*(?=,)', place)    
        place = ''.join(matches)
        self.place = place
        
        self.author = tweet['_id']
    
    def get_place(self):
        return self.place
    
    def get_author(self):
        return self.author

# Task Combined

In [6]:
capital_cities = {
    '1gsyd': 0,
    '2gmel': 0,
    '3gbri': 0,
    '4gade': 0,
    '5gper': 0,
    '6ghob': 0,
    '7gdar': 0,
    '8acte': 0,
    '9oter': 0,       
    }

irrelevant_areas = ['1rnsw', '2rvic', '3rqld', '4rsau', '5rwau', '6rtas', '7rnte']

tweeters = {}

for tweet in tweets:
    t = Tweet(tweet)
    city = t.get_place()
    id = t.get_author()
    # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
    if city in locations and locations[city]['gcc'] not in irrelevant_areas:
        gcc_code = locations[city]['gcc']
        capital_cities[gcc_code] += 1
        if id not in tweeters:
            # (# of tweets the user made, count for each city)
            tweeters[id] = (1, {gcc_code: 1})
        else:
            tweeters[id][0] += 1
            # add if city already exist
            if tweeters[id][1][gcc_code]:
                tweeters[id][1][gcc_code] += 1
            else:
                tweeters[id][1][gcc_code] = 1
                
    

# t1
top_cities = dict(sorted(capital_cities.items(), key=lambda item: item[1], reverse=True))
print(top_cities)

# t2
t2top_tweeters = sorted(tweeters.items(), key=lambda item: item[1][0], reverse=True)[:10]
print(t2top_tweeters)

# t3
t3top_tweeters = sorted(tweeters.items(), key=lambda item: len(item[1][1]), reverse=True)[:10]
print(t3top_tweeters)

{'1gsyd': 216, '2gmel': 129, '5gper': 71, '3gbri': 46, '4gade': 20, '8acte': 14, '6ghob': 7, '7gdar': 4, '9oter': 0}
[('1412192437640916992', (1, {'1gsyd': 1})), ('1412197721105108994', (1, {'1gsyd': 1})), ('1412195208792449025', (1, {'1gsyd': 1})), ('1412185770241052672', (1, {'1gsyd': 1})), ('1412185638904889346', (1, {'1gsyd': 1})), ('1412184906029961219', (1, {'1gsyd': 1})), ('1412184968730529798', (1, {'1gsyd': 1})), ('1412184990499049473', (1, {'1gsyd': 1})), ('1412185143083556866', (1, {'1gsyd': 1})), ('1412185211400425476', (1, {'1gsyd': 1}))]
[('1412192437640916992', (1, {'1gsyd': 1})), ('1412197721105108994', (1, {'1gsyd': 1})), ('1412195208792449025', (1, {'1gsyd': 1})), ('1412185770241052672', (1, {'1gsyd': 1})), ('1412185638904889346', (1, {'1gsyd': 1})), ('1412184906029961219', (1, {'1gsyd': 1})), ('1412184968730529798', (1, {'1gsyd': 1})), ('1412184990499049473', (1, {'1gsyd': 1})), ('1412185143083556866', (1, {'1gsyd': 1})), ('1412185211400425476', (1, {'1gsyd': 1}))]


## Task 1- Count the number of tweets in each capital city and rank by most to least

In [5]:
capital_cities = {
    '1gsyd': 0,
    '2gmel': 0,
    '3gbri': 0,
    '4gade': 0,
    '5gper': 0,
    '6ghob': 0,
    '7gdar': 0,
    '8acte': 0,        
    }

irrelevant_areas = ['1rnsw', '2rvic', '3rqld', '4rsau', '5rwau', '6rtas', '7rnte', '9oter']

# take the city before the comma in full_name, iterate thru the locations and see if can match
for tweet in tweets:    
    t = Tweet()
    t.set_tweet_data(tweet)
    city = t.get_place()
    id = t.get_author()
    # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
    if city in locations and locations[city]['gcc'] not in irrelevant_areas:
        capital_cities[locations[city]['gcc']] += 1

# sort by highest to lowest
top_cities = dict(sorted(capital_cities.items(), key=lambda item: item[1], reverse=True))
top_cities

TypeError: __init__() missing 1 required positional argument: 'tweet'

## Task 2- Count the number of tweets made by each twitter user and rank by most to least

In [16]:
# NOTE: even tho spec doesnt specify that rural areas will be ignored, ED discussion forum says we ignore it
tweeters = {}

for tweet in tweets:
    if tweet['_id'] not in tweeters:
        tweeters[tweet['_id']] = 1
    else:
        tweeters[tweet['_id']] += 1
# TODO: we don't need to covert to dict??
top_tweeters = dict(sorted(tweeters.items(), key=lambda item: item[1], reverse=True))

# all tweet ids are unique in dictionary
len(top_tweeters)
len(set(top_tweeters.keys()))

715

## Task 3- Count the number of tweets made by each twitter user in certain cities and rank by number of most distinct cities

In [17]:
def get_place(tweet):
    # regex to extract the relevant precise location
    place = tweet['includes']['places'][0]['full_name'].lower()     
    matches = re.findall(r'\S+(?:\s+\S+)*(?=,)', place)    
    return ''.join(matches)

tweeters = {}
for tweet in tweets:
    if tweet['_id'] not in tweeters:
        place = get_place(tweet)
        for city in locations:
            # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
            if place == city and locations[city]['gcc'] not in irrelevant_areas:
                # (# of tweets the user made, count for each city)
                tweeters[tweet['_id']] = (1,{})
                # insert when there is a new place
                tweeters[tweet['_id']][1][locations[place]['gcc']] = 1

    else:
        place = get_place(tweet)
        for city in locations:
            # check if the place from tweet matches anything in the locations data. If not from irrelevant area and match, increment count
            if place == city and locations[city]['gcc'] not in irrelevant_areas:
                tweeters[tweet['_id']][0] += 1
                if tweeters[tweet['_id']][1][locations[place]['gcc']]:
                    # add if exist
                    tweeters[tweet['_id']][1][locations[place]['gcc']] += 1
                else:
                    # insert when there is a new place
                    tweeters[tweet['_id']][1][locations[place]['gcc']] = 1

top_tweeters = sorted(tweeters.items(), key=lambda item: len(item[1][1]), reverse=True)
top_tweeters[-9:]
# TODO: create tweet class and location class

[('1412194546075635721', (1, {'8acte': 1})),
 ('1412195093939818500', (1, {'8acte': 1})),
 ('1412196248912760835', (1, {'8acte': 1})),
 ('1412197623805603840', (1, {'8acte': 1})),
 ('1412198117932371969', (1, {'8acte': 1})),
 ('1412198454407794689', (1, {'8acte': 1})),
 ('1412185329184821253', (1, {'8acte': 1})),
 ('1412190755452424209', (1, {'8acte': 1})),
 ('1412197569585848320', (1, {'8acte': 1}))]