In [3]:
### Get boroughs from tweet locations ###
### Programmer: Dan Qin ###
### Date: 26.05.2018    ###

# This code is used to retrieve borough locations from tweet data extracted by methods in the following post:
# http://www.mikaelbrunila.fi/2017/03/27/scraping-extracting-mapping-geodata-twitter/
# (in which the author scrapes geodata from twitter API and saves it as "primary_geo")


# THEORY:
# Borough locations will be extracted from the "primary_geo" column. The values in "primary_geo" are basically:
# 1)the exact coordinates from where the tweet was created,
# 2)place tags the user chose from a list of candidate Twitter Places when they tweeted,
# 3)or locations provided in the user profile

# METHOD:
# 1) check the address in "primary_geo", if a borough is mentioned, store the borough name
# 2) if an area is mentioned, store the borough it is referenced to
# 3) if it's a specific place, return the coordinates using public geocoding API
# 4) finally, check whether the coordinate points are within borough polygons
# 5) points with unclear locations or located out of London boundary are excluded

# p.s.Versions of libraries used in the code are printed at the end.

#### Import libraries and data ####
import pandas as pd
import numpy as np
import json


In [4]:
# Load the twitter data into `data_raw` (the loading code is omitted from this notebook)
# and preview the first rows.
data_raw.head()

Unnamed: 0,user_id,screen_name,created_at,timestamp,text_tweet,amount_tweeted,language,location,primary_geo
0,2324423000.0,WeatherWoking,Thu Apr 19 17:00:00 +0000 2018,1524157000000.0,Tmp 26.1°C Wind 8mph Press 1014.0mb Cloud 8584...,31.0,en,"Woking, South East","51.33222222, -0.55777778"
1,152434300.0,NewhavenTownWx,Thu Apr 19 17:00:01 +0000 2018,1524157000000.0,"Wind 0.0 kts N. Barometer 1022.6 hPa, Falling ...",6.0,en,,"50.8, 0.04666667"
2,404401400.0,ThurrockWeather,Thu Apr 19 17:00:01 +0000 2018,1524157000000.0,"19/1800 \r\nFcast:Fairly fine, occasional show...",6.0,en,"South Ockendon, Essex","51.49972222, 0.25027778"
3,2789607000.0,ProforcaTheatre,Thu Apr 19 17:00:02 +0000 2018,1524157000000.0,Delighted to have @em_c_wroe and @MitchellReev...,1.0,en,London,"Hackney, London, United Kingdom"
4,72431970.0,ElmsteadWeather,Thu Apr 19 17:00:03 +0000 2018,1524157000000.0,"Wind 2.2 mph S\r\nBarometer 1037.7 mb,Falling ...",10.0,en,"Elmstead, Essex","51.87472222, 0.98694444"


In [5]:
# Count the non-null values per column: this gives the total number of tweets
# and shows which columns contain missing values.
data_raw.count()

user_id           11598
screen_name       11598
created_at        11598
timestamp         11598
text_tweet        11598
amount_tweeted    11598
language          11598
location          10220
primary_geo       11598
dtype: int64

In [6]:
# Pull the "primary_geo" column out of the tweets; it holds either a coordinate
# string or a place description, and is the input of the borough tagging below.
twtloc = data_raw["primary_geo"]
twtloc.describe()

count                               11598
unique                               1632
top       London, England, United Kingdom
freq                                  772
Name: primary_geo, dtype: object

In [7]:
# London borough shapes, parsed from GeoJSON into a plain dict.
# GeoJSON is UTF-8 encoded, so state the encoding explicitly instead of relying on
# the platform default (which is not UTF-8 on Windows).
with open('data/london_boroughs.geojson', encoding='utf-8') as f:
    BoroughShp = json.load(f)

In [8]:
# Borough polygons: one row per GeoJSON feature, holding the borough outline
# ("Polygon") and the borough name ("Borough").
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point


columns = ["Polygon","Borough"]
index = range(len(BoroughShp["features"]))

# Build both columns up front and create the frame in a single step. Filling an
# empty frame through chained indexing (polys["Polygon"].iloc[i] = ...) writes to an
# intermediate object, which pandas does not guarantee to propagate to `polys`.
#
# NOTE(review): only coordinates[0][0] is used, i.e. the first ring of the first
# part of each geometry — presumably every feature is a MultiPolygon whose first
# part is the whole borough; additional parts or holes would be ignored. Confirm
# against the GeoJSON file.
polys = pd.DataFrame(
    {
        "Polygon": [
            Polygon(feature["geometry"]["coordinates"][0][0])
            for feature in BoroughShp["features"]
        ],
        "Borough": [
            feature["properties"]["name"]
            for feature in BoroughShp["features"]
        ],
    },
    columns = columns,
    index = index,
)

polys.describe()

Unnamed: 0,Polygon,Borough
count,33,33
unique,33,33
top,"POLYGON ((-0.139075 51.41929, -0.139359 51.419...",Hounslow
freq,1,1


In [9]:
#### Preparation ####
# Location names at region level or above. A tweet whose location starts with one
# of these is too coarse to be assigned to a single borough.
region_names = [
    "East",
    "South",
    "South East",
    "London",
    "England",
    "United Kingdom",
]

In [1]:
# Towns surrounding London.
# Only towns in the counties that share a border with London are kept:
# Herts, Essex, Kent, Surrey, Berkshire and Bucks.
# NOTE(review): "Berkshier" below looks like a misspelling of "Berkshire"; it is kept
# as-is because it must match the spelling used in the County column of
# UK_towns.csv — confirm against the file.
neighbour_counties = ["Herts", "Essex", "Kent","Surrey", "Berkshier", "Bucks"]

town_temp = pd.read_csv("data/UK_towns.csv")

# Boolean mask of the rows whose county is one of the neighbouring counties
in_neighbour_county = town_temp["County"].isin(neighbour_counties)
town_names = town_temp.loc[in_neighbour_county, "Town"].tolist()

town_names


In [12]:
# Lookup table of areas in London. Its "Area" and "Borough" columns are used
# further down to map an area name to the borough it belongs to.
area_names = pd.read_csv("data/London_areas.csv")
area_names.head()

Unnamed: 0,Area,Borough,PostTown,Postcode,Dialcode
0,Barking,Barking and Dagenham,BARKING,IG11,20
1,Becontree,Barking and Dagenham,DAGENHAM,RM9,20
2,Becontree Heath,Barking and Dagenham,DAGENHAM,RM8,20
3,Castle Green,Barking and Dagenham,DAGENHAM,RM9,20
4,Creekmouth,Barking and Dagenham,BARKING,IG11,20


In [13]:
# Borough names, in the order the features appear in the GeoJSON
borough_names = [feature["properties"]["name"] for feature in BoroughShp["features"]]

print(borough_names)

['Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley', 'Camden', 'City of London', 'Croydon', 'Ealing', 'Enfield', 'Greenwich', 'Hackney', 'Hammersmith and Fulham', 'Haringey', 'Harrow', 'Havering', 'Hillingdon', 'Hounslow', 'Islington', 'Kensington and Chelsea', 'Kingston upon Thames', 'Lambeth', 'Lewisham', 'Merton', 'Newham', 'Redbridge', 'Richmond upon Thames', 'Southwark', 'Sutton', 'Tower Hamlets', 'Waltham Forest', 'Wandsworth', 'Westminster']


In [16]:
#### Get borough data from twitter locations ####
import os

from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3

# The Google Geocoding API key is read from the GOOGLE_API_KEY environment variable
# so that it is never stored in the notebook. While the variable is unset, free-text
# places are only counted (geo_count "place", borough None) and no request is sent.
# This allows checking the number of places against the API limit first, then
# setting the key and re-running this cell to actually geocode them.
# (Alternative: OSM Nominatim, 1 request/sec, e.g.
#  Nominatim(format_string="%s, London").geocode(place_name).)
api_k = os.environ.get("GOOGLE_API_KEY")


def _parse_coord(parts):
    """Return (lat, lon) as floats if the first two parts are numeric, else None.

    `parts` is the location string split on commas. Fewer than two parts, or a
    non-numeric part, means the location is free text rather than coordinates.
    """
    if len(parts) < 2:
        return None
    try:
        return (float(parts[0]), float(parts[1]))
    except ValueError:
        return None


# Lookup structures, built once instead of being scanned for every tweet
borough_lookup = set(borough_names)
town_lookup = set(town_names)
region_lookup = set(region_names)
# If an area name is listed more than once, its first borough is used
area_to_borough = (
    area_names.drop_duplicates(subset="Area")
    .set_index("Area")["Borough"]
    .to_dict()
)

borough = [0]* len(twtloc) # list to store borough tags
geo_count = [0]* len(twtloc) # attribute of geo locations

geolocator = None   # created lazily, only if something has to be geocoded
geocode_cache = {}  # location string -> (lat, lon) or None; avoids repeated requests

# loop through location data
for i, raw_loc in enumerate(twtloc):
    # missing value (NaN) in twtloc
    if not isinstance(raw_loc, str):
        borough[i] = None
        geo_count[i] = "vagueLoc"
        continue

    loc_parts = raw_loc.split(",")

    # if coordinates, keep them; they are matched to boroughs in a later cell
    coord = _parse_coord(loc_parts)
    if coord is not None:
        borough[i] = coord
        geo_count[i] = "coord"
        continue

    # otherwise a text location: look at the words before the first comma
    place_name = loc_parts[0].strip()

    # if a borough, keep it(geo_count:"borough")
    if place_name in borough_lookup:
        borough[i] = place_name
        geo_count[i] = "borough"

    # if an area, refer to the borough it belongs(geo_count:"borough")
    elif place_name in area_to_borough:
        borough[i] = area_to_borough[place_name]
        geo_count[i] = "borough"

    # if a town outside London, convert it to null(geo_count:"outofLdn")
    elif place_name in town_lookup:
        borough[i] = None
        geo_count[i] = "outofLdn"

    # if direction names, convert it to null(geo_count:"vagueLoc")
    elif place_name in region_lookup:
        borough[i] = None
        geo_count[i] = "vagueLoc"

    # no API key: only count the place, do not geocode(geo_count:"place")
    elif not api_k:
        borough[i] = None
        geo_count[i] = "place"

    # else, convert place names to coordinates(geo_count:"place")
    else:
        if raw_loc not in geocode_cache:
            if geolocator is None:
                geolocator = GoogleV3(api_key = api_k,timeout=5, domain="maps.google.co.uk")
            location = geolocator.geocode(raw_loc)
            # geocode() returns None when the service finds no match
            geocode_cache[raw_loc] = (
                None if location is None
                else (location.latitude,location.longitude)
            )

        place_coord = geocode_cache[raw_loc]
        if place_coord is None:
            # the place could not be located, treat it as an unclear location
            borough[i] = None
            geo_count[i] = "vagueLoc"
        else:
            borough[i] = place_coord
            geo_count[i] = "place"
        

In [15]:
# Tally how many tweets fall into each geo category
geo_check = pd.DataFrame({'value_count': geo_count}).fillna(value=np.nan)
geo_check["value_count"].value_counts()

borough     6621
coord       2002
vagueLoc    1982
place        662
outofLdn     331
Name: value_count, dtype: int64

Locations given only at regional level or above ("vagueLoc") make up about 17% of the total (after eliminating tweets outside the London boundary), so consider dropping them.

In [65]:
# number of place locations is under API limit
# uncomment "Option" and run "Get borough data" again

# save the borough tags to file if needed
# borough_check.to_csv('data/borough_tag.csv')

In [17]:
# Resolve coordinate tags to borough names with point-in-polygon tests.
# Work on a shallow copy so the original tags in `borough` are left untouched.
borough_cp = list(borough)

# (geometry, borough name) pairs, in the row order of `polys`
borough_shapes = list(zip(polys.Polygon, polys.Borough))

for idx, tag in enumerate(borough):
    # Only coordinate tuples need resolving; borough names and None pass through.
    # With no polygons to test against, tags are left unchanged as well.
    if type(tag) != tuple or not borough_shapes:
        continue

    # Tags are (latitude, longitude) pairs, the point takes (x, y) = (lon, lat)
    pt = Point(tag[1], tag[0])
    for shape_geom, shape_name in borough_shapes:
        if pt.within(shape_geom):
            # first borough containing the point wins
            borough_cp[idx] = shape_name
            geo_count[idx] = "borough"
            break
    else:
        # no borough contains the point: it is outside London
        borough_cp[idx] = None
        geo_count[idx] = "outofLdn"
        

In [18]:
# Tally the geo categories again, now that coordinates have been resolved to boroughs
borough_check2 = pd.DataFrame(
    {'borough': borough_cp, 'value_count': geo_count}
).fillna(value=np.nan)
borough_check2["value_count"].value_counts()

borough     8526
vagueLoc    1982
outofLdn    1090
Name: value_count, dtype: int64

In [19]:
# Attach the resolved borough tags to the tweets as a new "borough" column
data_raw["borough"] = borough_check2["borough"].values

# Keep only the tweets that have a borough; `data_raw` itself keeps every row
data_clean = data_raw.copy().dropna(subset=["borough"])

In [20]:
# Summary statistics of the cleaned data; the "count" row is the number of tweets left
data_clean.describe()

Unnamed: 0,user_id,timestamp,amount_tweeted
count,8526.0,8526.0,8526.0
mean,7.533952e+16,1524329000000.0,2.080929
std,2.416661e+17,273134400.0,3.343417
min,13536.0,1524157000000.0,1.0
25%,80061650.0,1524162000000.0,1.0
50%,318668400.0,1524168000000.0,1.0
75%,1487224000.0,1524780000000.0,2.0
max,9.89629e+17,1524810000000.0,103.0


In [21]:
# Save the cleaned tweets as CSV, without writing the row index.
# NOTE(review): `file_output` is not defined in the visible cells — presumably it is
# set in the omitted data-loading code; confirm before running.
data_clean.to_csv(file_output,index = False)

As a result, 8526 tweets remain with a location inside a London borough (matching the counts shown above).

In [160]:
# Report the interpreter and library versions this notebook was run with
import sys
import geopy

for label, version in (
    ("Python", sys.version),
    ("Pandas", pd.__version__),
    ("Numpy", np.__version__),
    ("Shapely", shapely.__version__),
    ("Geopy", geopy.__version__),
):
    print("{} version:{}".format(label, version))

Python version:3.6.2 (v3.6.2:5fd33b5, Jul  8 2017, 04:57:36) [MSC v.1900 64 bit (AMD64)]
Pandas version:0.22.0
Numpy version:1.14.2
Shapely version:1.6.4.post1
Geopy version:1.13.0
