# Data Cleaning and Pre-Processing

### Import libraries needed

In [79]:
from math import sqrt
import pandas as pd
import json
import requests
import random
import folium
from shapely.geometry import Point
from shapely.geometry import shape
from bs4 import BeautifulSoup
import re

Credentials for map search later

In [184]:
# Foursquare credentials
with open('credentials.json') as f:
    creds = json.load(f)

    CLIENT_ID = creds[0]['CLIENT_ID']
    CLIENT_SECRET = creds[0]['CLIENT_SECRET']

VERSION =  "20200101"
LIMIT = 100

In [185]:
crime_df = pd.read_csv('MPS Borough Level Crime (most recent 24 months).csv')
crime_df.head()

Unnamed: 0,MajorText,MinorText,LookUp_BoroughName,201812,201901,201902,201903,201904,201905,201906,...,202002,202003,202004,202005,202006,202007,202008,202009,202010,202011
0,Arson and Criminal Damage,Arson,Barking and Dagenham,1,5,2,5,5,11,3,...,5,6,2,2,4,4,6,2,7,4
1,Arson and Criminal Damage,Criminal Damage,Barking and Dagenham,88,97,127,138,130,140,113,...,103,107,80,86,121,122,114,116,119,100
2,Burglary,Burglary - Business and Community,Barking and Dagenham,33,45,24,29,27,21,27,...,17,28,29,16,16,28,24,32,21,19
3,Burglary,Burglary - Residential,Barking and Dagenham,164,114,107,99,96,114,96,...,123,97,57,42,63,72,63,54,67,90
4,Drug Offences,Drug Trafficking,Barking and Dagenham,4,6,2,6,5,9,6,...,6,6,15,13,12,21,9,11,14,17


In [186]:
crime_df['Total_Crime'] = crime_df.sum(axis=1)

In [187]:
crime_df = crime_df[['MajorText', 'LookUp_BoroughName', 'Total_Crime']]
crime_df.columns = ['Crime', 'Borough', 'Total_Crime']
crime_df.head()

Unnamed: 0,Crime,Borough,Total_Crime
0,Arson and Criminal Damage,Barking and Dagenham,115
1,Arson and Criminal Damage,Barking and Dagenham,2687
2,Burglary,Barking and Dagenham,656
3,Burglary,Barking and Dagenham,2193
4,Drug Offences,Barking and Dagenham,228


In [188]:
type_crime_df = crime_df.groupby(['Borough', 'Crime']).sum()
type_crime_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Crime
Borough,Crime,Unnamed: 2_level_1
Barking and Dagenham,Arson and Criminal Damage,2802
Barking and Dagenham,Burglary,2849
Barking and Dagenham,Drug Offences,2768
Barking and Dagenham,Miscellaneous Crimes Against Society,633
Barking and Dagenham,Possession of Weapons,353


In [189]:
total_crime_df = crime_df.groupby('Borough').sum()
total_crime_df.head()

Unnamed: 0_level_0,Total_Crime
Borough,Unnamed: 1_level_1
Barking and Dagenham,39456
Barnet,59167
Bexley,33929
Brent,59257
Bromley,47499


In [250]:
url = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
wikitable = "wikitable sortable jquery-tablesorter"
response = requests.get(url)
print(response.status_code)

200


In [278]:
soup = BeautifulSoup(response.text, 'html.parser')
london, city = soup.findAll('table', {'class': "wikitable"})

london_df = pd.read_html(str(london))
london_df = pd.DataFrame(london_df[0])
city_df = pd.read_html(str(city))
city_df = pd.DataFrame(city_df[0])
london_df.columns

Index(['Borough', 'Inner', 'Status', 'Local authority', 'Political control',
       'Headquarters', 'Area (sq mi)', 'Population (2019 est)[1]',
       'Co-ordinates', 'Nr. in map'],
      dtype='object')

In [279]:
city_df.columns

Index(['Borough', 'Inner', 'Status', 'Local authority', 'Political control',
       'Headquarters', 'Area (sq mi)', 'Population(2019 est)', 'Co-ordinates',
       'Nr. inmap'],
      dtype='object')

In [280]:
london_df = london_df[['Borough','Population (2019 est)[1]', 'Co-ordinates']]
london_df.columns = ['Borough','Population(2019 est)', 'Co-ordinates']
london_df.head()

Unnamed: 0,Borough,Population(2019 est),Co-ordinates
0,Barking and Dagenham [note 1],212906,".mw-parser-output .geo-default,.mw-parser-outp..."
1,Barnet,395896,51°37′31″N 0°09′06″W﻿ / ﻿51.6252°N 0.1517°W
2,Bexley,248287,51°27′18″N 0°09′02″E﻿ / ﻿51.4549°N 0.1505°E
3,Brent,329771,51°33′32″N 0°16′54″W﻿ / ﻿51.5588°N 0.2817°W
4,Bromley,332336,51°24′14″N 0°01′11″E﻿ / ﻿51.4039°N 0.0198°E


In [281]:
city_df = city_df[['Borough', 'Population(2019 est)', 'Co-ordinates']]

In [282]:
london_df = pd.concat([london_df, city_df])

In [283]:
london_df.reset_index(drop=True, inplace=True)
london_df.tail()

Unnamed: 0,Borough,Population(2019 est),Co-ordinates
28,Tower Hamlets,324745,51°30′36″N 0°00′21″W﻿ / ﻿51.5099°N 0.0059°W
29,Waltham Forest,276983,51°35′27″N 0°00′48″W﻿ / ﻿51.5908°N 0.0134°W
30,Wandsworth,329677,51°27′24″N 0°11′28″W﻿ / ﻿51.4567°N 0.1910°W
31,Westminster,261317,51°29′50″N 0°08′14″W﻿ / ﻿51.4973°N 0.1372°W
32,City of London,9721,51°30′56″N 0°05′32″W﻿ / ﻿51.5155°N 0.0922°W


In [284]:
london_df.head()


Unnamed: 0,Borough,Population(2019 est),Co-ordinates
0,Barking and Dagenham [note 1],212906,".mw-parser-output .geo-default,.mw-parser-outp..."
1,Barnet,395896,51°37′31″N 0°09′06″W﻿ / ﻿51.6252°N 0.1517°W
2,Bexley,248287,51°27′18″N 0°09′02″E﻿ / ﻿51.4549°N 0.1505°E
3,Brent,329771,51°33′32″N 0°16′54″W﻿ / ﻿51.5588°N 0.2817°W
4,Bromley,332336,51°24′14″N 0°01′11″E﻿ / ﻿51.4039°N 0.0198°E


sigh, regex here we come

In [285]:
london_df.Borough = london_df['Borough'].replace(to_replace =" \[note [1-9]\]", value = '', regex = True)
london_df.head()

Unnamed: 0,Borough,Population(2019 est),Co-ordinates
0,Barking and Dagenham,212906,".mw-parser-output .geo-default,.mw-parser-outp..."
1,Barnet,395896,51°37′31″N 0°09′06″W﻿ / ﻿51.6252°N 0.1517°W
2,Bexley,248287,51°27′18″N 0°09′02″E﻿ / ﻿51.4549°N 0.1505°E
3,Brent,329771,51°33′32″N 0°16′54″W﻿ / ﻿51.5588°N 0.2817°W
4,Bromley,332336,51°24′14″N 0°01′11″E﻿ / ﻿51.4039°N 0.0198°E


In [286]:
spl = u'\ufeff'
spl = f"/ {spl}"
london_df[['junk', 'coords']] = london_df['Co-ordinates'].str.split(spl, expand=True)
london_df.head()

Unnamed: 0,Borough,Population(2019 est),Co-ordinates,junk,coords
0,Barking and Dagenham,212906,".mw-parser-output .geo-default,.mw-parser-outp...",".mw-parser-output .geo-default,.mw-parser-outp...",51.5607°N 0.1557°E
1,Barnet,395896,51°37′31″N 0°09′06″W﻿ / ﻿51.6252°N 0.1517°W,51°37′31″N 0°09′06″W﻿,51.6252°N 0.1517°W
2,Bexley,248287,51°27′18″N 0°09′02″E﻿ / ﻿51.4549°N 0.1505°E,51°27′18″N 0°09′02″E﻿,51.4549°N 0.1505°E
3,Brent,329771,51°33′32″N 0°16′54″W﻿ / ﻿51.5588°N 0.2817°W,51°33′32″N 0°16′54″W﻿,51.5588°N 0.2817°W
4,Bromley,332336,51°24′14″N 0°01′11″E﻿ / ﻿51.4039°N 0.0198°E,51°24′14″N 0°01′11″E﻿,51.4039°N 0.0198°E


In [293]:
london_df.drop(['Co-ordinates', 'junk'], axis=1, inplace=True)
london_df.head()

Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557°E
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,0.1517°W
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505°E
3,Brent,329771,51.5588°N 0.2817°W,51.5588,0.2817°W
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198°E


In [294]:
london_df[['latitude', 'longitude']] = london_df['coords'].str.split("°N ", expand=True)
london_df.head()

Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557°E
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,0.1517°W
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505°E
3,Brent,329771,51.5588°N 0.2817°W,51.5588,0.2817°W
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198°E


In [295]:
# W is negative

london_west = london_df.loc[london_df.longitude.str.contains('W')]
london_west.loc[:, 'longitude'] = london_west.loc[:, 'longitude'].replace(to_replace='°W', value='', regex=True)
london_west.loc[:, 'longitude'] = london_west.loc[:, 'longitude'].astype(float)
london_west.loc[:, 'longitude'] = london_west.loc[:, 'longitude'] * -1

london_west.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,-0.1517
3,Brent,329771,51.5588°N 0.2817°W,51.5588,-0.2817
5,Camden,270029,51.5290°N 0.1255°W,51.529,-0.1255
6,Croydon,386710,51.3714°N 0.0977°W,51.3714,-0.0977
7,Ealing,341806,51.5130°N 0.3089°W,51.513,-0.3089


In [296]:
london_east = london_df.loc[london_df.longitude.str.contains('E')]
london_east.loc[:, 'longitude'] = london_east.loc[:, 'longitude'].replace(to_replace='°E', value='', regex=True)
london_east.loc[:, 'longitude'] = london_east.loc[:, 'longitude'].replace(to_replace=u'\ufeff', value='', regex=True)
london_east.loc[:, 'longitude'] = london_east.loc[:, 'longitude'].astype(float)
london_east.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198
9,Greenwich,287942,51.4892°N 0.0648°E,51.4892,0.0648
14,Havering,259552,51.5812°N 0.1837°E,51.5812,0.1837


In [297]:
london_df = pd.concat([london_west, london_east])
london_df.sort_values(by='Borough', ascending=True, axis=0, inplace=True)
london_df.reset_index(drop=True, inplace=True)

london_df.head()

Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,-0.1517
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505
3,Brent,329771,51.5588°N 0.2817°W,51.5588,-0.2817
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198


In [298]:
london_crimes = pd.merge(london_df, total_crime_df, on='Borough')
london_crimes.head()

Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude,Total_Crime
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557,39456
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,-0.1517,59167
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505,33929
3,Brent,329771,51.5588°N 0.2817°W,51.5588,-0.2817,59257
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198,47499


In [299]:
# crime rate = (total crimes / population) * 100000

london_crimes['Crime_Rate'] = round((london_crimes['Total_Crime'] / london_crimes['Population(2019 est)']) * 100000, 2)
london_crimes.head()

Unnamed: 0,Borough,Population(2019 est),coords,latitude,longitude,Total_Crime,Crime_Rate
0,Barking and Dagenham,212906,51.5607°N 0.1557°E,51.5607,0.1557,39456,18532.12
1,Barnet,395896,51.6252°N 0.1517°W,51.6252,-0.1517,59167,14945.09
2,Bexley,248287,51.4549°N 0.1505°E,51.4549,0.1505,33929,13665.23
3,Brent,329771,51.5588°N 0.2817°W,51.5588,-0.2817,59257,17969.14
4,Bromley,332336,51.4039°N 0.0198°E,51.4039,0.0198,47499,14292.46


let's try and get some places then

In [362]:
def get_venues(boroughs, lats, lngs, cat):
    radius = 500

    venues_list=[]
    for borough, lat, lng in zip(boroughs, lats, lngs):
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT,
            cat
            )

        results = requests.get(url).json()["response"]['venues']

        venues_list.append([(
            borough,
            lat,
            lng,
            v['name'],
            v['location']['lat'],
            v['location']['lng'],
            v['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough',
                             'Latitude',
                             'Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']

    return nearby_venues

In [363]:
cat_art = '4d4b7104d754a06370d81259'  # Arts & Entertainment
cat_out = '4d4b7105d754a06377d81259' # outdoors recreation
cat_study = '4d4b7105d754a06372d81259,4bf58dd8d48988d12f941735' # college and university, libraries
cat_social = '52e81612bcbc57f1066b7a34,52e81612bcbc57f1066b7a33,4bf58dd8d48988d131941735'  # community centre, social club, religious centres

bor = london_crimes.Borough
lat = london_crimes.latitude
lng = london_crimes.longitude

arts = get_venues(bor, lat, lng, cat_art)
outs = get_venues(bor, lat, lng, cat_out)
studs = get_venues(bor, lat, lng, cat_study)
socials = get_venues(bor, lat, lng, cat_social)

In [364]:
arts.head()

Unnamed: 0,Borough,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barking and Dagenham,51.5607,0.1557,Jolly Jungle,51.561132,0.150931,General Entertainment
1,Barnet,51.6252,-0.1517,The Emerald Suite,51.62706,-0.152635,General Entertainment
2,Barnet,51.6252,-0.1517,A&C 11,51.622103,-0.15274,Art Gallery
3,Bexley,51.4549,0.1505,Buzz Bingo,51.456011,0.150528,General Entertainment
4,Bexley,51.4549,0.1505,Cineworld,51.455847,0.150358,Multiplex


In [365]:
outs.head()

Unnamed: 0,Borough,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barking and Dagenham,51.5607,0.1557,Jolly Jungle,51.561132,0.150931,General Entertainment
1,Barnet,51.6252,-0.1517,The Emerald Suite,51.62706,-0.152635,General Entertainment
2,Barnet,51.6252,-0.1517,A&C 11,51.622103,-0.15274,Art Gallery
3,Bexley,51.4549,0.1505,Buzz Bingo,51.456011,0.150528,General Entertainment
4,Bexley,51.4549,0.1505,Cineworld,51.455847,0.150358,Multiplex


In [366]:
studs.head()

Unnamed: 0,Borough,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barking and Dagenham,51.5607,0.1557,Cu London University,51.559606,0.154491,University
1,Barking and Dagenham,51.5607,0.1557,William bellamy school,51.558514,0.155535,General College & University
2,Bexley,51.4549,0.1505,Central Library,51.455971,0.144016,Library
3,Brent,51.5588,-0.2817,Wembley Library,51.558666,-0.280904,Library
4,Brent,51.5588,-0.2817,"iQ Raffles House, London",51.557503,-0.283815,College Residence Hall
