# Converting address to latitude and longitude

In [1]:
import pandas as pd
import requests
import geocoder

In [2]:
# import dataset
crime = pd.read_csv(r'C:\Users\Elena\Desktop\Spring 19\Capstone\Crime_Data.csv')
print(crime.shape)
crime.head()

(30347, 8)


Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported
0,1,Burglary,201000073238,1700.0,ALLIED LN,CPD,2017-09-22T16:00:00.000Z,1200
1,2,Drug/Narcotics Violation,201300002276,1200.0,LANDONIA CIR,CPD,2013-12-13T16:03:00.000Z,1103
2,3,Drug/Narcotics Violation,201300002481,500.0,WATER ST,CPD,2014-01-02T19:00:00.000Z,1400
3,4,Drug/Narcotics Violation,201300003848,100.0,DICE ST,CPD,2014-02-18T23:30:00.000Z,1830
4,5,Drug/Narcotics Violation,201300004323,1800.0,HYDRAULIC RD,CPD,2013-12-27T19:00:00.000Z,1400


In [3]:
# count number of missing values
crime.isna().sum()

RecordID          0
Offense           0
IncidentID        0
BlockNumber     406
StreetName        2
Agency            0
DateReported      0
HourReported      0
dtype: int64

In [4]:
# distribution of block numbers
crime.BlockNumber.unique()

array([1700., 1200.,  500.,  100., 1800., 2100., 1100.,  700.,  600.,
         nan,  800., 1400.,  400.,  900., 2300., 2000., 1000., 1500.,
        300.,  200., 1600., 2400., 1300., 2500., 1900., 2200., 2700.,
       2600., 4000., 6200., 3000., 2800., 9100., 5300., 3100., 8000.,
       3400., 7200., 2900., 4400., 3300.,    0., 3800.])

In [5]:
crime_med = crime.groupby('StreetName', as_index = False)['BlockNumber'].median()
crime_med.head()

Unnamed: 0,StreetName,BlockNumber
0,10 1/2 ST NW,300.0
1,10TH ST,400.0
2,10TH ST NE,300.0
3,10TH ST NW,300.0
4,10TH ST NW / ANDERSON ST,


In [6]:
crime_count = crime.groupby('StreetName', as_index = False)['BlockNumber'].size()
crime_count.head()

StreetName
10 1/2 ST NW                 50
10TH ST                       4
10TH ST NE                   14
10TH ST NW                  167
10TH ST NW / ANDERSON ST      1
dtype: int64

In [7]:
crime_na = crime.StreetName[crime['BlockNumber'].isnull()]
crime_na.head()

11          6TH ST SE
173       WERTLAND ST
357    UNIVERSITY CIR
404     UNIVERSITY CT
538     UNIVERSITY CT
Name: StreetName, dtype: object

In [8]:
# if values are missing, fill them with empty 1
crime['BlockNumber'] = crime['BlockNumber'].fillna(0)
crime['BlockNumber'].isna().sum()

0

In [9]:
crime['BlockNumber'] = pd.to_numeric(crime['BlockNumber']).astype(int)
crime['BlockNumber'].head(15)

0     1700
1     1200
2      500
3      100
4     1800
5      100
6     2100
7      100
8     1100
9      700
10     600
11       0
12     700
13     700
14     800
Name: BlockNumber, dtype: int32

In [10]:
crime.head()

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported
0,1,Burglary,201000073238,1700,ALLIED LN,CPD,2017-09-22T16:00:00.000Z,1200
1,2,Drug/Narcotics Violation,201300002276,1200,LANDONIA CIR,CPD,2013-12-13T16:03:00.000Z,1103
2,3,Drug/Narcotics Violation,201300002481,500,WATER ST,CPD,2014-01-02T19:00:00.000Z,1400
3,4,Drug/Narcotics Violation,201300003848,100,DICE ST,CPD,2014-02-18T23:30:00.000Z,1830
4,5,Drug/Narcotics Violation,201300004323,1800,HYDRAULIC RD,CPD,2013-12-27T19:00:00.000Z,1400


In [11]:
crime['Address'] = crime.BlockNumber.map(str) + " " + crime.StreetName + ", Charlottesville VA"
crime.head()

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,Address
0,1,Burglary,201000073238,1700,ALLIED LN,CPD,2017-09-22T16:00:00.000Z,1200,"1700 ALLIED LN, Charlottesville VA"
1,2,Drug/Narcotics Violation,201300002276,1200,LANDONIA CIR,CPD,2013-12-13T16:03:00.000Z,1103,"1200 LANDONIA CIR, Charlottesville VA"
2,3,Drug/Narcotics Violation,201300002481,500,WATER ST,CPD,2014-01-02T19:00:00.000Z,1400,"500 WATER ST, Charlottesville VA"
3,4,Drug/Narcotics Violation,201300003848,100,DICE ST,CPD,2014-02-18T23:30:00.000Z,1830,"100 DICE ST, Charlottesville VA"
4,5,Drug/Narcotics Violation,201300004323,1800,HYDRAULIC RD,CPD,2013-12-27T19:00:00.000Z,1400,"1800 HYDRAULIC RD, Charlottesville VA"


In [276]:
url = pd.Series([])
for i in range(len(crime)):
    url[i] = "http://www.datasciencetoolkit.org/street2coordinates/" + str(crime['Address'].iloc[i])
url[1]

'http://www.datasciencetoolkit.org/street2coordinates/1200 LANDONIA CIR, Charlottesville VA'

In [277]:
r = pd.Series([])
for i in range(len(url)):
    r[i] = requests.get(url[i])
r[1].content

b'{\n  "1200 LANDONIA CIR, Charlottesville VA": {\n    "country_code3": "USA",\n    "latitude": 38.036315,\n    "country_name": "United States",\n    "longitude": -78.462432,\n    "street_address": "1200 Landonia Cir",\n    "region": "VA",\n    "confidence": 0.902,\n    "street_number": "1200",\n    "locality": "Charlottesville",\n    "street_name": "Landonia Cir",\n    "fips_county": "51540",\n    "country_code": "US"\n  }\n}'

In [282]:
re.search(r".*latitude\": *(.*?) *,\n*", str(r[10].content)).group(1)

'38.034974'

In [284]:
lat = pd.Series([])
lng = pd.Series([])

for i in range(len(r)):
    x = re.search(r".*latitude\": *(.*?) *,\n*", str(r[i].content))
    if x is not None:
        lat[i] = x.group(1)
    else:
        lat[i] = 0
    
    y = re.search(r".*longitude\": *(.*?) *,\n*", str(r[i].content))
    if y is not None:
        lng[i] = y.group(1)
    else:
        lng[i] = 0 
      
print(lat[1])
print(lng[1])

38.036315
-78.462432


In [287]:
crime['Latitude'] = lat
crime['Longitude'] = lng
crime.head()

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,Address,Latitude,Longitude
0,1,Burglary,201000073238,1700,ALLIED LN,CPD,2017-09-22T16:00:00.000Z,1200,"1700 ALLIED LN, Charlottesville VA",38.041381,-78.479806
1,2,Drug/Narcotics Violation,201300002276,1200,LANDONIA CIR,CPD,2013-12-13T16:03:00.000Z,1103,"1200 LANDONIA CIR, Charlottesville VA",38.036315,-78.462432
2,3,Drug/Narcotics Violation,201300002481,500,WATER ST,CPD,2014-01-02T19:00:00.000Z,1400,"500 WATER ST, Charlottesville VA",38.029081,-78.478864
3,4,Drug/Narcotics Violation,201300003848,100,DICE ST,CPD,2014-02-18T23:30:00.000Z,1830,"100 DICE ST, Charlottesville VA",38.027426,-78.483989
4,5,Drug/Narcotics Violation,201300004323,1800,HYDRAULIC RD,CPD,2013-12-27T19:00:00.000Z,1400,"1800 HYDRAULIC RD, Charlottesville VA",38.060175,-78.490386


In [289]:
crime.to_csv('crime_ll.csv', sep=',')

In [24]:
# import sys
# import time
# import urllib.request
# import re

In [195]:
# def addresstoll(df):
    
#     # Create empty lists for lats and lngs
#     lat_list = pd.Series([])
#     lng_list = pd.Series([])
   
#     # Create opener
#     opener = urllib.request.FancyURLopener({})
    
#     # Iterate over all rows of dataframe
#     for i in range(len(df)):
        
#         # Get latitude and longitude
#         block = df['BlockNumber'][i]
#         street = df['StreetName'][i]
        
#         # Create url from lat/long
#         url = 'http://www.datasciencetoolkit.org/street2coordinates/'\
#         + str(block) + '+' + str(street) + '+' + 'Charlottesville+VA'
        
#         # Store webpage contents
#         f = opener.open(url)
#         content = f.read()
        
#         # Get lat and long out of content string
#         lat = re.search(r".*latitude\": *(.*?) *,\n*", str(content)).group(1)
#         lng = re.search(r".*longitude\": *(.*?) *,\n*", str(content)).group(1)
        
#         # Add to the lists
#         lat_list[i] = lat
#         lng_list[i] = lng
               
#     return lat_list, lng_list

# #       sys.stdout.write('\r' + str(lat) +', '+ str(lng))
# #       time.sleep(0.1)

In [203]:
# crime_lats, crime_lngs = addresstoll(crime[:5])
# crime['Lats'] = crime_lats
# crime['Lngs'] = crime_lngs

  


In [None]:
# crime_lats, crime_lngs = addresstoll(crime[:5])
# crime['Lats'] = crime_lats
# crime['Lngs'] = crime_lngs

In [274]:
# crime.head()

Unnamed: 0,RecordID,Offense,IncidentID,BlockNumber,StreetName,Agency,DateReported,HourReported,Address,Lats,Lngs
0,1,Burglary,201000073238,1700,ALLIED LN,CPD,2017-09-22T16:00:00.000Z,1200,"1700 ALLIED LN, Charlottesville VA",38.041381,-78.479806
1,2,Drug/Narcotics Violation,201300002276,1200,LANDONIA CIR,CPD,2013-12-13T16:03:00.000Z,1103,"1200 LANDONIA CIR, Charlottesville VA",38.036315,-78.462432
2,3,Drug/Narcotics Violation,201300002481,500,WATER ST,CPD,2014-01-02T19:00:00.000Z,1400,"500 WATER ST, Charlottesville VA",38.029081,-78.478864
3,4,Drug/Narcotics Violation,201300003848,100,DICE ST,CPD,2014-02-18T23:30:00.000Z,1830,"100 DICE ST, Charlottesville VA",38.027426,-78.483989
4,5,Drug/Narcotics Violation,201300004323,1800,HYDRAULIC RD,CPD,2013-12-27T19:00:00.000Z,1400,"1800 HYDRAULIC RD, Charlottesville VA",38.060175,-78.490386
