In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import requests
import pymongo
from datetime import datetime
import ast

In [2]:
file = "data/Chicago/Chicago_CRIME_Data.csv"
data1 = pd.read_csv(file)

In [3]:
# Column Names from csv
# ----------------------------
# ID	Case Number	Date	Block	IUCR	Primary Type	Description	Location Description	
# Arrest	Domestic	Beat	District	Ward	Community Area	FBI Code	X Coordinate	
# Y Coordinate	Year	Updated On	Latitude	Longitude	Location	Historical Wards 2003-2015	
# Zip Codes	Community Areas	Census Tracts	Wards	Boundaries - ZIP Codes	Police Districts	Police Beats

data = data1[["ID", "IUCR", "Block", "Date", "Primary Type", "Latitude", "Longitude"]].copy()
data["city"] = "chicago"
# data.rename(columns={'Latitude':'latitude', 'Longitude':'longitude'}, inplace=True)


In [4]:
dates = []
years = []

for date in data["Date"]:
    date = datetime.strptime(date, "%m/%d/%Y %H:%M")
    dates.append(date)
    years.append(date.year)
    
data["date"] = dates
data["years"] = years

data = data.loc[(data["years"] >= 2014) & (data["years"] <= 2018)]
data

Unnamed: 0,ID,IUCR,Block,Date,Primary Type,Latitude,Longitude,city,date,years
170220,11556487,1320,112XX S SACRAMENTO AVE,12/31/2018 23:59,CRIMINAL DAMAGE,41.689079,-87.696064,chicago,2018-12-31 23:59:00,2018
170221,11561837,1153,013XX W 72ND ST,12/31/2018 23:59,DECEPTIVE PRACTICE,41.763181,-87.657709,chicago,2018-12-31 23:59:00,2018
170222,11552699,1310,084XX S SANGAMON ST,12/31/2018 23:57,CRIMINAL DAMAGE,41.740521,-87.647391,chicago,2018-12-31 23:57:00,2018
170223,11552724,440,018XX S ALLPORT ST,12/31/2018 23:56,BATTERY,41.857068,-87.657625,chicago,2018-12-31 23:56:00,2018
170224,11552731,486,078XX S SANGAMON ST,12/31/2018 23:55,BATTERY,41.751914,-87.647717,chicago,2018-12-31 23:55:00,2018
170225,11552715,041A,052XX W GLADYS AVE,12/31/2018 23:49,BATTERY,41.875684,-87.760479,chicago,2018-12-31 23:49:00,2018
170226,11552741,486,079XX S LAFLIN ST,12/31/2018 23:48,BATTERY,41.750154,-87.661009,chicago,2018-12-31 23:48:00,2018
170227,11552602,460,018XX W BELMONT AVE,12/31/2018 23:47,BATTERY,41.939625,-87.673996,chicago,2018-12-31 23:47:00,2018
170228,11553488,890,032XX N SHEFFIELD AVE,12/31/2018 23:45,THEFT,41.940519,-87.654124,chicago,2018-12-31 23:45:00,2018
170229,11554852,1310,032XX W EVERGREEN AVE,12/31/2018 23:45,CRIMINAL DAMAGE,41.905562,-87.707589,chicago,2018-12-31 23:45:00,2018




----

# Data Munging

In [5]:
# Addressing address numbers with 'XX' or 'X' (e.g. 004XX W MADISON ST) by setting constant since the 
# addresses refer to city blocks usually <0.2 mile distance between 00-99

data["Block"] = [x.replace('XX', '00') for x in data["Block"]]
data["Block"] = [x.replace('00X', '001') for x in data["Block"]]
data["Block"] = [(str(x)+" Chicago") for x in data["Block"]]
data


Unnamed: 0,ID,IUCR,Block,Date,Primary Type,Latitude,Longitude,city,date,years
170220,11556487,1320,11200 S SACRAMENTO AVE Chicago,12/31/2018 23:59,CRIMINAL DAMAGE,41.689079,-87.696064,chicago,2018-12-31 23:59:00,2018
170221,11561837,1153,01300 W 72ND ST Chicago,12/31/2018 23:59,DECEPTIVE PRACTICE,41.763181,-87.657709,chicago,2018-12-31 23:59:00,2018
170222,11552699,1310,08400 S SANGAMON ST Chicago,12/31/2018 23:57,CRIMINAL DAMAGE,41.740521,-87.647391,chicago,2018-12-31 23:57:00,2018
170223,11552724,440,01800 S ALLPORT ST Chicago,12/31/2018 23:56,BATTERY,41.857068,-87.657625,chicago,2018-12-31 23:56:00,2018
170224,11552731,486,07800 S SANGAMON ST Chicago,12/31/2018 23:55,BATTERY,41.751914,-87.647717,chicago,2018-12-31 23:55:00,2018
170225,11552715,041A,05200 W GLADYS AVE Chicago,12/31/2018 23:49,BATTERY,41.875684,-87.760479,chicago,2018-12-31 23:49:00,2018
170226,11552741,486,07900 S LAFLIN ST Chicago,12/31/2018 23:48,BATTERY,41.750154,-87.661009,chicago,2018-12-31 23:48:00,2018
170227,11552602,460,01800 W BELMONT AVE Chicago,12/31/2018 23:47,BATTERY,41.939625,-87.673996,chicago,2018-12-31 23:47:00,2018
170228,11553488,890,03200 N SHEFFIELD AVE Chicago,12/31/2018 23:45,THEFT,41.940519,-87.654124,chicago,2018-12-31 23:45:00,2018
170229,11554852,1310,03200 W EVERGREEN AVE Chicago,12/31/2018 23:45,CRIMINAL DAMAGE,41.905562,-87.707589,chicago,2018-12-31 23:45:00,2018


In [6]:
# NA NAN Checking
num_NA = data.isna().sum()
print(num_NA)
print("\nPercentage of NA in whole data: \n" + str(num_NA/data["ID"].count()))

ID                  0
IUCR                0
Block               0
Date                0
Primary Type        0
Latitude        11805
Longitude       11805
city                0
date                0
years               0
dtype: int64

Percentage of NA in whole data: 
ID              0.00000
IUCR            0.00000
Block           0.00000
Date            0.00000
Primary Type    0.00000
Latitude        0.01344
Longitude       0.01344
city            0.00000
date            0.00000
years           0.00000
dtype: float64


In [7]:
# Identify rows with Nan in lat/long
data_NA = data.loc[data["Latitude"].isna()]
data_NA

Unnamed: 0,ID,IUCR,Block,Date,Primary Type,Latitude,Longitude,city,date,years
170289,11584102,1210,03100 W 59TH ST Chicago,12/31/2018 22:00,DECEPTIVE PRACTICE,,,chicago,2018-12-31 22:00:00,2018
170329,11641169,890,00800 N TRIPP AVE Chicago,12/31/2018 21:00,THEFT,,,chicago,2018-12-31 21:00:00,2018
170362,11579842,810,03000 E 88TH ST Chicago,12/31/2018 20:00,THEFT,,,chicago,2018-12-31 20:00:00,2018
170425,11578779,560,07500 S PAULINA ST Chicago,12/31/2018 18:00,ASSAULT,,,chicago,2018-12-31 18:00:00,2018
170427,11587591,2825,05000 N MANGO AVE Chicago,12/31/2018 18:00,OTHER OFFENSE,,,chicago,2018-12-31 18:00:00,2018
170429,11583288,1752,04300 N MOBILE AVE Chicago,12/31/2018 18:00,OFFENSE INVOLVING CHILDREN,,,chicago,2018-12-31 18:00:00,2018
170452,11585658,1150,07900 S DAMEN AVE Chicago,12/31/2018 17:27,DECEPTIVE PRACTICE,,,chicago,2018-12-31 17:27:00,2018
170526,11576815,2826,11700 S CHURCH ST Chicago,12/31/2018 15:00,OTHER OFFENSE,,,chicago,2018-12-31 15:00:00,2018
170528,11590516,1152,09800 S WINSTON AVE Chicago,12/31/2018 15:00,DECEPTIVE PRACTICE,,,chicago,2018-12-31 15:00:00,2018
170548,11567378,820,03500 S COTTAGE GROVE AVE Chicago,12/31/2018 14:42,THEFT,,,chicago,2018-12-31 14:42:00,2018


In [18]:
# Utilize GeoPy to geolocate addresses and provide lat/long values which were NaN
lat = []
long = []
not_found = True

geolocator = Nominatim(user_agent="jjree")
df = data_NA.loc[np.isnan(data["Latitude"])]["Block"]
for d in df:
#     print(d)
    try:
        location = geolocator.geocode(d)
        lat.append(location.latitude)
        long.append(location.longitude)
    except Exception:
        print(d+"-> Problem at this address1")
        print()
        new_add = d
        while(not_found):
            new_add = (str(int(new_add.split()[0])+1)) + str(new_add.split()[1:])
            print(new_add)
            try:
                location = geolocator.geocode(new_add)
                lat.append(location.latitude)
                long.append(location.longitude)
                not_found = False
                pass
            except Exception:
                print(new_add+" Problem at this address2")
                pass
            
        
#         lat.append("NaN")
#         long.append("NaN")

data_NA["latitude"] = lat
data_NA["longitude"] = long
# data_NA["latitude"] = [geolocator.geocode(x).latitude for x in data_NA.loc[np.isnan(data["Latitude"])]["Block"]]
data_NA

07500 S PAULINA ST Chicago-> Problem at this address1

7501['S', 'PAULINA', 'ST', 'Chicago']
7501['S', 'PAULINA', 'ST', 'Chicago'] Problem at this address2


ValueError: invalid literal for int() with base 10: "7501['S',"

In [None]:
for index, l in enumerate(data_NA["Latitude"]):
    if (np.isnan(l)):
        print(data_NA[index:index+1]["Block"])
        loc = data_NA[index:index+1]["Block"]
        location = geolocator.geocode(loc)
#         print(location)
#         print(location.address, location.latitude, location.longitude)
        lat.append(location.latitude)
        long.append(location.longitude)
    else:
        lat.append(add["Latitude"])
        long.append(add["Longitude"])

data_NA["latitude"] = lat
data_NA["longitude"] = long

data_NA

# 41.8662808,-87.722445

In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.geocode("175 5th Avenue NYC")
print(location.address)
Flatiron Building, 175, 5th Avenue, Flatiron, New York, NYC, New York, ...
print((location.latitude, location.longitude))
(40.7410861, -73.9896297241625)
print(location.raw)
{'place_id': '9167009604', 'type': 'attraction', ...}

In [None]:
dates = []
years = []

for date in data["Date"]:
    date = datetime.strptime(date, "%m/%d/%Y %H:%M")
    dates.append(date)
    years.append(date.year)
    
data["date"] = dates
data["years"] = years

data = data.loc[(data["years"] >= 2014) & (data["years"] <= 2018)]
data

In [None]:
# data_2014 = pd.DataFrame()
# data_2015 = pd.DataFrame()
# data_2016 = pd.DataFrame()
# data_2017 = pd.DataFrame()
# data_2018 = pd.DataFrame()

# data_2014 = data.loc[data["years"] == 2014]
# print(data_2014)
# data_2015 = data.loc[data["years"] == 2015]
# print(data_2015)
# data_2016 = data.loc[data["years"] == 2016]
# print(data_2016)
# data_2017 = data.loc[data["years"] == 2017]
# print(data_2017)
# data_2018 = data.loc[data["years"] == 2018]
# data_2018



In [None]:
# def reformat_datetime(df):
#     datetimes = []
#     for d,t  in zip(df["date"], df["Updated On"]):
# #         print(d)
# #         print(type(d))
#     #     datetime = (datetime.strftime(d, format='%Y-%m-%d') + " " + str(t))
#         d = datetime.strptime(str(d), "%Y-%m-%d %H:%M:%S")
#         date_time = datetime.strftime(d, format='%Y-%m-%d') + "T" + str((int(t/100))) + ":00:00"
#     #     datetime = d + " " + t
#         datetimes.append(date_time)
#     df["date_time"] = datetimes
#     return df

# def reformat_datetime(df):
#     datetimes = []
#     for d in df["date"]:
#         date_time = datetime.strftime(d, format='%Y-%m-%dT%H:00:00')
#         datetimes.append(date_time)
#     df["date_time"] = datetimes
#     return df

def reformat_datetime(df):
    dates = []
    times = []
    for d in df["date"]:
        dy = datetime.strftime(d, format='%Y-%m-%d')
        ti = datetime.strftime(d, format='%H')
        dates.append(dy)
        times.append(ti)
    df["date"] = dates
    df["time"] = times
    return df

In [None]:
data_chicago = reformat_datetime(data)
data_chicago




----

# Final view of date_time into weather data datetime format

In [None]:
data_chicago.rename(columns={"IUCR": "code"}, inplace=True)
data_chicago = data_chicago[["city", "code", "date", "time", "latitude", "longitude"]].copy()
data_chicago.to_csv (r'data/Chicago/data_chicago.csv', index = None, header=True)

data_chicago

In [None]:
# df_years = [data_2014, data_2015, data_2016, data_2017, data_2018]

# for i, yr in enumerate(df_years):
#     df = reformat_datetime(yr)
#     if i == 0:
#         data_2014_1 = df.copy()
#     elif i ==1:
#         data_2015_1 = df.copy()
#     elif i ==2:
#         data_2016_1 = df.copy()
#     elif i ==3:
#         data_2017_1 = df.copy()
#     elif i ==4:
#         data_2018_1 = df.copy()





----

# Final view of date_time into weather data datetime format

In [None]:
data_2014_1

In [None]:
data_2015_1

In [None]:
data_2016_1

In [None]:
data_2017_1

In [None]:
data_2018_1

In [None]:
# .to_csv(index=False)
data_2014_1.to_csv (r'data/Chicago/data_2014.csv', index = None, header=True)
data_2015_1.to_csv (r'data/Chicago/data_2015.csv', index = None, header=True)
data_2016_1.to_csv (r'data/Chicago/data_2016.csv', index = None, header=True)
data_2017_1.to_csv (r'data/Chicago/data_2017.csv', index = None, header=True)
data_2018_1.to_csv (r'data/Chicago/data_2018.csv', index = None, header=True)