In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import requests
import pymongo
from datetime import datetime
import ast

In [2]:
file = "data/Boston/Boston_CRIME_Data.csv"
data1 = pd.read_csv(file)

In [3]:
# Column Names from csv
# ----------------------------
# INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	
# OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location

data = data1[["INCIDENT_NUMBER", "OFFENSE_CODE", "OCCURRED_ON_DATE", "OFFENSE_DESCRIPTION", "Lat", "Long"]].copy()
data.rename(columns={'Lat':'latitude', 'Long':'longitude'}, inplace=True)
data["city"] = "boston"
data

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OCCURRED_ON_DATE,OFFENSE_DESCRIPTION,latitude,longitude,city
0,I192070092,3115,2019-09-02 21:42:00,INVESTIGATE PERSON,42.311763,-71.063242,boston
1,I192070085,413,2019-09-02 21:36:00,ASSAULT - AGGRAVATED - BATTERY,42.286710,-71.108229,boston
2,I192070082,3006,2019-09-02 21:35:00,SICK/INJURED/MEDICAL - PERSON,,,boston
3,I192070081,2610,2019-09-02 21:30:00,TRESPASSING,42.356741,-71.057947,boston
4,I192070078,2905,2019-09-02 21:11:00,VAL - OPERATING WITHOUT LICENSE,42.320711,-71.086023,boston
5,I192070078,2900,2019-09-02 21:11:00,VAL - VIOLATION OF AUTO LAW - OTHER,42.320711,-71.086023,boston
6,I192070077,3018,2019-09-02 21:43:00,SICK/INJURED/MEDICAL - POLICE,,,boston
7,I192070075,3205,2019-09-02 21:39:00,M/V PLATES - LOST,42.353002,-71.045023,boston
8,I192070071,3831,2019-09-02 17:54:00,M/V - LEAVING SCENE - PROPERTY DAMAGE,42.316130,-71.091089,boston
9,I192070070,801,2019-09-02 21:03:00,ASSAULT - SIMPLE,42.311631,-71.069743,boston


In [5]:
dates = []
years = []

for date in data["OCCURRED_ON_DATE"]:
    date = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
    dates.append(date)
    years.append(date.year)
    
data["date"] = dates
data["years"] = years
data = data.loc[(data["years"] >= 2014) & (data["years"] <= 2018)]
data

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OCCURRED_ON_DATE,OFFENSE_DESCRIPTION,latitude,longitude,city,date,years
118,I192069941,1102,2016-06-21 12:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.339170,-71.103583,boston,2016-06-21 12:00:00,2016
119,I192069941,619,2016-06-21 12:00:00,LARCENY ALL OTHERS,42.339170,-71.103583,boston,2016-06-21 12:00:00,2016
341,I192069688,619,2017-12-10 20:00:00,LARCENY ALL OTHERS,42.311628,-71.080943,boston,2017-12-10 20:00:00,2017
342,I192069688,1102,2017-12-10 20:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.311628,-71.080943,boston,2017-12-10 20:00:00,2017
386,I192069629,619,2017-03-19 17:00:00,LARCENY ALL OTHERS,42.303441,-71.066746,boston,2017-03-19 17:00:00,2017
387,I192069629,1102,2017-03-19 17:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.303441,-71.066746,boston,2017-03-19 17:00:00,2017
534,I192069452,619,2017-09-12 15:00:00,LARCENY ALL OTHERS,42.326817,-71.071110,boston,2017-09-12 15:00:00,2017
834,I192069083,1107,2017-08-01 00:00:00,FRAUD - IMPERSONATION,42.336112,-71.046654,boston,2017-08-01 00:00:00,2017
970,I192068943,1109,2018-02-28 00:00:00,FRAUD - WIRE,42.370818,-71.039291,boston,2018-02-28 00:00:00,2018
1326,I192068538,1107,2018-01-01 00:00:00,FRAUD - IMPERSONATION,42.349780,-71.134230,boston,2018-01-01 00:00:00,2018




----

# Data Munging

In [6]:
# NA NAN Checking
num_NA = data.isna().sum()
print(num_NA)
print("\nPercentage of NA in whole data: \n" + str(num_NA/data["INCIDENT_NUMBER"].count()))
    

INCIDENT_NUMBER            0
OFFENSE_CODE               0
OCCURRED_ON_DATE           0
OFFENSE_DESCRIPTION        0
latitude               22431
longitude              22431
city                       0
date                       0
years                      0
dtype: int64

Percentage of NA in whole data: 
INCIDENT_NUMBER        0.000000
OFFENSE_CODE           0.000000
OCCURRED_ON_DATE       0.000000
OFFENSE_DESCRIPTION    0.000000
latitude               0.063515
longitude              0.063515
city                   0.000000
date                   0.000000
years                  0.000000
dtype: float64


In [7]:
# Identify rows with Nan in lat/long
data_NA = data.loc[data["latitude"].isna()]
data_NA

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OCCURRED_ON_DATE,OFFENSE_DESCRIPTION,latitude,longitude,city,date,years
2221,I192067544,1402,2018-08-23 20:00:00,VANDALISM,,,boston,2018-08-23 20:00:00,2018
7756,I192061504,3115,2018-06-05 00:00:00,INVESTIGATE PERSON,,,boston,2018-06-05 00:00:00,2018
9326,I192059833,1107,2018-08-21 00:00:00,FRAUD - IMPERSONATION,,,boston,2018-08-21 00:00:00,2018
26008,I192042185,1102,2018-08-03 06:15:00,FRAUD - FALSE PRETENSE / SCHEME,,,boston,2018-08-03 06:15:00,2018
34033,I192033783,619,2018-10-27 15:21:00,LARCENY ALL OTHERS,,,boston,2018-10-27 15:21:00,2018
34041,I192033774,3201,2018-05-07 15:02:00,PROPERTY - LOST,,,boston,2018-05-07 15:02:00,2018
37816,I192029756,3201,2016-04-23 11:24:00,PROPERTY - LOST,,,boston,2016-04-23 11:24:00,2016
42359,I192024916,1102,2017-01-17 19:00:00,FRAUD - FALSE PRETENSE / SCHEME,,,boston,2017-01-17 19:00:00,2017
43052,I192024207,619,2018-11-21 09:00:00,LARCENY ALL OTHERS,,,boston,2018-11-21 09:00:00,2018
45861,I192021161,3114,2018-11-17 17:00:00,INVESTIGATE PROPERTY,,,boston,2018-11-17 17:00:00,2018


In [8]:
# With 6.35% NaNs, remove rows with NaN in Latitude/Longitude
data = data.dropna()
data

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OCCURRED_ON_DATE,OFFENSE_DESCRIPTION,latitude,longitude,city,date,years
118,I192069941,1102,2016-06-21 12:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.339170,-71.103583,boston,2016-06-21 12:00:00,2016
119,I192069941,619,2016-06-21 12:00:00,LARCENY ALL OTHERS,42.339170,-71.103583,boston,2016-06-21 12:00:00,2016
341,I192069688,619,2017-12-10 20:00:00,LARCENY ALL OTHERS,42.311628,-71.080943,boston,2017-12-10 20:00:00,2017
342,I192069688,1102,2017-12-10 20:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.311628,-71.080943,boston,2017-12-10 20:00:00,2017
386,I192069629,619,2017-03-19 17:00:00,LARCENY ALL OTHERS,42.303441,-71.066746,boston,2017-03-19 17:00:00,2017
387,I192069629,1102,2017-03-19 17:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.303441,-71.066746,boston,2017-03-19 17:00:00,2017
534,I192069452,619,2017-09-12 15:00:00,LARCENY ALL OTHERS,42.326817,-71.071110,boston,2017-09-12 15:00:00,2017
834,I192069083,1107,2017-08-01 00:00:00,FRAUD - IMPERSONATION,42.336112,-71.046654,boston,2017-08-01 00:00:00,2017
970,I192068943,1109,2018-02-28 00:00:00,FRAUD - WIRE,42.370818,-71.039291,boston,2018-02-28 00:00:00,2018
1326,I192068538,1107,2018-01-01 00:00:00,FRAUD - IMPERSONATION,42.349780,-71.134230,boston,2018-01-01 00:00:00,2018


In [9]:
# data_2014 = pd.DataFrame()
# data_2015 = pd.DataFrame()
# data_2016 = pd.DataFrame()
# data_2017 = pd.DataFrame()
# data_2018 = pd.DataFrame()

# data_2014 = data.loc[data["years"] == 2014]
# print(data_2014)
# data_2015 = data.loc[data["years"] == 2015]
# print(data_2015)
# data_2016 = data.loc[data["years"] == 2016]
# print(data_2016)
# data_2017 = data.loc[data["years"] == 2017]
# print(data_2017)
# data_2018 = data.loc[data["years"] == 2018]
# data_2018

In [10]:
# def reformat_datetime(df):
#     datetimes = []
#     for d,t  in zip(df["date"], df["Time Occurred"]):
# #         print(d)
# #         print(type(d))
#     #     datetime = (datetime.strftime(d, format='%Y-%m-%d') + " " + str(t))
#         d = datetime.strptime(str(d), "%Y-%m-%d %H:%M:%S")
#         date_time = datetime.strftime(d, format='%Y-%m-%d') + "T" + str((int(t/100))) + ":00:00"
#     #     datetime = d + " " + t
#         datetimes.append(date_time)
#     df["date_time"] = datetimes
#     return df

def reformat_datetime(df):
    dates = []
    times = []
    for d in df["date"]:
        dy = datetime.strftime(d, format='%Y-%m-%d')
        ti = datetime.strftime(d, format='%H')
        dates.append(dy)
        times.append(ti)
    df["date"] = dates
    df["time"] = times
    return df

In [11]:
data_boston = reformat_datetime(data)
data_boston

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OCCURRED_ON_DATE,OFFENSE_DESCRIPTION,latitude,longitude,city,date,years,time
118,I192069941,1102,2016-06-21 12:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.339170,-71.103583,boston,2016-06-21,2016,12
119,I192069941,619,2016-06-21 12:00:00,LARCENY ALL OTHERS,42.339170,-71.103583,boston,2016-06-21,2016,12
341,I192069688,619,2017-12-10 20:00:00,LARCENY ALL OTHERS,42.311628,-71.080943,boston,2017-12-10,2017,20
342,I192069688,1102,2017-12-10 20:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.311628,-71.080943,boston,2017-12-10,2017,20
386,I192069629,619,2017-03-19 17:00:00,LARCENY ALL OTHERS,42.303441,-71.066746,boston,2017-03-19,2017,17
387,I192069629,1102,2017-03-19 17:00:00,FRAUD - FALSE PRETENSE / SCHEME,42.303441,-71.066746,boston,2017-03-19,2017,17
534,I192069452,619,2017-09-12 15:00:00,LARCENY ALL OTHERS,42.326817,-71.071110,boston,2017-09-12,2017,15
834,I192069083,1107,2017-08-01 00:00:00,FRAUD - IMPERSONATION,42.336112,-71.046654,boston,2017-08-01,2017,00
970,I192068943,1109,2018-02-28 00:00:00,FRAUD - WIRE,42.370818,-71.039291,boston,2018-02-28,2018,00
1326,I192068538,1107,2018-01-01 00:00:00,FRAUD - IMPERSONATION,42.349780,-71.134230,boston,2018-01-01,2018,00





----

# Final view of date_time into weather data datetime format

In [12]:
data_boston.rename(columns={"OFFENSE_CODE": "code"}, inplace=True)
data_boston = data_boston[["city", "code", "date", "time", "latitude", "longitude"]].copy()
# data_boston.to_csv (r'data/Boston/data_boston.csv', index = None, header=True)

data_boston

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,city,code,date,time,latitude,longitude
118,boston,1102,2016-06-21,12,42.339170,-71.103583
119,boston,619,2016-06-21,12,42.339170,-71.103583
341,boston,619,2017-12-10,20,42.311628,-71.080943
342,boston,1102,2017-12-10,20,42.311628,-71.080943
386,boston,619,2017-03-19,17,42.303441,-71.066746
387,boston,1102,2017-03-19,17,42.303441,-71.066746
534,boston,619,2017-09-12,15,42.326817,-71.071110
834,boston,1107,2017-08-01,00,42.336112,-71.046654
970,boston,1109,2018-02-28,00,42.370818,-71.039291
1326,boston,1107,2018-01-01,00,42.349780,-71.134230


In [None]:
# df_years = [data_2014, data_2015, data_2016, data_2017, data_2018]

# for i, yr in enumerate(df_years):
#     df = reformat_datetime(yr)
#     if i == 0:
#         data_2014_1 = df.copy()
#     elif i ==1:
#         data_2015_1 = df.copy()
#     elif i ==2:
#         data_2016_1 = df.copy()
#     elif i ==3:
#         data_2017_1 = df.copy()
#     elif i ==4:
#         data_2018_1 = df.copy()





----

# Final view of date_time into weather data datetime format

In [None]:
data_2014_1

In [None]:
data_2015_1

In [None]:
data_2016_1

In [None]:
data_2017_1

In [None]:
data_2018_1

In [None]:
# .to_csv(index=False)
data_2014_1.to_csv (r'data/Boston/data_2014.csv', index = None, header=True)
data_2015_1.to_csv (r'data/Boston/data_2015.csv', index = None, header=True)
data_2016_1.to_csv (r'data/Boston/data_2016.csv', index = None, header=True)
data_2017_1.to_csv (r'data/Boston/data_2017.csv', index = None, header=True)
data_2018_1.to_csv (r'data/Boston/data_2018.csv', index = None, header=True)