In [1]:
# imports
import pandas as pd
import numpy as np
import requests
import gmaps
import os
import datetime

from google_key import g_key
gmaps.configure(api_key = g_key)

# setting variables
incidents_filename = "../Resources/NoHo_Street_Traffic_Collision_Data_2015_2019.csv"

In [2]:
# Load the data from the .csv

Incidents_df = pd.read_csv(incidents_filename, sep=",")
Incidents_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Reporting District,Address,Cross Street,Location,MO Codes
0,191519215,10/17/2019,10/17/2019,1520,1526,LANKERSHIM BL,ERWIN ST,"(34.1831, -118.3856)",3003 3026 3029 3037 3101 3401 3701 4015
1,191518670,10/5/2019,10/4/2019,1507,1526,ERWIN ST,LANKERSHIM BL,"(34.1831, -118.3878)",4015 3008 3025 3029 3037 3101 3401 3701
2,191515782,8/17/2019,8/17/2019,1529,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3028 3602 3101 3401 3701 4026
3,191515783,8/17/2019,8/17/2019,1530,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3026 3101 3401 3701 4026
4,191520926,11/21/2019,11/21/2019,1515,1546,RIVERSIDE DR,BECK AV,"(34.1577, -118.3834)",3003 3025 3036 3101 4026


In [None]:
# Check to see if any additional data cleaning is needed.

In [3]:
Incidents_df.dtypes

DR Number              int64
Date Reported         object
Date Occurred         object
Time Occurred          int64
Reporting District     int64
Address               object
Cross Street          object
Location              object
MO Codes              object
dtype: object

In [4]:
Incidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13078 entries, 0 to 13077
Data columns (total 9 columns):
DR Number             13078 non-null int64
Date Reported         13078 non-null object
Date Occurred         13078 non-null object
Time Occurred         13078 non-null int64
Reporting District    13078 non-null int64
Address               13078 non-null object
Cross Street          12759 non-null object
Location              13078 non-null object
MO Codes              12900 non-null object
dtypes: int64(3), object(6)
memory usage: 919.7+ KB


In [5]:
Incidents_df.index

RangeIndex(start=0, stop=13078, step=1)

In [6]:
Incidents_df.columns

Index(['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred',
       'Reporting District', 'Address', 'Cross Street', 'Location',
       'MO Codes'],
      dtype='object')

In [7]:
Incidents_df.duplicated('DR Number')

0        False
1        False
2        False
3        False
4        False
         ...  
13073    False
13074    False
13075    False
13076    False
13077    False
Length: 13078, dtype: bool

In [8]:
# The above shows no duplicate DR numbers, which is good.

In [9]:
Incidents_df.duplicated('Location').value_counts()

True     11338
False     1740
dtype: int64

In [10]:
# The above shows a lot of non-unique accident locations. Let's see how many locations there are.

Incidents_df.Location.nunique()


1740

In [11]:
# The locations field is a string. We need to take it apart and turn it into a long and lat.
# ( long, Lat )
print(Incidents_df.Location[0])
Raw_Lat, Raw_Lon = Incidents_df.Location[0].split(',')
print(Raw_Lat)
print(Raw_Lon)

(34.1831, -118.3856)
(34.1831
 -118.3856)


In [12]:
# okay, we've split it in two. Now lets get rid of the parens
Lat = Raw_Lat[1:]
print(Lat)
Lon = Raw_Lon[:-1]
print(Lon)

34.1831
 -118.3856


In [13]:
float(Lon)

-118.3856

In [14]:
# Let's add 2 new columns of map coordinates

Incidents_df['Lat'] = 1.1
Incidents_df['Lon'] = 1.1

# And fill them in

for x in range(len(Incidents_df)):
    Raw_Lat, Raw_Lon = Incidents_df.Location[x].split(',')
    Lat = Raw_Lat[1:]
    Lon = Raw_Lon[:-1]
    Incidents_df['Lat'][x] = float(Lat)
    Incidents_df['Lon'][x] = float(Lon)

Incidents_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Reporting District,Address,Cross Street,Location,MO Codes,Lat,Lon
0,191519215,10/17/2019,10/17/2019,1520,1526,LANKERSHIM BL,ERWIN ST,"(34.1831, -118.3856)",3003 3026 3029 3037 3101 3401 3701 4015,34.1831,-118.3856
1,191518670,10/5/2019,10/4/2019,1507,1526,ERWIN ST,LANKERSHIM BL,"(34.1831, -118.3878)",4015 3008 3025 3029 3037 3101 3401 3701,34.1831,-118.3878
2,191515782,8/17/2019,8/17/2019,1529,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3028 3602 3101 3401 3701 4026,34.2012,-118.3866
3,191515783,8/17/2019,8/17/2019,1530,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3026 3101 3401 3701 4026,34.2012,-118.3866
4,191520926,11/21/2019,11/21/2019,1515,1546,RIVERSIDE DR,BECK AV,"(34.1577, -118.3834)",3003 3025 3036 3101 4026,34.1577,-118.3834


In [15]:
# hmm. Lat and lon are being stored as intergers??
Incidents_df.dtypes

DR Number               int64
Date Reported          object
Date Occurred          object
Time Occurred           int64
Reporting District      int64
Address                object
Cross Street           object
Location               object
MO Codes               object
Lat                   float64
Lon                   float64
dtype: object

In [None]:
# okay. Let's map this!

# SKIP!

#locations = Incidents_df[["Lat","Lon"]]

#fig = gmaps.figure()
#fig.add_layer(gmaps.marker_layer(locations))

#fig

In [16]:
# huh. We got incidents south to LAX? That's not NoHo.

# Hollywood (not North Hollywood) is 34.101667, -118.326667, so let's find the ones with a 
# lat < 34.101667.

OutOfArea_df = Incidents_df.loc[Incidents_df.Lat < 34.101667]
OutOfArea_df

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Reporting District,Address,Cross Street,Location,MO Codes,Lat,Lon
10099,191519526,10/26/2019,10/26/2019,1310,1554,CULVER BL,MARINA,"(33.9821, -118.4281)",,33.9821,-118.4281
10410,181509635,4/17/2018,4/14/2018,1135,1552,PACIFIC,CATAMARAN,"(33.9783, -118.464)",,33.9783,-118.464


In [17]:
# And now let's get rid of them.

Cleaned_df = Incidents_df.loc[Incidents_df.Lat >= 34.101667]
Cleaned_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Reporting District,Address,Cross Street,Location,MO Codes,Lat,Lon
0,191519215,10/17/2019,10/17/2019,1520,1526,LANKERSHIM BL,ERWIN ST,"(34.1831, -118.3856)",3003 3026 3029 3037 3101 3401 3701 4015,34.1831,-118.3856
1,191518670,10/5/2019,10/4/2019,1507,1526,ERWIN ST,LANKERSHIM BL,"(34.1831, -118.3878)",4015 3008 3025 3029 3037 3101 3401 3701,34.1831,-118.3878
2,191515782,8/17/2019,8/17/2019,1529,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3028 3602 3101 3401 3701 4026,34.2012,-118.3866
3,191515783,8/17/2019,8/17/2019,1530,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3026 3101 3401 3701 4026,34.2012,-118.3866
4,191520926,11/21/2019,11/21/2019,1515,1546,RIVERSIDE DR,BECK AV,"(34.1577, -118.3834)",3003 3025 3036 3101 4026,34.1577,-118.3834


In [None]:
# let's map again.

# SKIP!

#locations = Cleaned_df[["Lat","Lon"]]

#fig = gmaps.figure()
#fig.add_layer(gmaps.marker_layer(locations))

#fig

In [18]:
# that's a sea of red. Let's pare this down to just 2019.

# so we need to convert the 'Date Occured' field to a datetime format.

Cleaned_df['Date Occurred']=pd.to_datetime(Cleaned_df['Date Occurred'])
Cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13076 entries, 0 to 13077
Data columns (total 11 columns):
DR Number             13076 non-null int64
Date Reported         13076 non-null object
Date Occurred         13076 non-null datetime64[ns]
Time Occurred         13076 non-null int64
Reporting District    13076 non-null int64
Address               13076 non-null object
Cross Street          12757 non-null object
Location              13076 non-null object
MO Codes              12900 non-null object
Lat                   13076 non-null float64
Lon                   13076 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 1.2+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [19]:
Cleaned_df['Date Occurred'][0]

Timestamp('2019-10-17 00:00:00')

In [20]:
# set the date values we want investigate.

January_1_2015 = datetime.datetime(2015, 1, 1)
January_1_2019 = datetime.datetime(2019, 1, 1)
February_1_2019 = datetime.datetime(2019, 2, 1)

April_1_2019 = datetime.datetime(2019, 4, 1)

Spring_Forward_Monday = datetime.datetime(2020, 3, 11)
Day_After_Spring_Forward_Monday = datetime.datetime(2020, 3, 12)
Day_Before_Thanksgiving = datetime.datetime(2019, 11, 27)
Thanksgiving = datetime.datetime(2019, 11, 28)
Black_Friday = datetime.datetime(2019, 11, 29)
Saturday_After_Thanksgiving = datetime.datetime(2019, 11, 30)

January_1_2020 = datetime.datetime(2020, 1, 1)
#print(January_1_2020)

In [21]:
# This is just the month of January 2019

Cleaned_5Year_df = Cleaned_df.loc[(Cleaned_df['Date Occurred']>=January_1_2015)\
               & (Cleaned_df['Date Occurred']<January_1_2020)]
Cleaned_5Year_df['DR Number'].count()

13076

In [22]:
info_box_template = """
<dl>
<dt>DR Number</dt><dd>{DR Number}</dd>
<dt>Date Occured</dt><dd>{Date Occurred}</dd>
</dl>
"""
incident_info = [info_box_template.format(**row) for index, row in Cleaned_5Year_df.iterrows()]
locations = Cleaned_5Year_df[["Lat","Lon"]]

In [None]:
# Map again

# SKIP!

#fig = gmaps.figure()
#fig.add_layer(gmaps.symbol_layer(locations, info_box_content=incident_info, fill_color='blue',scale=1))

#fig

In [23]:
# To create a heatmap, we'd want to find the unique number of incident locations.
Cleaned_5Year_df.Location.nunique()

1738

In [24]:
# Let's reindex this thing.

Cleaned_5Year_df.index=range(len(Cleaned_5Year_df))
Cleaned_5Year_df.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Reporting District,Address,Cross Street,Location,MO Codes,Lat,Lon
0,191519215,10/17/2019,2019-10-17,1520,1526,LANKERSHIM BL,ERWIN ST,"(34.1831, -118.3856)",3003 3026 3029 3037 3101 3401 3701 4015,34.1831,-118.3856
1,191518670,10/5/2019,2019-10-04,1507,1526,ERWIN ST,LANKERSHIM BL,"(34.1831, -118.3878)",4015 3008 3025 3029 3037 3101 3401 3701,34.1831,-118.3878
2,191515782,8/17/2019,2019-08-17,1529,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3028 3602 3101 3401 3701 4026,34.2012,-118.3866
3,191515783,8/17/2019,2019-08-17,1530,1505,TROOST AV,SHERMAN WY,"(34.2012, -118.3866)",3034 3006 3037 3026 3101 3401 3701 4026,34.2012,-118.3866
4,191520926,11/21/2019,2019-11-21,1515,1546,RIVERSIDE DR,BECK AV,"(34.1577, -118.3834)",3003 3025 3036 3101 4026,34.1577,-118.3834


In [25]:
# We're going to "bin" by locations.

# Create an empty dataframe
HeatBins_df = pd.DataFrame()

# Set up a list of locations.
HeatBins_df['Location'] = Cleaned_5Year_df['Location'].unique()
HeatBins_df

Unnamed: 0,Location
0,"(34.1831, -118.3856)"
1,"(34.1831, -118.3878)"
2,"(34.2012, -118.3866)"
3,"(34.1577, -118.3834)"
4,"(34.1875, -118.3921)"
...,...
1733,"(34.1595, -118.3571)"
1734,"(34.1958, -118.4105)"
1735,"(34.2051, -118.4247)"
1736,"(34.1215, -118.3855)"


In [26]:
# Lets fill in the lat and long for the mapping.

# Let's add 2 new columns of map coordinates

HeatBins_df['Lat'] = 1.1
HeatBins_df['Lon'] = 1.1

# And fill them in
for x in range(len(HeatBins_df)):
    Raw_Lat, Raw_Lon = HeatBins_df.Location[x].split(',')
    Lat = Raw_Lat[1:]
    Lon = Raw_Lon[:-1]
    HeatBins_df['Lat'][x] = float(Lat)
    HeatBins_df['Lon'][x] = float(Lon)

HeatBins_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Location,Lat,Lon
0,"(34.1831, -118.3856)",34.1831,-118.3856
1,"(34.1831, -118.3878)",34.1831,-118.3878
2,"(34.2012, -118.3866)",34.2012,-118.3866
3,"(34.1577, -118.3834)",34.1577,-118.3834
4,"(34.1875, -118.3921)",34.1875,-118.3921


In [27]:
Cleaned_5Year_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13076 entries, 0 to 13075
Data columns (total 11 columns):
DR Number             13076 non-null int64
Date Reported         13076 non-null object
Date Occurred         13076 non-null datetime64[ns]
Time Occurred         13076 non-null int64
Reporting District    13076 non-null int64
Address               13076 non-null object
Cross Street          12757 non-null object
Location              13076 non-null object
MO Codes              12900 non-null object
Lat                   13076 non-null float64
Lon                   13076 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 1.1+ MB


In [28]:
# now to create a column for the count

HeatBins_df['Count']=0

# and now iterate through the data

for x in range(len(Cleaned_5Year_df)):
    for y in range(len(HeatBins_df)):
        try:
            if Cleaned_5Year_df.Location[x]==HeatBins_df.Location[y]:
                HeatBins_df.Count[y]+=1
                break
        except:
            print(Cleaned_5Year_df['DR Number'][x])
          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [29]:
HeatBins_df

Unnamed: 0,Location,Lat,Lon,Count
0,"(34.1831, -118.3856)",34.1831,-118.3856,11
1,"(34.1831, -118.3878)",34.1831,-118.3878,11
2,"(34.2012, -118.3866)",34.2012,-118.3866,13
3,"(34.1577, -118.3834)",34.1577,-118.3834,6
4,"(34.1875, -118.3921)",34.1875,-118.3921,7
...,...,...,...,...
1733,"(34.1595, -118.3571)",34.1595,-118.3571,1
1734,"(34.1958, -118.4105)",34.1958,-118.4105,1
1735,"(34.2051, -118.4247)",34.2051,-118.4247,1
1736,"(34.1215, -118.3855)",34.1215,-118.3855,1


In [30]:
Max_Intensity = HeatBins_df.Count.max()
Weight = HeatBins_df.Count
Locations = HeatBins_df[["Lat","Lon"]]

In [31]:
fig = gmaps.figure()

# Here's our list of markers from before
fig.add_layer(gmaps.symbol_layer(locations, info_box_content=incident_info, fill_color='blue',scale=1))
# Here's our heatmap
fig.add_layer(gmaps.heatmap_layer(Locations, weights=Weight, max_intensity=Max_Intensity))

#Display the figure
fig

Figure(layout=FigureLayout(height='420px'))

In [33]:
# write out CSV file

Cleaned_5Year_df.to_csv('../Resources/Cleaned_5Year.csv', index=False)