# COGS 108 - Final Project

- Import all necessary libraries, then import csv and clean all unnecessary data

In [13]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import patsy
import statsmodels.api as sm
from scipy.stats import ttest_ind

- Read in randomly sampled crime data and drop all unrelevant columns

In [14]:
# This df_crime is complete with only na values dropped.
df_crime = pd.read_csv('./Data/incidents-100k.csv')
df_crime = df_crime[['date','is_night','type','lat','lon']]
df_crime = df_crime.dropna()
# print (df_crime)


- Check all crime types and remove all crimes that are not physical. We did this because we are relating street crimes locations with other factors such as street lamp location and unemployment rates which are location sensitive, thus all non-physical crimes are not relevant.

In [15]:
crime_type = df_crime['type'].value_counts()
print (crime_type)

DRUGS/ALCOHOL VIOLATIONS    23198
THEFT/LARCENY               13211
VEHICLE BREAK-IN/THEFT      12586
MOTOR VEHICLE THEFT         10621
BURGLARY                     8787
ASSAULT                      8012
VANDALISM                    7624
FRAUD                        5412
DUI                          4917
ROBBERY                      2243
SEX CRIMES                   2122
WEAPONS                       997
ARSON                         203
HOMICIDE                       67
Name: type, dtype: int64


- Remove FRAUD crimes

In [16]:
#This df_crime has fraud data and na data removed
df_crime = df_crime[df_crime.type != "FRAUD"]
print (df_crime)

             date  is_night                      type        lat         lon
0      2008-09-19         0                   WEAPONS  32.957337 -117.143777
2      2009-05-25         1                   ASSAULT  32.796761 -117.254577
3      2011-04-22         0  DRUGS/ALCOHOL VIOLATIONS  32.757287 -117.129870
4      2012-05-12         1                   ASSAULT  32.836098 -117.206645
5      2010-12-21         0                   ROBBERY  32.820347 -117.182419
6      2009-09-23         1                SEX CRIMES  32.707287 -117.158730
7      2008-10-12         1                  BURGLARY  32.836803 -117.208681
8      2008-06-06         0    VEHICLE BREAK-IN/THEFT  32.774932 -117.144025
9      2007-03-20         0  DRUGS/ALCOHOL VIOLATIONS  32.755078 -117.099457
10     2011-05-13         0    VEHICLE BREAK-IN/THEFT  32.791932 -117.126213
11     2011-09-12         1                     ARSON  32.749915 -117.162472
12     2007-03-16         1    VEHICLE BREAK-IN/THEFT  32.819463 -117.110901

- Next we import street_light data. We do not need to remove any additional rows

In [17]:
# This df_lights has only lat and lon data with na dropped.
df_lights = pd.read_csv('./Data/street_lights.csv')
df_lights = df_lights[['lat','lon']]
df_lights = df_lights.dropna()
print (df_lights)

             lat         lon
0      32.766618 -117.236854
1      32.766391 -117.234878
2      32.727515 -117.158183
3      32.725645 -117.154312
4      32.719670 -117.108944
5      32.719685 -117.108724
6      32.719280 -117.109084
7      32.719316 -117.108864
8      32.757065 -117.146967
9      32.756942 -117.146445
10     32.756748 -117.146439
11     32.756396 -117.146444
12     32.796264 -117.082365
13     32.796625 -117.081877
14     32.796722 -117.081907
15     32.795939 -117.082294
16     32.795658 -117.081646
17     32.794265 -117.080834
18     32.794242 -117.080516
19     32.793862 -117.081514
20     32.763081 -117.117993
21     32.763031 -117.117450
22     32.805300 -117.147254
23     32.805712 -117.147033
24     32.805524 -117.147407
25     32.679223 -117.036293
26     32.679638 -117.035682
27     32.680097 -117.034970
28     32.680408 -117.035497
29     32.680169 -117.035546
...          ...         ...
48425  32.752595 -117.250439
48426  32.727885 -117.158092
48427  32.7287

- Next we import census tracts

In [18]:
# This df_census has only relevant info with na dropped.
df_census = pd.read_csv('./Data/sandiegocensustract.csv')
df_census = df_census[['SingMother','PovertyRt','TotalPopulation','Longitude','Latitude','PollutionBurdenScore','Education','Unemployment']]
df_census = df_census.dropna()
print (df_census)


     SingMother  PovertyRt  TotalPopulation   Longitude   Latitude  \
1      0.221421      0.370           3479.0 -117.170098  33.141805   
2      0.048000      0.151           2875.0 -117.138174  32.717632   
3      0.198259      0.428           4115.0 -117.138140  32.708456   
4      0.317631      0.469           3250.0 -117.116515  32.690986   
5      0.150833      0.186           6810.0 -117.358627  33.211511   
6      0.159590      0.190           4765.0 -117.053978  33.133471   
7      0.166667      0.188           4147.0 -117.249598  33.208474   
8      0.139986      0.260           5715.0 -117.240665  33.207787   
9      0.224571      0.377           3079.0 -117.107411  32.692937   
10     0.011771      0.079           3711.0 -117.349985  33.154791   
11     0.058417      0.175           4819.0 -117.189465  33.139222   
13     0.012237      0.125           3849.0 -117.235014  32.789837   
14     0.010101      0.166           3669.0 -117.163058  32.753332   
15     0.057279     

- Next we get alcohol permit data

In [19]:
# This df_alc only has lat and lon data with na dropped.
df_alc = pd.read_csv('./Data/abs-licenses-casnd.csv')
df_alc = df_alc[['lat','lon']]
df_alc = df_alc.dropna()
print (df_alc)

            lat         lon
0     32.777847 -117.248361
1     32.916755 -117.123423
2     32.701524 -117.113723
3     32.749524 -117.117612
4     32.800508 -117.236281
5     32.788560 -117.237149
6     32.711729 -117.104456
7     32.651772 -117.097562
8     25.945487 -136.178773
9     32.701503 -117.115966
10    33.170043 -117.096125
11    33.129334 -117.089836
12    33.043428 -117.294747
13    32.748128 -117.148553
14    32.705522 -117.134833
15    33.099189 -117.002897
16    32.798226 -117.220333
17    25.945487 -136.178773
18    32.913865 -117.130063
19    32.593614 -117.046372
20    33.202693 -117.388922
21    33.198923 -117.364572
22    32.742869 -117.041554
23    32.977668 -117.230363
24    33.034459 -117.063417
25    33.032768 -117.273133
26    32.719575 -117.173374
27    25.945487 -136.178773
28    32.625898 -117.031158
29    33.046785 -116.633057
...         ...         ...
4957  33.121530 -117.082569
4958  32.795212 -116.960862
4959  33.094793 -117.056463
4960  25.945487 -136

- Here we count the crimes that match street light locations. After downsizing our sample 100x, we found that crimes do not happen under street lights. The count we found was 0. Later we will increase the sample size for a more accurate results.

In [20]:
### This is O(n^2) loop that is too slow.
# def check_lat_lon(a, b):
#     return abs(a-b) <= 0.0001
# count = 0
# for i_1, r_1 in df_crime.iterrows():
#     for i_2, r_2 in df_lights.iterrows():
#         if(check_lat_lon(r_1['lat'],r_2['lat']) and check_lat_lon(r_1['lon'],r_2['lon'])):
#             count += 1
            
# This is crime and lights data with lat and lon truncated to 4 digits after the decimal and also night only crime data. Also save untruncated crime night only.
df_crime_trun4 = df_crime.round({'lat': 4, 'lon':4})
df_crime_night = df_crime[df_crime['is_night'] == 1]
df_crime_trun4_night_only = df_crime_trun4[df_crime_trun4['is_night'] == 1]
df_lights_trun4 = df_lights.round({'lat': 4, 'lon':4})
# print (df_crime_trun4_night_only)

# This is crime data with only lat and long truncated to 4 digits and night only data.
df_crime_trun4_night_only = df_crime_trun4_night_only[['lat','lon']]

# Since our loop will take too long we have to sample only 8k data from crimes and 485 from street lights (downsized 100 times)
df_crime_rand_trun4_night_only = df_crime_trun4_night_only.sample(8000)
df_lights_rand_trun4 = df_lights_trun4.sample(485)
# print (df_crime_rand_trun4_night_only)



# new loop
# counter = 0
# count = 0
# for i_1, r_1 in df_crime_rand_trun4_night_only.iterrows():
#     counter += 485
#     print (counter)
#     for i_2, r_2 in df_lights_rand_trun4.iterrows(): 
#         if(r_1['lat'] == r_2['lat'] and r_1['lon'] == r_2['lon']):
#             count += 1


# print (count)

- Here we visualize the crimes that happen at night, street lamp locations, and alcohol permit locations

In [21]:
# import gmap functions
import gmaps
import gmaps.datasets
from itertools import product
gmaps.configure(api_key = "AIzaSyD7PjQ3edhgPcImWTk5lCcnJPyX7U_u0KU")

In [22]:
##### plot all night crimes
numpy_crime_array = list(zip(df_crime_night['lat'].tolist(), df_crime_night['lon'].tolist()))
# print (numpy_crime_array)
fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(numpy_crime_array))
fig


In [23]:
##### plot all street lamp locations
numpy_light_array = list(zip(df_lights['lat'].tolist(), df_lights['lon'].tolist()))
# print (numpy_crime_array)
fig2 = gmaps.figure()
fig2.add_layer(gmaps.heatmap_layer(numpy_light_array))
fig2


In [24]:
##### plot all alcohol permit locations  !!!!!temp set to 1000, as original is too laggy!!!!
# print (df_alc)
# numpy_alc_array = list(zip(df_alc['lat'].tolist(), df_alc['lon'].tolist()))
# print (numpy_crime_array)

alc_layer = gmaps.symbol_layer(df_alc.sample(1000),fill_color="green",stroke_color="green", scale=2)
fig3 = gmaps.figure()
fig3.add_layer(alc_layer)
fig3.add_layer(gmaps.heatmap_layer(numpy_crime_array))
fig3
