# Assignment 2

### Imports

In [59]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import numpy as np
import seaborn as sns
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.models import Legend
from bokeh.core.properties import value
from bokeh.transform import factor_cmap

# select a palette
from bokeh.palettes import Spectral3
from bokeh.palettes import Category20b_13 as palette
from bokeh.palettes import Category20b_14 as palette2
# itertools handles the cycling
import itertools  


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn import tree

sns.set(style='darkgrid', palette='muted', color_codes=True)

import warnings
warnings.filterwarnings('ignore')


# Magic command useful for jupyter notebook
%matplotlib inline

# Set plot size. 
plt.rcParams['figure.figsize'] = [13, 6]

# Set font size
plt.rcParams.update({'font.size': 22})

# Part 3: Data visualization

First we need to load the data and convert the __Date__ and __Time__ columns to datetime. Only incidents dated after 2010-01-01 (up to 2018-12-31) are kept.

In [56]:
# Load the raw incident reports. Dates that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Keep incidents dated after 2010-01-01 and up to 2018-12-31.
# NOTE(review): the strict '>' leaves out 2010-01-01 itself; confirm that is intended.
mask = (df['Date'] > '2010-01-01') & (df['Date'] <= '2018-12-31')
# .copy() gives an independent frame, so the column assignment below is not
# a write onto a slice of the unfiltered data (SettingWithCopy pattern).
df = df.loc[mask].copy()
df['Time'] = pd.to_datetime(df['Time'])

In [57]:
# Crime categories the analysis concentrates on.
focuscrimes = {
    'WEAPON LAWS',
    'PROSTITUTION',
    'DRIVING UNDER THE INFLUENCE',
    'ROBBERY',
    'BURGLARY',
    'ASSAULT',
    'DRUNKENNESS',
    'DRUG/NARCOTIC',
    'TRESPASS',
    'LARCENY/THEFT',
    'VANDALISM',
    'VEHICLE THEFT',
    'STOLEN PROPERTY',
    'DISORDERLY CONDUCT',
}

For our relevant dataframe only three columns are needed. They are selected here, and the __Time__ column is reduced to its hour component (0–23).

In [60]:
# Keep the three columns needed for the hourly analysis and replace the Time
# datetime with its hour component. .assign returns a new frame, so nothing
# is written onto a slice of df (avoids the SettingWithCopy pattern) and the
# cell can be re-run safely from df.
df_hour = df[['Category', 'Time', 'IncidntNum']].assign(
    Time=lambda frame: frame['Time'].dt.hour
)

In [6]:
# Inspect the selected columns: one row per incident, Time holding the hour.
df_hour

Unnamed: 0,Category,Time,IncidntNum
0,MISSING PERSON,16,110308742
2,SECONDARY CODES,2,130132311
5,OTHER OFFENSES,21,130407330
6,LARCENY/THEFT,12,136080803
13,NON-CRIMINAL,11,100125879
...,...,...,...
2215012,BURGLARY,17,110338937
2215016,MISSING PERSON,13,130291616
2215017,ASSAULT,0,100403681
2215020,VEHICLE THEFT,23,130355795


In order to get a dataframe with crimes as columns and hour as index, the pandas function **pivot_table** is applied. This function gives us the correct columns and rows and then counts the number of **IncidntNum** entries for each combination of **Time** and **Category**. 

In [7]:
# Reshape to hour (rows) x Category (columns), counting IncidntNum entries per
# cell; hour/category combinations with no incidents are filled with 0.
df_hour = (
    df_hour
    .pivot_table(values='IncidntNum', index=['Time'], columns=['Category'], aggfunc='count')
    .fillna(0)
)

In [8]:
# Inspect the pivoted counts: index is the hour, one column per Category.
df_hour

Category,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,"SEX OFFENSES, NON FORCIBLE",STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,147.0,5967.0,47.0,29.0,1943.0,247.0,408.0,1665.0,403.0,265.0,...,42.0,331.0,27.0,3046.0,0.0,315.0,3568.0,1808.0,2078.0,676.0
1,135.0,5078.0,1.0,17.0,1344.0,183.0,377.0,986.0,388.0,5.0,...,3.0,253.0,18.0,1457.0,0.0,227.0,2501.0,1215.0,1405.0,487.0
2,139.0,4603.0,1.0,20.0,1520.0,156.0,335.0,794.0,352.0,2.0,...,1.0,221.0,18.0,1292.0,2.0,283.0,2327.0,916.0,1149.0,418.0
3,144.0,2470.0,1.0,11.0,1604.0,86.0,160.0,587.0,134.0,7.0,...,1.0,175.0,11.0,882.0,0.0,235.0,1690.0,603.0,910.0,290.0
4,128.0,1535.0,1.0,9.0,1432.0,72.0,58.0,436.0,71.0,8.0,...,2.0,148.0,8.0,606.0,0.0,178.0,1162.0,506.0,714.0,198.0
5,108.0,1325.0,0.0,7.0,1258.0,285.0,38.0,247.0,25.0,7.0,...,3.0,124.0,10.0,503.0,1.0,424.0,955.0,504.0,559.0,102.0
6,64.0,1676.0,1.0,6.0,1121.0,567.0,43.0,517.0,46.0,4.0,...,3.0,128.0,20.0,759.0,1.0,818.0,1115.0,770.0,855.0,143.0
7,52.0,2425.0,2.0,8.0,1534.0,485.0,29.0,1204.0,100.0,15.0,...,3.0,185.0,18.0,1201.0,1.0,809.0,1418.0,1138.0,1623.0,273.0
8,53.0,3586.0,15.0,15.0,2316.0,359.0,27.0,1604.0,87.0,95.0,...,10.0,212.0,24.0,1959.0,0.0,711.0,2076.0,1659.0,1895.0,329.0
9,50.0,3859.0,14.0,18.0,2151.0,254.0,46.0,1846.0,108.0,93.0,...,12.0,239.0,40.0,2230.0,0.0,630.0,1976.0,1662.0,2191.0,429.0


The remaining thing to do is to normalize the columns. This is done by creating a pandas Series where the index is each crime category and the values are the total number of incidents of that crime. 

In [61]:
# Number of rows in df for each Category; used as the per-column divisor
# when normalizing the hourly counts.
incidents_by_category = df.groupby('Category')
total = incidents_by_category.size()
total

Category
ARSON                            2171
ASSAULT                        106737
BAD CHECKS                        313
BRIBERY                           520
BURGLARY                        48143
DISORDERLY CONDUCT               4802
DRIVING UNDER THE INFLUENCE      3376
DRUG/NARCOTIC                   48039
DRUNKENNESS                      5033
EMBEZZLEMENT                     1394
EXTORTION                         372
FAMILY OFFENSES                   571
FORGERY/COUNTERFEITING           6647
FRAUD                           22983
GAMBLING                          147
KIDNAPPING                       2987
LARCENY/THEFT                  299641
LIQUOR LAWS                      1632
LOITERING                         653
MISSING PERSON                  37601
NON-CRIMINAL                   146514
OTHER OFFENSES                 163238
PORNOGRAPHY/OBSCENE MAT            33
PROSTITUTION                     5815
RECOVERED VEHICLE                5153
ROBBERY                         29738
RUN

The **df_hour** dataframe is now normalized by dividing each column by the matching entry of the **total** pandas Series.

In [10]:
# Divide every Category column by that category's total so each column holds
# the fraction of its incidents per hour. total is matched to the columns by label.
# Gotcha: df_hour is overwritten, so re-running this cell divides a second time.
df_hour = df_hour.divide(total, axis='columns')
df_hour

Category,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,"SEX OFFENSES, NON FORCIBLE",STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.067711,0.055904,0.15016,0.055769,0.040359,0.051437,0.120853,0.034659,0.080072,0.1901,...,0.17284,0.043398,0.04232,0.063355,0.0,0.028363,0.054167,0.036015,0.03973,0.052582
1,0.062183,0.047575,0.003195,0.032692,0.027917,0.038109,0.111671,0.020525,0.077091,0.003587,...,0.012346,0.033172,0.028213,0.030305,0.0,0.020439,0.037969,0.024202,0.026863,0.037881
2,0.064026,0.043125,0.003195,0.038462,0.031573,0.032486,0.09923,0.016528,0.069938,0.001435,...,0.004115,0.028976,0.028213,0.026873,0.142857,0.025482,0.035327,0.018246,0.021968,0.032514
3,0.066329,0.023141,0.003195,0.021154,0.033317,0.017909,0.047393,0.012219,0.026624,0.005022,...,0.004115,0.022945,0.017241,0.018345,0.0,0.02116,0.025657,0.012011,0.017399,0.022558
4,0.058959,0.014381,0.003195,0.017308,0.029745,0.014994,0.01718,0.009076,0.014107,0.005739,...,0.00823,0.019405,0.012539,0.012605,0.0,0.016027,0.017641,0.010079,0.013651,0.015401
5,0.049747,0.012414,0.0,0.013462,0.02613,0.05935,0.011256,0.005142,0.004967,0.005022,...,0.012346,0.016258,0.015674,0.010462,0.071429,0.038178,0.014498,0.010039,0.010688,0.007934
6,0.02948,0.015702,0.003195,0.011538,0.023285,0.118076,0.012737,0.010762,0.00914,0.002869,...,0.012346,0.016782,0.031348,0.015787,0.071429,0.073654,0.016927,0.015338,0.016347,0.011123
7,0.023952,0.022719,0.00639,0.015385,0.031863,0.101,0.00859,0.025063,0.019869,0.01076,...,0.012346,0.024256,0.028213,0.02498,0.071429,0.072844,0.021527,0.022668,0.031031,0.021235
8,0.024413,0.033597,0.047923,0.028846,0.048107,0.074761,0.007998,0.03339,0.017286,0.068149,...,0.041152,0.027796,0.037618,0.040746,0.0,0.064019,0.031517,0.033046,0.036231,0.025591
9,0.023031,0.036154,0.044728,0.034615,0.044679,0.052895,0.013626,0.038427,0.021458,0.066714,...,0.049383,0.031336,0.062696,0.046383,0.0,0.056726,0.029998,0.033106,0.041891,0.03337


## Start visualizing with bokeh

We now have the dataframe we need for further analysis with Bokeh. The source object is created as well as an iterator that contains different colors of a palette for coloring the plot. 

In [49]:
# The figure's x axis is a FactorRange of hour *strings*, so the 'Time' values
# in the source must be strings too in order to match those factors.
# rename(index=str) returns a new frame and keeps the index name 'Time'.
source = ColumnDataSource(df_hour.rename(index=str))
# palette2 (Category20b_14) has one color per focus crime; the 13-color
# palette wrapped around, so two of the 14 crimes shared a color.
colors = itertools.cycle(palette2)

In [50]:
# Hour labels as strings, in index order, used as the categorical x-axis factors.
hours = list(map(str, df_hour.index.to_list()))

In [51]:
# Direct Bokeh output to the notebook, so show() renders below the cell.
output_notebook()

In [52]:
# 900x400 figure with a categorical x axis: one factor per hour label.
hour_axis = FactorRange(factors=hours)
p = figure(x_range=hour_axis, width=900, height=400)

In [53]:

bar = {}    # crime name -> the vbar renderer drawn for it
items = []  # (label, [renderer]) pairs for the custom legend

# focuscrimes is a set of strings, whose iteration order is not stable between
# kernel sessions; sorting fixes the legend order and the crime -> color pairing.
for crime, color in zip(sorted(focuscrimes), colors):
    # top=crime takes the bar heights from the source column named after the crime.
    # Bars start muted, and the muted alpha of 0 makes a muted bar invisible,
    # so a crime only shows once its legend entry is clicked.
    bar[crime] = p.vbar(x='Time', top=crime, width=0.9, source=source,
                        color=color, muted=True, muted_alpha=0.0)
    items.append((crime, [bar[crime]]))

# Build the legend by hand and attach it to the right of the figure;
# click_policy="mute" makes clicking an entry toggle the mute state of its bars.
legend = Legend(items=items, location=(0, 20), click_policy="mute")
p.add_layout(legend, 'right')

# Titles, axis labels and grid styling.
p.title.text = "Crimes per hour"
p.grid.grid_line_alpha = 0.9
p.xaxis.axis_label = 'Hour of the day'
p.yaxis.axis_label = 'Relative Frequency'
p.ygrid.band_fill_color = "olive"
p.ygrid.band_fill_alpha = 0.1

In [54]:
# Render the assembled figure; click a legend entry to toggle that crime's bars.
show(p)