In [1]:
import pyspark

In [2]:
## Pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

## Numerical frameworks
import numpy as np
import pandas as pd

## Matplotlib
import matplotlib.pyplot as plt

## Bokeh
from bokeh.io import show, output_file, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.transform import transform
from bokeh.models import BasicTicker, ColorBar, ColumnDataSource, LinearColorMapper, PrintfTickFormatter

In [3]:
# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# SQL entry point built on that context; used below to read the CSV.
sqlContext = SQLContext(sc)

In [4]:
# Read data.csv into a Spark DataFrame: the header row supplies the column
# names and the column types are inferred from the values.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('data.csv')
)

In [5]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



In [6]:
# Count rows per "Primary Type" and list the most frequent types first.
(
    df.groupBy("Primary Type")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------------------+-------+
|        Primary Type|  count|
+--------------------+-------+
|               THEFT|1371774|
|             BATTERY|1196547|
|     CRIMINAL DAMAGE| 751428|
|           NARCOTICS| 701699|
|       OTHER OFFENSE| 406530|
|             ASSAULT| 403897|
|            BURGLARY| 379662|
| MOTOR VEHICLE THEFT| 307555|
|             ROBBERY| 249023|
|  DECEPTIVE PRACTICE| 248402|
|   CRIMINAL TRESPASS| 188642|
|        PROSTITUTION|  67754|
|   WEAPONS VIOLATION|  66762|
|PUBLIC PEACE VIOL...|  46812|
|OFFENSE INVOLVING...|  43603|
| CRIM SEXUAL ASSAULT|  25816|
|         SEX OFFENSE|  24164|
|            GAMBLING|  14234|
|INTERFERENCE WITH...|  14209|
|LIQUOR LAW VIOLATION|  13856|
+--------------------+-------+
only showing top 20 rows



### Number of crimes over the years

In [68]:
# Rows per Year, largest count first, collected into a pandas DataFrame
# with columns "Year" and "count".
year_list = (
    df.groupBy("Year")
    .count()
    .orderBy(col("count").desc())
    .toPandas()
)

In [86]:
# Bar chart of the number of crimes recorded in each year.

# Render Bokeh output inline. This is the first show() in notebook order, so
# the output mode has to be set here for the cell to work on a fresh kernel.
output_notebook()

# pandas DataFrame with one row per year ("Year", "count")
source = year_list

p = figure(plot_height = 300, plot_width = 700,
           title = 'Histogram of crimes from 2001 to 2018',
           x_axis_label = 'Years',
           y_axis_label = 'Number of crimes')

# One vertical bar per year; the bar height is that year's crime count
p.vbar(x='Year', top='count', width=1, source = source,
       line_color='black', fill_color='red', fill_alpha = 0.75,
           hover_fill_alpha = 1.0, hover_fill_color = 'navy')

# Axis and frame styling: bars start at zero, the x axis starts at 2000,
# no vertical grid lines, slanted x tick labels, no plot outline
p.y_range.start = 0
p.x_range.start = 2000
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.2
p.outline_line_color = None

# Hover tooltip showing the year and its count for the bar under the cursor
hover = HoverTool(tooltips = [('Year', '@Year'),
                             ('Num of crimes', '@count')])

# Add the hover tool to the graph
p.add_tools(hover)

show(p)

In [35]:
# bokeh basics
from bokeh.io import show, output_notebook
from bokeh.plotting import figure

# Route Bokeh output to the notebook before anything is drawn
output_notebook()

# Example data: five points for each marker type
squares_x = [1, 3, 4, 5, 8]
squares_y = [8, 7, 3, 1, 10]
circles_x = [9, 12, 4, 3, 15]
circles_y = [8, 4, 11, 6, 10]

# Empty, labelled 600x600 canvas
p = figure(
    plot_width = 600,
    plot_height = 600,
    title = 'Example Glyphs',
    x_axis_label = 'X',
    y_axis_label = 'Y',
)

# Semi-transparent navy squares, then solid red circles
p.square(squares_x, squares_y, size = 12, color = 'navy', alpha = 0.6)
p.circle(circles_x, circles_y, size = 12, color = 'red')

# Display the figure
show(p)

### Month-wise crime visualization

In [29]:
# Split the string column 'Date' on spaces; item 0 is stored as "date_only"
# and item 1 as "time_only".
split_col = pyspark.sql.functions.split(df['Date'], ' ')
df = (
    df
    .withColumn("date_only", split_col.getItem(0))
    .withColumn("time_only", split_col.getItem(1))
)

In [33]:
# Allocate a zero-filled (year x month) grid for the counts.
# NOTE(review): yr_month is built in a cell that appears further down the
# notebook; that cell has to run first.
number_of_months = 12
years = np.unique(yr_month['Year'])  # sorted distinct years
yr_month_matrix = np.zeros(shape=(years.size, number_of_months))

In [15]:
# Preview the first rows of yr_month. The recorded output below is a NameError
# because yr_month had not been created yet when this cell ran.
yr_month.head()

NameError: name 'yr_month' is not defined

In [19]:
# Preview yr_month again; in the recorded output below, Month holds month names.
yr_month.head()

Unnamed: 0,Year,Month,count
0,2018,January,19702
1,2018,March,6361
2,2018,February,16577
3,2017,March,20436
4,2017,August,24580


In [79]:
# Preview yr_month once more; in the recorded output below, Month holds month numbers.
# NOTE(review): the cell that converts month names to numbers is not visible here.
yr_month.head()

Unnamed: 0,Year,Month,count
0,2018,1,19702
1,2018,3,6361
2,2018,2,16577
3,2017,3,20436
4,2017,8,24580


In [31]:
# Rows per (Year, Month), newest years first, collected into pandas.
# NOTE(review): df_date is not defined in any visible cell — presumably the
# Spark DataFrame with a Month column added; confirm.
yr_month = (
    df_date.groupby('Year', 'Month')
    .count()
    .orderBy(col('Year').desc())
    .toPandas()
)

In [35]:
# Wrap the year-by-month grid in a DataFrame whose columns are labelled by `months`.
# NOTE(review): the only visible definition of `months` is derived from this frame
# further down; the original list of month labels must come from a cell not shown.
yr_month_matrix = pd.DataFrame(yr_month_matrix, columns = months)

In [37]:
from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)
from bokeh.plotting import figure

# Index the year-by-month table by Year, stored as strings so the years can be
# used as categorical axis labels.
# assign() builds a new frame, so the DataFrame that `data` was aliased to is
# not modified. The guard makes the cell safe to re-run: once 'Year' is the
# index there is no 'Year' column left and the step is skipped.
if 'Year' in data.columns:
    data = data.assign(Year=data['Year'].astype(str)).set_index('Year')
data.columns.name = 'Month'

# Axis categories: row labels (years) and column labels (months)
years = list(data.index)
months = list(data.columns)

In [40]:
# Reshape the wide year-by-month table into long form: one row per
# (Year, Month) pair with the cell value in a column named 'rate'.
df_yr = data.stack().to_frame('rate').reset_index()

In [36]:
# Attach the year labels as a 'Year' column of the year-by-month table.
yr_month_matrix['Year'] = years
# `data` is a second name for the same DataFrame object, not a copy.
data = yr_month_matrix

In [58]:

# Heatmap with one rectangle per (Year, Month), coloured by the 'rate' value.
# The plot data is the long-format pandas frame df_yr (Year, Month, rate);
# `df` is the Spark DataFrame and has no 'rate' column.
source = ColumnDataSource(df_yr)

# Nine-colour palette, mapped linearly between the smallest and largest value
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LinearColorMapper(palette=colors, low=df_yr.rate.min(), high=df_yr.rate.max())

# Categorical axes: years along the top, months down the side.
# NOTE(review): categorical ranges need string labels; the year index is cast to
# str elsewhere, but confirm the month column labels are strings too.
p = figure(plot_width=800, plot_height=300, title="Number of crimes per month by year",
           x_range=list(data.index), y_range=list(reversed(data.columns)),
           toolbar_location=None, tools="", x_axis_location="above")

p.rect(x="Year", y="Month", width=1, height=1, source=source,
       line_color=None, fill_color=transform('rate', mapper))

# Legend for the colour scale; tick labels are plain integers (counts, not percentages)
color_bar = ColorBar(color_mapper=mapper, location=(0, 0),
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"))

p.add_layout(color_bar, 'right')

# Minimal axis styling: no axis lines or ticks, small rotated labels
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 1.0

show(p)

E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: Count [renderer: GlyphRenderer(id='cde62474-29dd-4c43-b8c7-b946fa5f476d', ...)]
