In [1]:
%matplotlib inline
import pandas as pd

In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

IOError: [Errno 2] No such file or directory: 'style-table.css'

In [None]:
def readfile(filename):
    data = pd.read_csv(filename)
    return data

# change this to your local file
filename = 'NYNoise.csv'
# read file
data = readfile(filename)
# crimes match row count
data.head()

Since the data doesn't contain the full data for 2015 and 2017 we will just use the year 2016; otherwise we wuold have double data for some months.

In [None]:
#add a column with the year from the Date
data['year'] = pd.DatetimeIndex(data['Created Date']).year

In [None]:
# filter the dataset. Just keep the rows that belong to year 2016.
data2016 = data[data['year'] == 2016]

In [None]:
#check that we successfully removed the other cases
data2016.head()

In [None]:
#print number of cases in 2016
print "Total number of cases in 2016:", len(data2016)

### K Nearest Neighbors

Some of the cases are more common in certian areas of the city. We will take a llok at the most prominent ones: Construction Before/After Hours (NM1), Barking Dog (NR5), Alarms (NR3)

In [None]:
#start by creating a dataset for each type of case
#Construction
construction_data = data2016[data2016['Descriptor'] == 'Noise: Construction Before/After Hours (NM1)']
#Barking Dog
barkingDog_data = data2016[data2016['Descriptor'] == 'Noise, Barking Dog (NR5)']
#Alarms
alarms_data =  data2016[data2016['Descriptor'] == 'Noise: Alarms (NR3)']

In [None]:
construction_data.head()

Next we need to obtain the latitudes and longitudes for the plotting.

In [None]:
import math
#define function
def obtain_lat_long(data):
    latitude = [float(item) for item in data['Latitude'] if not math.isnan(item)] #we need to remove some 'NaN' values
    longitude = [float(item) for item in data['Longitude'] if not math.isnan(item)]
    geodata = {"lat": latitude,
                "lon": longitude}
    return geodata

#create dicts for each type
lat_lon_construction = obtain_lat_long(construction_data)
lat_lon_barkingdog = obtain_lat_long(barkingDog_data)
lat_lon_alarms = obtain_lat_long(alarms_data)

In [None]:
import geoplotlib 
import numpy as np
from geoplotlib.utils import BoundingBox

def plot_geodata(geodata, cmap_color):

    # plot kernel density map
    geoplotlib.kde(geodata, bw=5, cut_below=1e-4, cmap=cmap_color)
    #we define limits for the bounding box
    n = max(geodata['lat']) 
    s = min(geodata['lat'])
    e = max(geodata['lon'])
    w = min(geodata['lon']) 
    # create bounding box
    bbox = BoundingBox(north=n, south=s, west=w, east=e)
    # set bouding box
    geoplotlib.set_bbox(bbox)
    geoplotlib.inline()
    
print "Construction:"
plot_geodata(lat_lon_construction, 'hot')
print "Barking Dogs"
plot_geodata(lat_lon_barkingdog, 'hot')
print "Alarms"
plot_geodata(lat_lon_alarms, 'hot')

In [None]:
# Calculate rows for of each type
construction = construction_data.shape[0]
dogbarking = barkingDog_data.shape[0]
alarms = alarms_data.shape[0]
total = (construction + dogbarking + alarms)
print "Number of noise cases: %d " % construction
print "Amount of barking dog cases: %d" % dogbarking
print "Number of alarm: %d" % alarms
print 'Total: {}'.format(total)
print "Percentage of noise cases:", construction/(total/100)
print "Percentage of barking dog cases:", dogbarking/(total/100)
print "Percentage of alarm:", alarms/(total/100)

In [None]:
#we will work with the unbalanced data
def get_types(dataset, case_type):
    df = dataset[dataset['Descriptor'].isin(case_type)]
    # filter the dataset. Just keep the rows that don't have NaN lat/lon
    df = df[np.isfinite(df['Latitude'])]
    return df

case_types = ['Noise: Construction Before/After Hours (NM1)', 'Noise, Barking Dog (NR5)', 'Noise: Alarms (NR3)']
plotting_data = get_types(data2016, case_types)
print len(plotting_data)

In [None]:
# create a grid of SF with N*N points
N = 100
geo_data = obtain_lat_long(data2016)

#we define the size of the grid to fit the map
n = max(geo_data['lat']) 
s = min(geo_data['lat'])
e = max(geo_data['lon']) 
w = min(geo_data['lon']) 
    
#we obtain the partitions for lat and lon.
latitude_partition = np.arange(s, n, float(n-s)/N)
longitude_partition = np.arange(w, e, float(e-w)/N)
    
# reset north, south, west and east in order to set the boundingbox
n = np.mean(geo_data["lat"]) + 0.135
s = np.mean(geo_data["lat"]) - 0.15
e = np.mean(geo_data["lon"]) + 0.135
w = np.mean(geo_data["lon"]) - 0.15
    
grid = []
# create the grid
for lat in latitude_partition:
    for lon in longitude_partition:
        grid.append((lat,lon))
        
#we obtain the latitude and longitude for the dict from the grid points  
lat = [item[0] for item in grid]
lon = [item[1] for item in grid]
    
#build the dict
geographical_grid_data = {"lat": lat,
                          "lon": lon}

#create the plot
geo_data = geographical_grid_data

# plot it
geoplotlib.dot(geographical_grid_data)
bbox = BoundingBox(north = n, south = s, west = w, east = e)
geoplotlib.set_bbox(bbox)
geoplotlib.inline()


In [None]:
# create list of labels
y = list()
for case_type in plotting_data['Descriptor']:
    if case_type == "Noise: Construction Before/After Hours (NM1)":
        y.append(1)
    elif case_type == "Noise, Barking Dog (NR5)":
        y.append(2)
    elif case_type == "Noise: Alarms (NR3)":
        y.append(3)

# get latitudes for the 3 types
latitudes = [float(item) for item in plotting_data['Latitude'] if not math.isnan(item)]
# get longitudes for the 3 crimes
longitudes = [float(item) for item in plotting_data['Longitude'] if not math.isnan(item)]
# build X as a list of lists 
X = [[latitudes[item], longitudes[item]] for item in range(len(latitudes))]
print 'Len X: ', len(X)

X_test = [list(item) for item in grid] 
print 'Len X_Test: ', len(X_test)

In [None]:
from sklearn import neighbors

def KNN(X, y, X_test, num_neighbors):

    knn=neighbors.KNeighborsClassifier(n_neighbors=num_neighbors)
    #train the classifier
    knn.fit(X, y)
    # predict the labels
    return knn.predict(X_test)

In [None]:
def color_grid(Z,geo_data, N):
    num_grid_points = N*N

    colorConst = {}
    colorConst['lat'] = []
    colorConst['lon'] = []
    colorDog = {}
    colorDog['lat'] = []
    colorDog['lon'] = []
    colorAlarm = {}
    colorAlarm['lat'] = []
    colorAlarm['lon'] = []

    for i in range(num_grid_points):
        if (Z[i] == 1):
            colorConst['lat'].append(geo_data['lat'][i])
            colorConst['lon'].append(geo_data['lon'][i])
        if (Z[i] == 2):
            colorDog['lat'].append(geo_data['lat'][i])
            colorDog['lon'].append(geo_data['lon'][i])
        if (Z[i] == 3):
            colorAlarm['lat'].append(geo_data['lat'][i])
            colorAlarm['lon'].append(geo_data['lon'][i])

    # bbox dimensions
    n = np.mean(geo_data["lat"]) + 0.135
    s = np.mean(geo_data["lat"]) - 0.15
    e = np.mean(geo_data["lon"]) + 0.135
    w = np.mean(geo_data["lon"]) - 0.15
    
    geoplotlib.dot(colorConst, color='green')
    geoplotlib.dot(colorDog, color='red')
    geoplotlib.dot(colorAlarm, color='blue')
    bbox = BoundingBox(north = n, south = s, west = w, east = e)
    geoplotlib.set_bbox(bbox)
    geoplotlib.inline()

print "K=5 NEAREST NEIGHBORS:"
print "Green: Construction Before/After Hours"
print "Red: Barking Dog"
print "Blue: Alarm"
Z = KNN(X, y, X_test, 5)
color_grid(Z,geo_data,N)

print "K=10 NEAREST NEIGHBORS:"
print "Green: Construction Before/After Hours"
print "Red: Barking Dog"
print "Blue: Alarm"
Z = KNN(X, y, X_test, 10)
color_grid(Z, geo_data, N)

print "K=30 NEAREST NEIGHBORS:"
print "Green: Construction Before/After Hours"
print "Red: Barking Dog"
print "Blue: Alarm"
Z = KNN(X, y, X_test, 30)
color_grid(Z, geo_data, N)

### Machine Learning

In [None]:
from __future__ import print_function
import os
import subprocess
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#### Data preparation

In [None]:
#remove NaN ZIP codes form the data
data2016 = data2016[np.isfinite(data2016['Incident Zip'])]

#add columns for date and time
data2016['date'] = pd.DatetimeIndex(data2016['Created Date']).date
data2016['time'] = pd.DatetimeIndex(data2016['Created Date']).time

#transform date to ordinal for the classifier
data2016['date_ord']=data2016['date'].apply(lambda x: x.toordinal())

#translate time into minutes
data2016['time_n_min']=data2016['time'].apply(lambda x: x.hour*60 + x.minute + x.second/60)

#create a funtion to categorize "name" variables like Street name or Descriptor
def categorize_target(df, target_column, new_col):

    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[new_col] = df_mod[target_column].replace(map_to_int)

    return (df_mod)

#Categorize Descriptor
new_col= "Target"
data2016_descrp_cat = categorize_target(data2016, "Descriptor",new_col)

#Categorize Street Name
new_col= "street_cat"
data2016_cat = categorize_target(data2016_descrp_cat, "Street Name",new_col)

In [None]:
data2016_cat.head(1)

In [None]:
#Define function to preapre data for classifiers

def prep_data (data2016_cat, features, target,sample_size):
    
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()

    split_point = int(round(len(data2016_cat)*sample_size))

    if len(data2016_cat) > split_point : # len(df) > 10 would also work
        train_df = data2016_cat[:split_point]
        test_df = data2016_cat[split_point:]

    y = train_df[target]
    X = train_df[features]
    X_test = test_df[features]
    Y_test = test_df[target]
    
    return(X,y,X_test,Y_test)


### Decision Tree

#### Sample size 90% - 10% / Time, Date, Street Name

In [None]:
features = ['date_ord','time_n_min','street_cat']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

In [None]:
tr.fit(X, y)

In [None]:
tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

#### Time, Date, ZIP

In [None]:
features = ['date_ord','time_n_min','Incident Zip']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

tr.fit(X, y)

tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

#### Date and ZIP

In [None]:
features = ['date_ord','Incident Zip']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

tr.fit(X, y)

tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

####  ZIP

In [None]:
features = ['Incident Zip']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

tr.fit(X, y)

tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

#### Street Name

In [None]:
features = ['street_cat']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

tr.fit(X, y)

tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

#### 80-20% data - Date, time and ZIP

In [None]:
features = ['date_ord','time_n_min','Incident Zip']
target = ['Target']
sample_size = 0.8

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)
tr = tree.DecisionTreeClassifier()

tr.fit(X, y)

tr_predicted = tr.predict(X_test)

tr.score(X_test,Y_test)

### Random Forest

In [None]:
features = ['date_ord','time_n_min','Incident Zip']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)

RFC = RandomForestClassifier(n_estimators = 50)

RFC.fit(X,y)

RFC_predicted = RFC.predict(X_test)

RFC.score(X_test,Y_test)

#### Street Name

In [None]:
features = ['street_cat']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)

RFC = RandomForestClassifier(n_estimators = 50)

RFC.fit(X,y)

RFC_predicted = RFC.predict(X_test)

RFC.score(X_test,Y_test)

### K Neighbors

In [None]:
features = ['date_ord','time_n_min','Incident Zip']
target = ['Target']
sample_size = 0.9

X,y,X_test,Y_test = prep_data(data2016_cat,features,target,sample_size)

KNC = KNeighborsClassifier(n_neighbors=3)

KNC.fit(X,y)

KNC_predicted = KNC.predict(X_test)

KNC.score(X_test,Y_test)