# Kaggle: San Francisco Crime Classification

Predict the category of crimes that occurred in the city by the bay

From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.

In [1]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf

from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.utils import np_utils

import csv
from copy import copy

from os.path import expanduser

Using TensorFlow backend.


In [2]:
def make_features(data, gps_data):
    # Convert Police district to one-hot matrix
    data = pd.concat([data, pd.get_dummies(data['PdDistrict'])], axis=1)
    
    # Extract date-time features
    data['Dates'] = pd.to_datetime(data['Dates'])
    data['year'] = data['Dates'].dt.year
    data['month'] = data['Dates'].dt.month
    data['day'] = data['Dates'].dt.day
    data['hour'] = data['Dates'].dt.hour
    data['minute'] = data['Dates'].dt.minute
    data['dayofyear'] = data['Dates'].dt.dayofyear
    data['dayofweek'] = data['Dates'].dt.dayofweek

    data['Z'] = gps_data['altitude (ft)']
    data[['X','Y','Z']] = preprocessing.normalize(data[['X','Y','Z']], norm='l2')
    
    return data

def make_PCA(X, n_comp):
    pca = PCA(n_components=n_comp)
    pca.fit(X)
    return pca

def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1,
                init_mode='glorot_uniform',
                batch_norm=True):
    model = Sequential()
    model.add(Dense(hn, input_dim=input_dim, init=init_mode))
    model.add(Activation('relu'))
    if batch_norm:
        model.add(BatchNormalization())
    model.add(Dropout(dp))

    for i in range(layers):
        model.add(Dense(hn, init=init_mode))
        model.add(Activation('relu'))
        if batch_norm:
            model.add(BatchNormalization())
        model.add(Dropout(dp))

    model.add(Dense(output_dim, init=init_mode))
    model.add(Activation('softmax'))

    return model


def save_model_weights(model, name):
    try:
        model.save_weights(name, overwrite=True)
    except:
        print("failed to save classifier weights")
    pass

def load_model_weights(model, name):
    try:
        model.load_weights(name)
    except:
        print("Can't load weights!")


def run_model(X, y, model, batch_size, nb_epoch, lr, load_name='SF-crime.h5', save_name='SF-crime.h5'):
    adam = Adam(lr=lr)
    model.compile(loss='categorical_crossentropy', optimizer=adam)
    load_model_weights(model, load_name)
    model.fit(X,
              y,
              nb_epoch=nb_epoch,
              batch_size=batch_size,
              validation_split=0.1,
              show_accuracy=True,
              verbose=True)

    save_model_weights(model, save_name)
    return model

In [3]:
use_PCA = True
save_preds = True

In [4]:
# Set paths for data to be imported

home = expanduser('~')
# path = str(home) + '\\Documents\\data-science\\kaggle\\sf-crime\\' # Windows
# path = str(home) + '/Documents/Personal/Summagers/kaggle/sfcrime/mkchang/' # Mac
path = str(home) + '/Documents/Summagers/kaggle/sfcrime/mkchang/' # Linux
trainfile = 'train.csv'
testfile = 'test.csv'
train_gps_file = 'train_gps.csv'
test_gps_file = 'test_gps.csv'

## Features

In [None]:
train_data = make_features(pd.read_csv(path+trainfile), pd.read_csv(path+train_gps_file))
test_data = make_features(pd.read_csv(path+testfile), pd.read_csv(path+test_gps_file))

In [None]:
# remove training data with incorrect latitude and longitude
train_data = train_data[train_data['Y']!=90]

In [None]:
# Decide which features to go into training set
features = ['dayofyear','dayofweek','hour','X','Y','Z','BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
       'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

In [None]:
X_all = train_data.ix[:,features]
y_all = train_data.ix[:,'Category']
X_test = test_data.ix[:,features]

In [None]:
y = y_all.astype('category').cat.codes

X = X_all.as_matrix()
if use_PCA:
    pca = make_PCA(X, 15)
    X = pca.transform(X)

X_train, X_cv, y_train, y_cv = train_test_split(X, y, train_size=.5, random_state=1)

In [None]:
y_OH = np_utils.to_categorical(y.as_matrix(), y.nunique())
y_train_OH = np_utils.to_categorical(y_train.as_matrix(), y.nunique())
# y_cv_OH = np_utils.to_categorical(y_cv.as_matrix(), y.nunique())

In [None]:
input_dim = X.shape[1]
output_dim = y_OH.shape[1]

In [None]:
model = build_model(input_dim, output_dim, hn=512, dp=0.5, layers=1, init_mode='glorot_normal')

In [13]:
%time model = run_model(X, y_OH, model, 128, 20, 1e-2, load_name='SF-crime_FC512x1_PCA-15_train-0.5.h5', save_name='SF-crime_FC512x1_PCA-15_train-0.5.h5')

Train on 790244 samples, validate on 87805 samples
Epoch 1/20
Epoch 2/20

KeyboardInterrupt: 

In [None]:
# model = run_model(X, y_OH, model, 256, 20, 1e-2, load_name='SF-crime_FC256x5_PCA-15_train-0.5.h5', save_name='SF-crime_FC256x5_PCA-15_train-0.5.h5')

In [None]:
if save_preds:
    X_final_test = X_test[features].as_matrix()
    X_final_test = pca.transform(X_final_test)
    pred = model.predict_proba(X_final_test, batch_size=256, verbose=1)

    labels = list(pd.get_dummies(train_data['Category']).columns)

    with open('sf-nn.csv', 'w') as outf:
        fo = csv.writer(outf, lineterminator='\n')
        fo.writerow(['Id'] + labels)
        for i, p in enumerate(pred):
            fo.writerow([i] + list(p))