# Kaggle: San Francisco Crime Classification

Predict the category of crimes that occurred in the city by the bay

From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.

In [1]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf

from sklearn import datasets, cross_validation, metrics
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.utils import np_utils

import csv
from copy import copy

from os.path import expanduser

Using TensorFlow backend.


In [28]:
def make_predictors(data):
    """Add model input columns to a raw crime DataFrame and return it.

    The frame passed in is modified in place; the same object is returned.

    Steps, in order:
      1. One-hot encode 'DayOfWeek' and 'PdDistrict'. Each indicator column
         is named with the first three characters of its source column,
         an underscore, and the category label.
      2. Parse 'Dates' into a new datetime column 'PandasDates'.
      3. Overwrite 'X' and 'Y' with sklearn's ``normalize(..., norm='l2')``.
      4. Derive 'Year', 'Month', 'Day' (day of the year), 'Hour', 'Minute'
         from 'PandasDates'.
    """
    for source_col in ('DayOfWeek', 'PdDistrict'):
        indicators = pd.get_dummies(data[source_col])
        indicator_names = source_col[0:3] + "_" + indicators.columns
        data[indicator_names] = indicators

    data['PandasDates'] = pd.to_datetime(data['Dates'])

    # NOTE(review): normalize() is called with its default axis, which scales
    # each (X, Y) row rather than each column -- confirm this is intended.
    data[['X', 'Y']] = preprocessing.normalize(data[['X', 'Y']], norm='l2')

    # (output column, datetime accessor attribute); 'Day' is day-of-year.
    date_parts = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'dayofyear'),
        ('Hour', 'hour'),
        ('Minute', 'minute'),
    )
    for new_col, dt_attr in date_parts:
        data[new_col] = getattr(data['PandasDates'].dt, dt_attr)

    return data

def make_PCA(X, n_comp):
    """Fit a PCA with ``n_comp`` components on ``X`` and return the fitted object."""
    reducer = PCA(n_components=n_comp)
    # fit() hands back the estimator itself, so it can be returned directly.
    return reducer.fit(X)

def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1,
                init_mode='glorot_uniform',
                batch_norm=True):
    """Assemble a fully connected softmax classifier (not yet compiled).

    Parameters
    ----------
    input_dim : number of input features, given to the first Dense layer.
    output_dim : number of units in the final softmax layer.
    hn : units in every hidden Dense layer.
    dp : dropout rate applied at the end of every hidden block.
    layers : number of hidden blocks added AFTER the input block, so the
        network contains ``layers + 1`` hidden Dense layers in total.
    init_mode : weight initialisation passed to every Dense layer.
    batch_norm : when true, each hidden block gets a BatchNormalization
        layer between its ReLU and its Dropout.

    Returns
    -------
    The Sequential model.
    """
    model = Sequential()

    def add_hidden_block(**first_layer_kwargs):
        # Dense -> ReLU -> (BatchNorm) -> Dropout
        model.add(Dense(hn, init=init_mode, **first_layer_kwargs))
        model.add(Activation('relu'))
        if batch_norm:
            model.add(BatchNormalization())
        model.add(Dropout(dp))

    # Only the first block declares the input width.
    add_hidden_block(input_dim=input_dim)
    for _ in range(layers):
        add_hidden_block()

    model.add(Dense(output_dim, init=init_mode))
    model.add(Activation('softmax'))

    return model


def save_model_weights(model, name):
    """Best-effort save of ``model``'s weights to the file ``name``.

    Any existing file is overwritten. An ordinary failure is reported on
    stdout together with its cause and does not propagate, so a finished
    training run is not lost to a save error. KeyboardInterrupt and
    SystemExit are deliberately not caught.

    Returns None.
    """
    try:
        model.save_weights(name, overwrite=True)
    except Exception as exc:
        # Include the cause: the bare message gave no hint of what went wrong.
        print("failed to save classifier weights: %s" % (exc,))

def load_model_weights(model, name):
    """Best-effort load of weights from the file ``name`` into ``model``.

    An ordinary failure (for example a missing file on the first run) is
    reported on stdout together with its cause and does not propagate, so
    the caller continues with the model's current weights.
    KeyboardInterrupt and SystemExit are deliberately not caught.

    Returns None.
    """
    try:
        model.load_weights(name)
    except Exception as exc:
        # Include the cause so a missing file can be told apart from a
        # shape mismatch or a corrupt file.
        print("Can't load weights! (%s)" % (exc,))


def run_model(model, batch_size, nb_epoch, lr,
              load_name='SF-crime.h5', save_name='SF-crime.h5'):
    """Compile ``model``, optionally resume from saved weights, train, save.

    Training data is not passed in: the function reads the module-level
    ``X_train`` and ``y_train_OH``, which must exist before the call.

    Parameters
    ----------
    model : network to train.
    batch_size, nb_epoch : forwarded to ``model.fit``.
    lr : learning rate for the Adam optimizer.
    load_name : weights file tried before training (via load_model_weights).
    save_name : weights file written after training (via save_model_weights).

    Returns
    -------
    The same model object, after training.
    """
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr))
    load_model_weights(model, load_name)

    # 10% of the training rows are held out by fit() for validation.
    fit_options = dict(
        nb_epoch=nb_epoch,
        batch_size=batch_size,
        validation_split=0.1,
        show_accuracy=True,
        verbose=True,
    )
    model.fit(X_train, y_train_OH, **fit_options)

    save_model_weights(model, save_name)
    return model

In [None]:
# Experiment switches: project the features with PCA / write a submission file.
use_PCA = True
save_preds = False

data = make_predictors(pd.read_csv('train.csv'))
test_data = make_predictors(pd.read_csv('test.csv'))

# Every column that is neither raw text/date input nor the target is a feature.
excluded_cols = ['DayOfWeek', 'PandasDates', 'PdDistrict', 'Category',
                 'Address', 'Dates', 'Descript', 'Resolution']
train_cols = [col for col in data.columns if col not in excluded_cols]
X = data[train_cols]

# Integer class codes for the crime category.
y = data['Category'].astype('category').cat.codes

# `.values` rather than `.as_matrix()`: same array, but `.as_matrix()` was
# removed in pandas 1.0 while `.values` exists in every pandas version.
X = X.values
if use_PCA:
    pca = make_PCA(X, 15)
    X = pca.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)

# One-hot targets; the width comes from the full label set so that all three
# matrices have the same number of columns.
n_classes = y.nunique()
y_OH = np_utils.to_categorical(y.values, n_classes)
y_train_OH = np_utils.to_categorical(y_train.values, n_classes)
y_test_OH = np_utils.to_categorical(y_test.values, n_classes)

input_dim = X.shape[1]
output_dim = y_OH.shape[1]

model = build_model(input_dim, output_dim, hn=256, dp=0.5, layers=5, init_mode='glorot_normal')

# The same file is used to resume from and to write back to.
model = run_model(model, 256, 1, 1e-2, load_name='SF-crime_FC256x5_PCA-15_train-0.5.h5', save_name='SF-crime_FC256x5_PCA-15_train-0.5.h5')

if save_preds:
    # `.values` rather than `.as_matrix()` (removed in pandas 1.0).
    X_final_test = test_data[train_cols].values
    # Project the test rows only when the model was trained on PCA features;
    # otherwise `pca` may not exist or may come from a different run.
    if use_PCA:
        X_final_test = pca.transform(X_final_test)
    pred = model.predict_proba(X_final_test, batch_size=256, verbose=1)

    # Header labels: the category names as get_dummies orders them.
    labels = list(pd.get_dummies(data['Category']).columns)

    # One row per test record: running row number, then one probability per class.
    with open('sf-nn.csv', 'w') as outf:
        fo = csv.writer(outf, lineterminator='\n')
        fo.writerow(['Id'] + labels)
        for i, p in enumerate(pred):
            fo.writerow([i] + list(p))

In [6]:
# Experiment switches: (apply PCA to the features, write a submission file).
use_PCA, save_preds = True, False

In [8]:
# Directory that holds the competition CSVs, plus the input file names.
# `path` ends with a separator so file names can be appended directly.

home = expanduser('~')
# path = str(home) + '\\Documents\\data-science\\kaggle\\sf-crime\\' # Windows
# path = str(home) + '/Documents/Personal/Summagers/kaggle/sfcrime/mkchang/' # Mac
path = home + '/Documents/Summagers/kaggle/sfcrime/mkchang/' # Linux
trainfile, testfile = 'train.csv', 'test.csv'
train_gps_file, test_gps_file = 'train_gps.csv', 'test_gps.csv'

In [9]:
# Load the raw train/test tables and their companion GPS tables, in that order.
train_data_raw, test_data_raw, train_gps, test_gps = [
    pd.read_csv(path + csv_name)
    for csv_name in (trainfile, testfile, train_gps_file, test_gps_file)
]

In [22]:
# Re-read both CSVs from `path` and run them through make_predictors.
data, test_data = [
    make_predictors(pd.read_csv(path + csv_name))
    for csv_name in (trainfile, testfile)
]

In [2]:
# %matplotlib inline

In [2]:
# from matplotlib import pyplot as plt


# from sklearn import cross_validation, preprocessing
# from os.path import expanduser, normpath
# import time
# import datetime

  "The Gtk3Agg backend is known to not work on Python 3.x with pycairo. "


## Features

In [11]:
# Work on copies so the raw frames stay untouched for later lookups.
train_data, test_data = train_data_raw.copy(), test_data_raw.copy()

In [12]:
# Drop columns that are not used directly as features. The training table has
# two extra columns ('Descript', 'Resolution') that are dropped as well.
shared_unused = ['PdDistrict', 'DayOfWeek', 'Address']

train_data.drop(['Descript', 'Resolution'] + shared_unused, inplace=True, axis=1)
test_data.drop(shared_unused, inplace=True, axis=1)

In [13]:
# Append one indicator column per police district, taken from the raw frames
# (the working copies no longer have 'PdDistrict').
train_district_flags = pd.get_dummies(train_data_raw['PdDistrict'])
train_data = pd.concat([train_data, train_district_flags], axis=1)

test_district_flags = pd.get_dummies(test_data_raw['PdDistrict'])
test_data = pd.concat([test_data, test_district_flags], axis=1)

In [14]:
# Parse the timestamp column, then expose its calendar parts as columns
# (each new column is named after the datetime attribute it comes from).
train_data['Dates'] = pd.to_datetime(train_data['Dates'])
for part in ('year', 'month', 'day', 'hour', 'minute', 'dayofyear', 'dayofweek'):
    train_data[part] = getattr(train_data['Dates'].dt, part)

In [15]:
# Same calendar columns for the test table, so it matches the training table.
test_data['Dates'] = pd.to_datetime(test_data['Dates'])
for part in ('year', 'month', 'day', 'hour', 'minute', 'dayofyear', 'dayofweek'):
    test_data[part] = getattr(test_data['Dates'].dt, part)

In [16]:
# Attach the altitude column of each GPS table to its data table as 'Z'.
for frame, gps in ((train_data, train_gps), (test_data, test_gps)):
    frame['Z'] = gps['altitude (ft)']

In [17]:
# Drop training rows with incorrect coordinates: they are recorded with Y == 90.
valid_rows = train_data['Y'] != 90
train_data = train_data[valid_rows]

In [18]:
# Columns that go into the training set: time parts, coordinates, then the
# police-district indicator columns.
time_features = ['dayofyear', 'dayofweek', 'hour']
coordinate_features = ['X', 'Y', 'Z']
district_features = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
                     'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
features = time_features + coordinate_features + district_features

In [19]:
# Select model inputs and the target by column label. `.loc` replaces `.ix`,
# which was deprecated and then removed from pandas; for an all-rows slice
# with string column labels the two select the same data.
X_all = train_data.loc[:, features]
y_all = train_data.loc[:, 'Category']
X_test = test_data.loc[:, features]

In [20]:
# Split the labelled data 50/50 into training and cross-validation sets;
# the fixed seed keeps the split reproducible.
split = cross_validation.train_test_split(X_all, y_all, test_size=.5, random_state=42)
X_train, X_cv, y_train, y_cv = split

In [15]:
# # polarize data
#     if tod:
#         times = index.hour
#         tody = np.cos(2*np.pi*times/24)
#         todx = np.sin(2*np.pi*times/24)     
        
#         X_train[:,2] = tody[shuffling][:n_points]
#         X_train[:,3] = todx[shuffling][:n_points]
        
#         X_test[:,2] = tody[shuffling][n_points:]
#         X_test[:,3] = todx[shuffling][n_points:]

In [35]:
# Experiment switches for this run: no PCA projection, no submission file.
use_PCA = False
save_preds = False

data = make_predictors(pd.read_csv('train.csv'))
test_data = make_predictors(pd.read_csv('test.csv'))

# Every column that is neither raw text/date input nor the target is a feature.
excluded_cols = ['DayOfWeek', 'PandasDates', 'PdDistrict', 'Category',
                 'Address', 'Dates', 'Descript', 'Resolution']
train_cols = [col for col in data.columns if col not in excluded_cols]
X = data[train_cols]

# Integer class codes for the crime category.
y = data['Category'].astype('category').cat.codes

# `.values` rather than `.as_matrix()`: same array, but `.as_matrix()` was
# removed in pandas 1.0 while `.values` exists in every pandas version.
X = X.values
if use_PCA:
    pca = make_PCA(X, 15)
    X = pca.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)


In [34]:
# Echo the training matrix so the notebook displays it.
X_train

array([[  2.73106656e+01,   2.01394634e+01,   9.65228302e+00, ...,
          1.98987683e-01,  -4.13891213e-02,   1.90315428e-02],
       [ -4.37288345e+01,   2.01030402e+01,   1.69071867e+00, ...,
          1.93379623e-01,  -3.50536713e-02,   6.77769705e-03],
       [  4.33278999e+01,   1.62247845e+01,   6.72331082e-01, ...,
         -4.12522319e-01,  -7.68994396e-01,  -2.09143889e-01],
       ..., 
       [ -4.17160451e+01,   2.00465564e+01,   8.64732289e+00, ...,
          2.02399539e-01,  -7.22563617e-02,   3.02627602e-02],
       [ -1.27778111e+02,   1.70310969e+01,  -8.31883621e+00, ...,
         -4.43347427e-01,  -7.80022481e-01,  -1.37893008e-01],
       [ -9.26828310e+01,  -8.98071473e+00,  -1.46871114e+00, ...,
          9.64471407e-02,  -1.42734169e-02,  -5.49699016e-02]])

In [24]:
# One-hot encode the integer class codes. The width is taken once from the
# full label set so all three matrices have the same number of columns.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
n_classes = y.nunique()
y_OH = np_utils.to_categorical(y.values, n_classes)
y_train_OH = np_utils.to_categorical(y_train.values, n_classes)
y_test_OH = np_utils.to_categorical(y_test.values, n_classes)

In [25]:
# y_OH = np_utils.to_categorical(y_all.as_matrix(), y_all.nunique())
# y_train_OH = np_utils.to_categorical(y_train.as_matrix(), y.nunique())
# y_cv_OH = np_utils.to_categorical(y_cv.as_matrix(), y.nunique())

In [26]:
# Network widths: one input per feature column, one output per one-hot class.
input_dim, output_dim = X.shape[1], y_OH.shape[1]

In [29]:
# 256-unit hidden layers, dropout 0.5, five extra hidden blocks, Glorot-normal init.
model = build_model(
    input_dim,
    output_dim,
    hn=256,
    dp=0.5,
    layers=5,
    init_mode='glorot_normal',
)

In [30]:
# Train for one epoch (batch size 256, learning rate 1e-2). The same file is
# used to resume from and to write the weights back to.
weights_file = 'SF-crime_FC256x5_PCA-15_train-0.5.h5'
model = run_model(model, 256, 1, 1e-2, load_name=weights_file, save_name=weights_file)

Can't load weights!
Train on 395121 samples, validate on 43903 samples
Epoch 1/1
failed to save classifier weights


In [32]:
if save_preds:
    # `.values` rather than `.as_matrix()` (removed in pandas 1.0).
    X_final_test = test_data[train_cols].values
    # Project the test rows only when the model was trained on PCA features.
    # With use_PCA off, `pca` is either undefined or left over from another
    # run, and transforming would feed the model the wrong feature space.
    if use_PCA:
        X_final_test = pca.transform(X_final_test)
    pred = model.predict_proba(X_final_test, batch_size=256, verbose=1)

    # Header labels: the category names as get_dummies orders them.
    labels = list(pd.get_dummies(data['Category']).columns)

    # One row per test record: running row number, then one probability per class.
    with open('sf-nn.csv', 'w') as outf:
        fo = csv.writer(outf, lineterminator='\n')
        fo.writerow(['Id'] + labels)
        for i, p in enumerate(pred):
            fo.writerow([i] + list(p))



## Random Forest Model

In [20]:
# Imported here because no earlier cell brings the class into scope.
from sklearn.ensemble import RandomForestClassifier

# 50 trees, splits chosen by information gain (entropy).
crime_forest = RandomForestClassifier(n_estimators=50, criterion='entropy')

In [21]:
# Fit the forest on the training split; the %time magic reports how long it takes.
%time crime_forest = crime_forest.fit(X_train, y_train)

CPU times: user 1min 48s, sys: 1.1 s, total: 1min 49s
Wall time: 1min 50s


In [22]:
%%time

score_train = crime_forest.score(X_train, y_train)
score_cv = crime_forest.score(X_cv, y_cv)

# test/train
# 20/80 split Training Score: 0.944199898638 , CV Score: 0.217073344343
# 50/50 split Training Score: 0.894782517584 , CV Score: 0.242728773988
# 80/20 split Training Score: 0.943824063687 , CV Score: 0.219235806617
print ('Training Score:', score_train, ', CV Score:', score_cv) 

MemoryError: 

In [None]:
# Print (feature name, importance) pairs, most important first.
feature_importance = zip(features, crime_forest.feature_importances_)
for x in sorted(feature_importance, key=lambda pair: pair[1], reverse=True):
    print(x)

In [35]:
# Per-class probabilities from the fitted forest for every row of X_test.
prob_prediction = crime_forest.predict_proba(X_test)

In [36]:
# Wrap the probabilities in a frame: rows keyed by X_test's index, one column
# per class label known to the forest.
# NOTE(review): needs X_test to have an `.index` (a DataFrame, not a NumPy
# array) -- confirm which X_test is live when this cell runs.
submission = pd.DataFrame(prob_prediction, index=X_test.index, columns=crime_forest.classes_)

In [37]:
# Write the submission file; the frame's index goes out under the header 'Id'.
submission.to_csv('submission_2016_03_19-2.csv', index_label='Id')

### Plotting learning curves

In [None]:
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.datasets import load_digits
from sklearn.learning_curve import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    """
    Draw training and cross-validation score curves for an estimator.

    A new figure is opened. For each training-set size the mean score is
    plotted as a line and a band of one standard deviation either side of
    the mean is shaded. Training scores are red, cross-validation scores
    are green.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Training data handed to ``learning_curve``.

    y : array-like
        Target handed to ``learning_curve``.

    ylim : tuple (ymin, ymax), optional
        Limits for the y axis; left automatic when None.

    cv : optional
        Cross-validation setting, forwarded unchanged to ``learning_curve``.

    n_jobs : integer, optional
        Number of parallel jobs, forwarded to ``learning_curve`` (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, forwarded to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``.show()`` on the result.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, held_out_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean per size, std per size, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(held_out_scores, axis=1), np.std(held_out_scores, axis=1),
         "g", "Cross-validation score"),
    ]
    plt.grid()

    # Shade both bands first, then draw both lines.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt


In [None]:
%%time 
X, y = X_train, y_train
train_data = X
# estimator = crime_forest

from sklearn.ensemble import GradientBoostingClassifier
estimator = GradientBoostingClassifier()
title = "Learning Curves (Random Forest)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(train_data.shape[0], n_iter=5,
                                   test_size=0.3, random_state=0)

plot_learning_curve(estimator, title, X, y, n_jobs=4)
plt.show()

In [None]:
# try gradient boosted
# change tree sizes
# add features (Z above sea level), replace hour with time of day
# try regularization (in ensemble) to correct overfitting
# voting classifier

### Exhaustive Grid Search

In [None]:
# Split again, generate training and cross-validation features for grid search
X_grid_train, X_grid_cv, y_grid_train, y_grid_cv = cross_validation.train_test_split(X_train, 
                                                                                     y_train, 
                                                                                     test_size=0.40, 
                                                                                     random_state=1)

In [None]:
# Search space for the forest grid search, plus the metrics to tune for.
# NOTE(review): newer scikit-learn releases reject min_samples_split=1 --
# confirm against the installed version.
forest_grid = {'n_estimators': [200], 'min_samples_split': [1, 2]}
# , 'max_features': [2, 3, 5]
param_grid = [forest_grid]
scores = ['precision', 'recall']

In [None]:
%%time
clf = GridSearchCV(RandomForestClassifier(), param_grid, error_score=0, n_jobs=1)
clf.fit(X_grid_train, y_grid_train)

print(clf.best_score_, clf.best_params_)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() * 2, params))

In [None]:
# Local imports: none of these names is imported by any earlier cell.
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Run one grid search per metric in `scores`, then report on the held-out
# grid-search split.
# NOTE(review): `param_grid` as defined in this notebook holds random-forest
# parameters (n_estimators, min_samples_split); a OneVsRestClassifier(SVC())
# search presumably needs its own grid -- confirm before running.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(OneVsRestClassifier(SVC()), param_grid,
                       scoring='%s_weighted' % score)
    clf.fit(X_grid_train, y_grid_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # `cv_scores`, not `scores`: the inner loop must not rebind the list
    # that the outer loop is iterating over.
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_grid_cv, clf.predict(X_grid_cv)
    print(classification_report(y_true, y_pred))
    print()