# Kaggle: San Francisco Crime Classification

Predict the category of crimes that occurred in the city by the bay

From 1934 to 1963, San Francisco was infamous for housing some of the world's most notorious criminals on the inescapable island of Alcatraz.

Today, the city is known more for its tech scene than its criminal past. But, with rising wealth inequality, housing shortages, and a proliferation of expensive digital toys riding BART to work, there is no scarcity of crime in the city by the bay.

From Sunset to SOMA, and Marina to Excelsior, this competition's dataset provides nearly 12 years of crime reports from across all of San Francisco's neighborhoods. Given time and location, you must predict the category of crime that occurred.

In [1]:
# to add
# probability parameter
# call .decision_function(x) to get probabilities

# normalize your features
# try day of year, day or month, day of week, hour of day...
# polarize features, as done in the M&V 

In [2]:
# %matplotlib qt

In [3]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import preprocessing
# from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from os.path import expanduser, normpath
import time
import datetime

In [4]:
# Set paths for data to be imported

home = expanduser('~')
# path = str(home) + '\\Documents\\data-science\\kaggle\\sf-crime\\' # Windows
path = str(home) + '/Documents/Personal/data-science/kaggle/sf-crime/' # Mac
trainfile = 'train.csv'
testfile = 'test.csv'

In [5]:
train_data_raw = pd.read_csv(path+trainfile)
test_data_raw = pd.read_csv(path+testfile)

In [6]:
# train_data.groupby('Category').size().sort_values()

## Features

In [7]:
train_data = train_data_raw.copy()
test_data = test_data_raw.copy()

In [8]:
# Remove unnecessary features
train_data.drop(['Descript', 
                 'Resolution', 
                 'PdDistrict', 
                 'DayOfWeek', 
                 'Address'], inplace=True, axis=1)

In [9]:
# Remove unnecessary features
test_data.drop(['PdDistrict', 
                'DayOfWeek', 
                'Address'], inplace=True, axis=1)

In [10]:
t0 = time.time()

train_data['Dates'] = pd.to_datetime(train_data['Dates'])
train_data['year'] = train_data['Dates'].dt.year
train_data['month'] = train_data['Dates'].dt.month 
train_data['day'] = train_data['Dates'].dt.day
train_data['hour'] = train_data['Dates'].dt.hour
train_data['minute'] = train_data['Dates'].dt.minute

train_data['dayofyear'] = train_data['Dates'].dt.dayofyear
train_data['dayofweek'] = train_data['Dates'].dt.dayofweek

t1 = time.time()
print(t1-t0)

1.1508030891418457


In [11]:
t0 = time.time()

test_data['Dates'] = pd.to_datetime(test_data['Dates'])
test_data['year'] = test_data['Dates'].dt.year
test_data['month'] = test_data['Dates'].dt.month 
test_data['day'] = test_data['Dates'].dt.day
test_data['hour'] = test_data['Dates'].dt.hour
test_data['minute'] = test_data['Dates'].dt.minute

test_data['dayofyear'] = test_data['Dates'].dt.dayofyear
test_data['dayofweek'] = test_data['Dates'].dt.dayofweek

t1 = time.time()
print(t1-t0)

1.1432721614837646


In [12]:
# Set up 0 to 1 scaler for preprocessing features in training and test sets
min_max_scaler = preprocessing.MinMaxScaler()

In [13]:
train_data_noscale = pd.DataFrame(train_data[['Category','Dates']])
train_data.drop(['Category', 'Dates'], inplace=True, axis=1)

In [14]:
train_data_minmax = pd.DataFrame(min_max_scaler.fit_transform(train_data),
                                 index = train_data.index, 
                                 columns = train_data.columns) 
train_data_minmax[['Category','Dates']] = train_data_noscale

In [15]:
test_data_noscale= pd.DataFrame(test_data[['Id','Dates']])
test_data.drop(['Id', 'Dates'], inplace=True, axis=1)

In [16]:
test_data_minmax = pd.DataFrame(min_max_scaler.transform(test_data),
                                index = test_data.index, 
                                columns = test_data.columns) 
test_data_minmax[['Id','Dates']] = test_data_noscale

In [17]:
# generate training and cross-validation features
X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(train_data_minmax.ix[:,['dayofyear','dayofweek','hour','X','Y']], train_data_minmax.ix[:,'Category'], test_size=0.90, random_state=1)

In [18]:
X_test = test_data.ix[:,['dayofyear','dayofweek','hour','X','Y']]

In [32]:
X_train

Unnamed: 0,dayofyear,dayofweek,hour,X,Y
851426,0.359890,0.000000,0.826087,0.037475,0.001319
49216,0.714286,0.500000,0.478261,0.051077,0.001450
273072,0.653846,0.833333,0.043478,0.036002,0.001756
205326,0.642857,0.333333,0.913043,0.017927,0.001028
29954,0.975275,0.000000,0.739130,0.060910,0.001480
541039,0.725275,0.833333,0.217391,0.046059,0.000691
24336,0.049451,0.000000,0.782609,0.040555,0.001380
6209,0.282967,0.166667,0.652174,0.052839,0.001505
802399,0.021978,0.666667,0.391304,0.027224,0.000011
66688,0.486264,0.666667,0.434783,0.053156,0.000942


In [36]:
type(y_train)

pandas.core.series.Series

In [None]:
# # polarize data
#     if tod:
#         times = index.hour
#         tody = np.cos(2*np.pi*times/24)
#         todx = np.sin(2*np.pi*times/24)     
        
#         X_train[:,2] = tody[shuffling][:n_points]
#         X_train[:,3] = todx[shuffling][:n_points]
        
#         X_test[:,2] = tody[shuffling][n_points:]
#         X_test[:,3] = todx[shuffling][n_points:]

## Regression Model

In [20]:
# def MAPE(true, predicted):
#     return np.mean(np.abs((true - predicted)/true))

# def RMSE(error):
#     return (np.mean(error**2))**.05

In [21]:
# # SUPPORT VECTOR MACHINE
# t0 = time.time()

# crime_svm = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
#      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
#      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
#      verbose=0)
# crime_svm.fit(X_train, y_train)

# t1 = time.time()
# total = t1-t0
# print (total) # 19 minutes

In [22]:
# SUPPORT VECTOR MACHINE
t0 = time.time()

crime_svm = OneVsRestClassifier(SVC(kernel='rbf'))
crime_svm.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print (total)

491.14027404785156


In [23]:
t0 = time.time()

score_train = crime_svm.score(X_train, y_train)

t1 = time.time()
total = t1-t0
print ('Score:', score_train, ', Time:', total) # Score:  0.0880649625019 , Time:  0.27322959899902344

Score: 0.0352945196119 , Time: 306.5215961933136


In [24]:
t0 = time.time()

score_cv = crime_svm.score(X_cv, y_cv)

t1 = time.time()
total = t1-t0
print ('Score:', score_cv, ', Time:', total) # Score:  0.0874252605205 , Time:  1.2154865264892578

Score: 0.036250782985 , Time: 2780.063549041748


In [25]:
t0 = time.time()

output = crime_svm.predict(X_test)

t1 = time.time()
total = t1-t0
print ('Time:', total) # Time: 0.3079204559326172

Time: 2001.2667248249054


In [26]:
pd.get_dummies(output)

Unnamed: 0,LARCENY/THEFT
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [31]:
cat_uniques = train_data_raw.Category.unique()

In [None]:
headers = 'Id,' + ','.join(sorted(cat_uniques)) + '\n'
f = open('y_test.csv', 'w')
f.write(headers)
for i in xrange(len(y_test)):
    arr = [0] * 39
    arr[int(y_test)]