In [1]:
# Basic Imports and Reads

In [2]:
from datetime import datetime
import numpy as np
import pandas
import sklearn

TS_FORMAT  = '%Y-%m-%d %H:%M:%S'
FILE_TRAIN = 'train.csv'
FILE_TEST  = 'test.csv'
with open(FILE_TRAIN, 'r') as f:
    dt_orig = pandas.read_csv(f)
with open(FILE_TEST, 'r') as f:
    dt_test_orig = pandas.read_csv(f)

In [3]:
# Let's sample the data
dt = dt_orig.sample(frac=1)
dt_test = dt_test_orig

# Exploration of Data
Here we do a basic exploration of the types of columns, number of rows, and the type of data they contain.

In [4]:
dt

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
589945,2007-01-08 10:00:00,NON-CRIMINAL,FOUND PROPERTY,Monday,INGLESIDE,NONE,1400 Block of NOE ST,-122.431437,37.746224
225474,2012-05-05 19:45:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Saturday,BAYVIEW,NONE,100 Block of JERROLD AV,-122.371625,37.728777
289538,2011-05-31 00:01:00,NON-CRIMINAL,FOUND PROPERTY,Tuesday,SOUTHERN,NONE,0 Block of MARKET ST,-122.395098,37.794059
211107,2012-07-24 22:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Tuesday,NORTHERN,NONE,1400 Block of GREENWICH ST,-122.423637,37.800562
449747,2009-01-10 15:00:00,MISSING PERSON,MISSING ADULT,Saturday,RICHMOND,LOCATED,3600 Block of CALIFORNIA ST,-122.454923,37.786281
700788,2005-06-04 16:50:00,NON-CRIMINAL,FOUND PROPERTY,Saturday,CENTRAL,UNFOUNDED,1600 Block of THE EMBARCADERONORTH ST,-122.410774,37.808435
637736,2006-04-23 15:00:00,BURGLARY,"BURGLARY OF RESIDENCE, FORCIBLE ENTRY",Sunday,NORTHERN,NONE,1500 Block of VALLEJO ST,-122.422808,37.796848
809219,2003-11-29 21:00:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Saturday,BAYVIEW,NONE,3100 Block of SAN BRUNO AV,-122.401583,37.722584
369496,2010-03-23 18:30:00,SUSPICIOUS OCC,SUSPICIOUS OCCURRENCE,Tuesday,BAYVIEW,NONE,1500 Block of SHAFTER AV,-122.389849,37.730474
278017,2011-07-30 02:13:00,ASSAULT,BATTERY,Saturday,MISSION,NONE,100 Block of JULIAN AV,-122.420908,37.765777


In [5]:
# Dataframe Info
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878049 entries, 589945 to 557059
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 67.0+ MB


In [6]:
# Types of Crimes
print dt.Category.nunique()  # Number of unique categories
dt.groupby('Category').size().sort_values(ascending=False)

39


Category
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQ

In [7]:
district_names = list(dt.PdDistrict.unique())
process_feats = ['X', 'Y', 'DayOfWeek', 'DayOfYear', 'Month', 'Hour'] + district_names

def process(df):
    df['DateTimes'] = pandas.to_datetime(df.Dates)
    prelist = [
        df.X.values,
        df.Y.values,
        df.DateTimes.dt.dayofweek.values,
        df.DateTimes.dt.dayofyear.values,
        df.DateTimes.dt.month.values,
        df.DateTimes.dt.hour.values
    ]
    districts = pandas.get_dummies(df.PdDistrict, columns=df.PdDistrict.unique())
    result = np.hstack((np.array(prelist).T, districts.values))
    return result

In [8]:
training = process(dt)

In [9]:
# cut out validation set
from sklearn import cross_validation
classes = dt.Category.values
X_train, X_valid, y_train, y_valid = \
    cross_validation.train_test_split(training, classes, test_size=0.2, random_state=0)

In [10]:
# train the model
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, n_jobs=2)
forest = forest.fit(X_train, y_train)
#from sklearn.linear_model import LogisticRegression
#model = LogisticRegression(C=1e5).fit(X_train, y_train)

In [11]:
feature_importance = zip(process_feats, forest.feature_importances_)
for x in sorted(feature_importance, key=lambda (x, y): -y):
    print x

('DayOfYear', 0.24931370647673179)
('Y', 0.20847096477255808)
('X', 0.20586594410834294)
('Hour', 0.14209906232433034)
('DayOfWeek', 0.095720900098442863)
('Month', 0.087698691866397838)
('TARAVAL', 0.0038407364034725886)
('PARK', 0.0014071737153550465)
('INGLESIDE', 0.0010071997987660527)
('BAYVIEW', 0.00092453854052684705)
('RICHMOND', 0.00088608083191925953)
('NORTHERN', 0.00082448559870180983)
('SOUTHERN', 0.00068877845860013239)
('CENTRAL', 0.00044891202718784663)
('MISSION', 0.00044852659190232146)
('TENDERLOIN', 0.00035429838676430734)


In [12]:
# score the results
print forest.score(X_train, y_train)
print forest.score(X_valid, y_valid)

0.868741627387
0.283429189682


In [13]:
y_test = forest.predict_proba(process(dt_test))

In [14]:
submission = pandas.DataFrame(y_test, index=dt_test.index, columns=forest.classes_)
submission.to_csv('y_test.csv', index_label='Id')