In [136]:
# Basic Imports and Reads

In [137]:
from datetime import datetime
import numpy as np
import pandas
import sklearn

TS_FORMAT  = '%Y-%m-%d %H:%M:%S'
FILE_TRAIN = 'train.csv'
FILE_TEST  = 'test.csv'
with open(FILE_TRAIN, 'r') as f:
    dt_orig = pandas.read_csv(f)
with open(FILE_TEST, 'r') as f:
    dt_test_orig = pandas.read_csv(f)

In [138]:
# Let's sample the data
dt = dt_orig.sample(frac=1)
dt_test = dt_test_orig

# Exploration of Data
Here we do a basic exploration of the types of columns, number of rows, and the type of data they contain.

In [139]:
dt

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
447072,2009-01-24 17:00:00,LARCENY/THEFT,GRAND THEFT FROM A BUILDING,Saturday,TENDERLOIN,NONE,200 Block of MASON ST,-122.409524,37.785760
506914,2008-03-21 09:00:00,LARCENY/THEFT,PETTY THEFT FROM LOCKED AUTO,Friday,TARAVAL,NONE,MARTIN LUTHER KING JR DR / LINCOLN WY,-122.509400,37.764030
235089,2012-03-20 16:40:00,OTHER OFFENSES,RESISTING ARREST,Tuesday,TENDERLOIN,"ARREST, BOOKED",200 Block of TURK ST,-122.413259,37.782951
113155,2013-11-02 13:56:00,VEHICLE THEFT,"AUTO, GRAND THEFT OF",Saturday,INGLESIDE,"ARREST, BOOKED",100 Block of BEMIS ST,-122.430754,37.737076
479071,2008-08-18 10:45:00,LARCENY/THEFT,GRAND THEFT FROM PERSON,Monday,TENDERLOIN,"ARREST, BOOKED",100 Block of EDDY ST,-122.410135,37.784189
872700,2003-01-25 17:58:00,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Saturday,NORTHERN,NONE,400 Block of DUBOCE AV,-122.429533,37.769376
634774,2006-05-15 22:00:00,BURGLARY,"BURGLARY, HOT PROWL, UNLAWFUL ENTRY",Monday,TARAVAL,NONE,0 Block of BYXBEE ST,-122.469739,37.715188
234121,2012-03-23 01:00:00,OTHER OFFENSES,"DRIVERS LICENSE, SUSPENDED OR REVOKED",Friday,MISSION,"ARREST, CITED",18TH ST / CAPP ST,-122.418272,37.761903
313738,2011-01-14 13:34:00,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Friday,SOUTHERN,PSYCHOPATHIC CASE,800 Block of BRYANT ST,-122.403405,37.775421
742176,2004-10-28 14:40:00,WARRANTS,WARRANT ARREST,Thursday,SOUTHERN,"ARREST, BOOKED",800 Block of MARKET ST,-122.406521,37.785063


In [140]:
# Dataframe Info
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878049 entries, 447072 to 40212
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 67.0+ MB


In [141]:
# Types of Crimes
print dt.Category.nunique()  # Number of unique categories
dt.groupby('Category').size().sort_values(ascending=False)

39


Category
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQ

In [142]:
def get_season(month):
    spring = summer = fall = winter = 0
    if 3 <= month <= 5:
        spring = 1
    elif 6 <= month <= 8:
        summer = 1
    elif 9 <= month <= 11:
        fall = 1
    else:
        winter = 1
    return spring, summer, fall, winter

def process(df_orig):
    df = df_orig.copy()
    
    df['DateTimes'] = pandas.to_datetime(df.Dates)
    df['DayOfWeek'] = df.DateTimes.dt.dayofweek
    df['DayOfYear'] = df.DateTimes.dt.dayofyear
    df['Year']      = df.DateTimes.dt.year
    df['Month']     = df.DateTimes.dt.month
    df['Hour']      = df.DateTimes.dt.hour
    df['Spring'], df['Summer'], df['Fall'], df['Winter'] = zip(*df.Month.apply(get_season))
    
    df['isWeekend']      = df.DayOfWeek.apply(lambda x: 1 if x in ('SATURDAY', 'SUNDAY') else 0)
    df['isAwake']        = df.Hour.apply(lambda x: 1 if (x == 0 or 8 <= x <= 23) else 0)
    df['isIntersection'] = df.Address.apply(lambda x: 1 if '/' in x else 0)
    
    districts = pandas.get_dummies(df.PdDistrict, prefix='PD')
    
    df = pandas.concat([df, districts], axis=1)
    
    cols = [
        'Id',
        'Dates', 
        'DateTimes',
        'PdDistrict',
        'Descript',
        'Resolution',
        'Address', 
        'Category',
    ]
    for col in cols:
        if col in df.columns:
            df = df.drop(col, axis=1)
    
    return df

In [143]:
ytrain = dt.Category
xtrain = process(dt)
process_feats = xtrain.columns.values

In [144]:
# cut out validation set
from sklearn import cross_validation
classes = dt.Category.values
X_train, X_valid, y_train, y_valid = \
    cross_validation.train_test_split(np.array(xtrain), np.array(ytrain), test_size=0.4, random_state=0)

In [145]:
# train the model
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=125)
forest = forest.fit(X_train, y_train)

In [146]:
feature_importance = zip(xtrain.columns.values, forest.feature_importances_)
for x in sorted(feature_importance, key=lambda (x, y): -y):
    print x

('Y', 0.18507363201853941)
('X', 0.18310704607991266)
('DayOfYear', 0.16999440769839591)
('Hour', 0.14016816659307699)
('Year', 0.11288403537564333)
('DayOfWeek', 0.098623722209867398)
('Month', 0.052675375730069453)
('isIntersection', 0.010522848414831172)
('Summer', 0.0062388633526517913)
('Spring', 0.0059495016495823439)
('Winter', 0.0059161942762902897)
('Fall', 0.005753739526461174)
('isAwake', 0.0056946536913180914)
('PD_TENDERLOIN', 0.0038891350475714188)
('PD_SOUTHERN', 0.0020950691428629343)
('PD_NORTHERN', 0.001894556823135873)
('PD_MISSION', 0.0017713458408753241)
('PD_BAYVIEW', 0.0015931236750062318)
('PD_CENTRAL', 0.001492190003823911)
('PD_INGLESIDE', 0.0013541024150332329)
('PD_PARK', 0.0012064690022698843)
('PD_RICHMOND', 0.0011167044502051627)
('PD_TARAVAL', 0.00098511698257612319)
('isWeekend', 0.0)


In [147]:
# score the results
print forest.score(X_train, y_train)
print forest.score(X_valid, y_valid)

0.896478743577
0.288602585274


In [148]:
y_test = forest.predict_proba(process(dt_test))

In [149]:
submission = pandas.DataFrame(y_test, index=dt_test.index, columns=forest.classes_)
submission.to_csv('y_test.csv', index_label='Id')