# Basic Imports and Reads

In [3]:
from datetime import datetime
import numpy as np
import pandas
import sklearn

FILE_TRAIN = 'train.csv'
FILE_TEST  = 'test.csv'
with open(FILE_TRAIN, 'r') as f:
    dt = pandas.read_csv(f)
with open(FILE_TEST, 'r') as f:
    dt_test = pandas.read_csv(f)

# Exploration of Data
Here we do a basic exploration of the types of columns, number of rows, and the type of data they contain.

In [4]:
dt         = dt.drop(['Descript', 'Resolution'], axis=1)
categories = dt.Category.unique()
pds        = dt.PdDistrict.unique()

In [5]:
def get_street_name(address):
    if '/' in address:
        address = map(str.strip, address.split('/'))
        return 0, ','.join(address)
    tokens = address.split(' Block of ')
    return int(tokens[0]), tokens[1]

dt['Street'] = dt.Address.apply(lambda x: x.strip())
dt['Block'], dt['Streets'] = zip(*dt.Street.apply(lambda x: get_street_name(x)))

streets  = dt.Street.unique()
count_c  = dt.groupby('Category').size()
count_s  = dt.groupby('Street').size()
count_cs = dt.groupby(['Street', 'Category']).size()

In [68]:
avg_district = dt.groupby('PdDistrict')['X', 'Y'].mean()
for pd in pds:
    dt.loc[(dt['Y']==90) & (dt['PdDistrict'] == pd), 'X'] = avg_district['X'][pd]
    dt.loc[(dt['Y']==90) & (dt['PdDistrict'] == pd), 'Y'] = avg_district['Y'][pd]

In [69]:
log_st = {}
p_c_s  = {}
p_s    = {}
N = float(len(dt))
for st in streets:
    p_s[st]    = count_s[st] / N
    log_st[st] = np.log(p_s[st]) - np.log(1. - p_s[st])
    # p_c_s[st]  = np.array([count_cs[st].get(c, 0) / N / p_s[st] for c in categories])

In [70]:
# probs = []
# for k, v in p_c_s.iteritems():
#     probs.append([k] + list(v))

In [71]:
# df_probs = pandas.DataFrame(probs, columns=(['Street'] + list(categories)))
# df_probs.to_csv('p_s_c.csv')

In [74]:
DISTRICTS = dt.PdDistrict.unique()

def get_season(month):
    spring = summer = fall = winter = 0
    if 3 <= month <= 5:
        spring = 1
    elif 6 <= month <= 8:
        summer = 1
    elif 9 <= month <= 11:
        fall = 1
    else:
        winter = 1
    return spring, summer, fall, winter

def process(df_orig, log_st):
    df = df_orig.copy()
    
    df['DateTimes'] = pandas.to_datetime(df.Dates)
    df['DayOfWeek'] = df.DateTimes.dt.dayofweek
    df['DayOfYear'] = df.DateTimes.dt.dayofyear
    df['Year']      = df.DateTimes.dt.year
    df['Month']     = df.DateTimes.dt.month
    df['Hour']      = df.DateTimes.dt.hour
    df['Spring'], df['Summer'], df['Fall'], df['Winter'] = zip(*df.Month.apply(get_season))
    
    df['isWeekend']      = df.DayOfWeek.apply(lambda x: 1 if x in ('Saturday', 'Sunday') else 0)
    df['isAwake']        = df.Hour.apply(lambda x: 1 if (x == 0 or 8 <= x <= 23) else 0)
    df['isIntersection'] = df.Address.apply(lambda x: 1 if '/' in x else 0)
    df['log_street']     = df.Street.apply(lambda x: log_st.get(x, 0))
    
    # streets   = df['Streets'].str.get_dummies(sep=',')
    districts = pandas.get_dummies(df.PdDistrict, prefix='PD')
    df = pandas.concat([df, districts], axis=1)
    for d in DISTRICTS:
        if 'PD_%s' % d not in df.columns:
            df[d] = 0
    
    cols = [
        'Id',
        'Dates', 
        'DateTimes',
        'PdDistrict',
        'Address', 
        'Street',
        'Streets',
        'Category',
    ]
    for col in cols:
        if col in df.columns:
            df = df.drop(col, axis=1)
    
    return df

In [75]:
ytrain = dt.Category
%time xtrain = process(dt, log_st)
features = xtrain.columns.values
process_feats = xtrain.columns.values

CPU times: user 6.82 s, sys: 974 ms, total: 7.8 s
Wall time: 8.02 s


In [76]:
# cut out validation set
from sklearn import cross_validation
classes = dt.Category.values
X_train, X_valid, y_train, y_valid = \
    cross_validation.train_test_split(np.array(xtrain), np.array(ytrain), test_size=0.4, random_state=0)
    
# X_train, y_train = np.array(xtrain), np.array(ytrain)

In [97]:
# train the model
from sklearn.ensemble import RandomForestClassifier
N_EST = 100
RSEED = None
%time forest = RandomForestClassifier(n_estimators=N_EST, random_state=RSEED, criterion='entropy').fit(X_train, y_train)

CPU times: user 4min 52s, sys: 14 s, total: 5min 6s
Wall time: 5min 15s


In [98]:
feature_importance = zip(features, forest.feature_importances_)
for x in sorted(feature_importance, key=lambda (x, y): -y):
    print x

('DayOfYear', 0.13049041647223342)
('Hour', 0.11798803710014039)
('Y', 0.11654928908509982)
('X', 0.11467884986422151)
('log_street', 0.10825726162467335)
('Year', 0.10270361913063486)
('DayOfWeek', 0.089319995920925307)
('Month', 0.061787877709017947)
('Block', 0.053403115594212271)
('isIntersection', 0.015026707736697857)
('Summer', 0.013353588361234616)
('Spring', 0.013018017759018043)
('Winter', 0.012511904852853764)
('Fall', 0.012134753801491607)
('isAwake', 0.0084347589132932436)
('PD_TENDERLOIN', 0.0054084176197006593)
('PD_MISSION', 0.0038162592037497577)
('PD_SOUTHERN', 0.0033584654217984418)
('PD_NORTHERN', 0.0031867676505366897)
('PD_BAYVIEW', 0.0029921845579730373)
('PD_INGLESIDE', 0.0027297958081412528)
('PD_PARK', 0.0025647315937376287)
('PD_CENTRAL', 0.0023455514847980584)
('PD_TARAVAL', 0.0021667914950455084)
('PD_RICHMOND', 0.0017728412387710392)
('isWeekend', 0.0)


In [99]:
# score the results
%time p_train = forest.predict_proba(X_train)
%time p_valid = forest.predict_proba(X_valid)
print '-' * 10
print 'Train: %.6f' % sklearn.metrics.log_loss(y_train, p_train)
print 'Valid: %.6f' % sklearn.metrics.log_loss(y_valid, p_valid)
print '-' * 10
print 'N_est=%d' % N_EST
print 'Features:', ', '.join(features)

CPU times: user 57.9 s, sys: 37.3 s, total: 1min 35s
Wall time: 1min 56s
CPU times: user 40.3 s, sys: 31 s, total: 1min 11s
Wall time: 1min 33s
----------
Training: 0.446104
CV      : 5.287879
----------
N_est=100
Features: DayOfWeek, X, Y, Block, DayOfYear, Year, Month, Hour, Spring, Summer, Fall, Winter, isWeekend, isAwake, isIntersection, log_street, PD_BAYVIEW, PD_CENTRAL, PD_INGLESIDE, PD_MISSION, PD_NORTHERN, PD_PARK, PD_RICHMOND, PD_SOUTHERN, PD_TARAVAL, PD_TENDERLOIN


In [93]:
dt_test['Street'] = dt_test.Address.apply(lambda x: x.strip())
dt_test['Block'], dt_test['Streets'] = zip(*dt_test.Street.apply(lambda x: get_street_name(x)))
# for pd in pds:
#     dt_test.loc[(dt_test['Y']==90) & (dt_test['PdDistrict'] == pd), 'X'] = avg_district['X'][pd]
#     dt_test.loc[(dt_test['Y']==90) & (dt_test['PdDistrict'] == pd), 'Y'] = avg_district['Y'][pd]

# y_test = forest.predict_proba(process(dt_test, log_st))
# 
# y_test = []
# batch  = 500
# for i in xrange(0, len(dt_test), batch):
#     xtest = process(dt_test.ix[i:i + batch - 1], log_st)
#     y_test.extend(forest.predict_proba(xtest.to_sparse()))

In [18]:
# submission = pandas.DataFrame(y_test, index=dt_test.index, columns=forest.classes_)
# submission.to_csv('y_test.csv', index_label='Id')