In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb

In [2]:
train_df = pd.read_csv('train.csv', parse_dates=['Dates'])
test_df = pd.read_csv('test.csv', parse_dates=['Dates'])

In [32]:
# pd.Categorical.from_array(train_df.Category).categories

Index(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
       'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
       'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
       'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
       'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
       'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
       'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
       'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
       'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'],
      dtype='object')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null datetime64[ns]
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB


In [4]:
print(train_df.shape)
print(test_df.shape)
train_df.columns

(878049, 9)
(884262, 7)


Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

In [5]:
drop_col = ['Descript', 'Resolution']
train_df = train_df.drop(drop_col, axis=1)
train_df.head(3)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414


In [6]:
from sklearn import preprocessing

In [7]:
# Convert crime labels to numbers
label_crime = preprocessing.LabelEncoder()
crime = label_crime.fit_transform(train_df.Category)
np.info(crime)

class:  ndarray
shape:  (878049,)
strides:  (8,)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  True
data pointer: 0x10810a000
byteorder:  little
byteswap:  False
type: int64


In [8]:
def convert_to_vector(pandas_object):
    days = pd.get_dummies(pandas_object.DayOfWeek)
    district = pd.get_dummies(pandas_object.PdDistrict)
    hour = pandas_object.Dates.dt.hour
    hour = pd.get_dummies(hour)
    
    pandas_data = pd.concat([hour, days, district], axis=1)
    return pandas_data

In [10]:
train_data = convert_to_vector(train_df)
train_data['crime'] = crime
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 42 columns):
0             878049 non-null float64
1             878049 non-null float64
2             878049 non-null float64
3             878049 non-null float64
4             878049 non-null float64
5             878049 non-null float64
6             878049 non-null float64
7             878049 non-null float64
8             878049 non-null float64
9             878049 non-null float64
10            878049 non-null float64
11            878049 non-null float64
12            878049 non-null float64
13            878049 non-null float64
14            878049 non-null float64
15            878049 non-null float64
16            878049 non-null float64
17            878049 non-null float64
18            878049 non-null float64
19            878049 non-null float64
20            878049 non-null float64
21            878049 non-null float64
22            878049 non-null float64
23            8

In [11]:
train_data.shape

(878049, 42)

In [12]:
# do same process for test data
test_data = convert_to_vector(test_df)
test_data.shape

(884262, 41)

In [13]:
training, validation = train_test_split(train_data, train_size=.60)
print(training.shape)
print(validation.shape)

(526829, 42)
(351220, 42)


In [14]:
from sklearn.grid_search import GridSearchCV

In [15]:
def set_cv_params():
    cv_params ={}
    cv_params['max_depth'] = [3,5,7]
    cv_params['min_child_weight'] = [1,3,5]
    
    ind_params = {}
    ind_params['learning_rate'] = 0.1
    ind_params['n_estimators'] = 100
    ind_params['seed'] = 0
    ind_params['subsample'] = 0.8
    ind_params['colsample_bytree'] = 0.8
    ind_params['objective'] = 'binary:logistic'
    
    return cv_params, ind_params

In [16]:
cv_params, ind_params = set_cv_params()
print(cv_params)
print(ind_params)

{'min_child_weight': [1, 3, 5], 'max_depth': [3, 5, 7]}
{'learning_rate': 0.1, 'colsample_bytree': 0.8, 'objective': 'binary:logistic', 'seed': 0, 'subsample': 0.8, 'n_estimators': 100}


In [17]:
optimized_GBM = GridSearchCV(xgb.XGBClassifier(ind_params), 
                             cv_params, 
                             scoring='accuracy',
                            cv=5,
                            n_jobs=2)
optimized_GBM

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'subsample': 0.8, 'n_estimators': 1000, 'colsample_bytree': 0.8, 'seed': 0, 'learning_rate': 0.1, 'objective': 'binary:logistic'},
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [17]:
train_x = training.drop('crime',1)
test_x = validation.drop('crime',1)

In [18]:
print(train_x.shape)
print(test_x.shape)

(526829, 41)
(351220, 41)


In [19]:
train_y = training.crime
test_y = validation.crime

In [20]:
print(train_y.shape)
print(test_y.shape)

(526829,)
(351220,)


In [22]:
# optimized_GBM.fit(train_x, train_y)



KeyboardInterrupt: 

In [21]:
dtrain = xgb.DMatrix(train_x, label=train_y)

In [22]:
dtrain

<xgboost.core.DMatrix at 0x10b8026d8>

In [23]:
dtest = xgb.DMatrix(test_x, label=test_y)
dtest

<xgboost.core.DMatrix at 0x10b7fd208>

In [24]:
num_class = len(train_df.Category.unique())
num_class

39

In [25]:
# Set parameters for XGBoost
def set_param():
    
    # setup parameters for xgboost
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.4
    param['silent'] = 0
    param['nthread'] = 4
    param['num_class'] = num_class
    param['eval_metric'] = 'mlogloss'

    # Model complexity
    param['max_depth'] = 8 #set to 8
    param['min_child_weight'] = 1
    param['gamma'] = 0 
    param['reg_alfa'] = 0.05

    param['subsample'] = 0.8
    param['colsample_bytree'] = 0.8 #set to 1

    # Imbalanced data
    param['max_delta_step'] = 1
    
    return param

In [26]:
param = set_param()
param

{'colsample_bytree': 0.8,
 'eta': 0.4,
 'eval_metric': 'mlogloss',
 'gamma': 0,
 'max_delta_step': 1,
 'max_depth': 8,
 'min_child_weight': 1,
 'nthread': 4,
 'num_class': 39,
 'objective': 'multi:softprob',
 'reg_alfa': 0.05,
 'silent': 0,
 'subsample': 0.8}

In [27]:
watchlist = [(dtrain, 'train'), (dtest,'eval')]
watchlist

[(<xgboost.core.DMatrix at 0x10b8026d8>, 'train'),
 (<xgboost.core.DMatrix at 0x10b7fd208>, 'eval')]

In [28]:
num_round = 50

In [29]:
bst = xgb.train(param, dtrain, num_round, watchlist)

[0]	train-mlogloss:3.35637	eval-mlogloss:3.35907
[1]	train-mlogloss:3.11894	eval-mlogloss:3.12424
[2]	train-mlogloss:2.9488	eval-mlogloss:2.95654
[3]	train-mlogloss:2.83259	eval-mlogloss:2.84221
[4]	train-mlogloss:2.7571	eval-mlogloss:2.76845
[5]	train-mlogloss:2.70542	eval-mlogloss:2.71822
[6]	train-mlogloss:2.66868	eval-mlogloss:2.68294
[7]	train-mlogloss:2.64173	eval-mlogloss:2.65734
[8]	train-mlogloss:2.62173	eval-mlogloss:2.63868
[9]	train-mlogloss:2.60627	eval-mlogloss:2.6244
[10]	train-mlogloss:2.59423	eval-mlogloss:2.61352
[11]	train-mlogloss:2.58482	eval-mlogloss:2.60533
[12]	train-mlogloss:2.5775	eval-mlogloss:2.59922
[13]	train-mlogloss:2.57167	eval-mlogloss:2.59455
[14]	train-mlogloss:2.56707	eval-mlogloss:2.59103
[15]	train-mlogloss:2.5632	eval-mlogloss:2.58821
[16]	train-mlogloss:2.56005	eval-mlogloss:2.58612
[17]	train-mlogloss:2.55741	eval-mlogloss:2.58454
[18]	train-mlogloss:2.55512	eval-mlogloss:2.58335
[19]	train-mlogloss:2.5532	eval-mlogloss:2.58243
[20]	train-mlogl

In [30]:
yprob = bst.predict(dtest).reshape(test_y.shape[0], num_class)

In [31]:
yprob

array([[  4.07331710e-04,   6.38390854e-02,   6.75221614e-04, ...,
          5.00974394e-02,   3.65669839e-02,   5.78314858e-03],
       [  3.31048155e-04,   9.59921554e-02,   4.28725325e-04, ...,
          6.06774129e-02,   5.48480898e-02,   1.05009554e-02],
       [  1.26571441e-03,   5.81976324e-02,   1.57742354e-04, ...,
          5.03674559e-02,   5.15948124e-02,   8.06064624e-03],
       ..., 
       [  9.19469341e-04,   7.23265037e-02,   4.84762713e-05, ...,
          8.48911181e-02,   4.98450734e-02,   1.42293461e-02],
       [  5.26784395e-04,   1.00609533e-01,   3.97276861e-04, ...,
          5.51696122e-02,   6.51465654e-02,   1.23516237e-02],
       [  4.09121159e-04,   6.74785674e-02,   4.47829894e-04, ...,
          2.56101899e-02,   7.90461823e-02,   6.58803480e-03]], dtype=float32)

In [32]:
ylabel = np.argmax(yprob, axis=1)

In [33]:
ylabel

array([16, 21, 16, ..., 16, 21, 16])

In [34]:
yprob.shape

(351220, 39)

In [35]:
test_data.shape

(884262, 41)

In [36]:
sub_dtest = xgb.DMatrix(test_data)

In [37]:
prob_test_y = bst.predict(sub_dtest).reshape(test_data.shape[0], num_class)

In [39]:
result = pd.DataFrame(prob_test_y, columns=label_crime.classes_)

In [40]:
result.shape

(884262, 39)

In [41]:
result.head(3)

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.005941,0.084258,5.3e-05,0.000537,0.034522,0.000624,0.006118,0.052948,0.00305,0.000362,...,6e-05,0.009452,0.000184,0.039161,4e-06,0.002862,0.076119,0.111425,0.056858,0.037859
1,0.005941,0.084258,5.3e-05,0.000537,0.034522,0.000624,0.006118,0.052948,0.00305,0.000362,...,6e-05,0.009452,0.000184,0.039161,4e-06,0.002862,0.076119,0.111425,0.056858,0.037859
2,0.000959,0.082737,5.1e-05,0.000237,0.055681,0.0019,0.006135,0.03916,0.006212,0.000309,...,2.4e-05,0.005183,0.00015,0.027955,3e-06,0.003968,0.075359,0.059715,0.046672,0.00661


In [42]:
result.to_csv('xgboost.csv', index=True, index_label='Id')