In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split



In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_ids = test.Id
test = test.loc[:, 'Dates':]

data = train.append(test)
data = data.fillna(-1)
data = data.drop(['Address', 'Descript', 'Resolution'], axis=1)

data

Unnamed: 0,Category,Dates,DayOfWeek,PdDistrict,X,Y
0,WARRANTS,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
1,OTHER OFFENSES,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599
2,OTHER OFFENSES,2015-05-13 23:33:00,Wednesday,NORTHERN,-122.424363,37.800414
3,LARCENY/THEFT,2015-05-13 23:30:00,Wednesday,NORTHERN,-122.426995,37.800873
4,LARCENY/THEFT,2015-05-13 23:30:00,Wednesday,PARK,-122.438738,37.771541
5,LARCENY/THEFT,2015-05-13 23:30:00,Wednesday,INGLESIDE,-122.403252,37.713431
6,VEHICLE THEFT,2015-05-13 23:30:00,Wednesday,INGLESIDE,-122.423327,37.725138
7,VEHICLE THEFT,2015-05-13 23:30:00,Wednesday,BAYVIEW,-122.371274,37.727564
8,LARCENY/THEFT,2015-05-13 23:00:00,Wednesday,RICHMOND,-122.508194,37.776601
9,LARCENY/THEFT,2015-05-13 23:00:00,Wednesday,CENTRAL,-122.419088,37.807802


# Feature engineering

## Datetime feature

In [3]:
data['Dates'] = pd.to_datetime(data.Dates)
data['year'] = data.Dates.apply(lambda x: x.year)
data['month'] = data.Dates.apply(lambda x: x.month)
data['day'] = data.Dates.apply(lambda x: x.day)
data['hour'] = data.Dates.apply(lambda x: x.hour)

data = data.drop(['Dates'], axis=1)

## DayofWeek categorical

In [4]:
data['DayOfWeek'] = pd.Categorical(data.DayOfWeek).codes

## PdDistrict categorical

In [5]:
data['PdDistrict'] = pd.Categorical(data.PdDistrict).codes

## Category categorical

In [6]:
cates = pd.Categorical(data.Category)
cates_dict = dict(enumerate(cates.categories))
data['Category'] = cates.codes

cates_dict

{0: -1,
 1: 'ARSON',
 2: 'ASSAULT',
 3: 'BAD CHECKS',
 4: 'BRIBERY',
 5: 'BURGLARY',
 6: 'DISORDERLY CONDUCT',
 7: 'DRIVING UNDER THE INFLUENCE',
 8: 'DRUG/NARCOTIC',
 9: 'DRUNKENNESS',
 10: 'EMBEZZLEMENT',
 11: 'EXTORTION',
 12: 'FAMILY OFFENSES',
 13: 'FORGERY/COUNTERFEITING',
 14: 'FRAUD',
 15: 'GAMBLING',
 16: 'KIDNAPPING',
 17: 'LARCENY/THEFT',
 18: 'LIQUOR LAWS',
 19: 'LOITERING',
 20: 'MISSING PERSON',
 21: 'NON-CRIMINAL',
 22: 'OTHER OFFENSES',
 23: 'PORNOGRAPHY/OBSCENE MAT',
 24: 'PROSTITUTION',
 25: 'RECOVERED VEHICLE',
 26: 'ROBBERY',
 27: 'RUNAWAY',
 28: 'SECONDARY CODES',
 29: 'SEX OFFENSES FORCIBLE',
 30: 'SEX OFFENSES NON FORCIBLE',
 31: 'STOLEN PROPERTY',
 32: 'SUICIDE',
 33: 'SUSPICIOUS OCC',
 34: 'TREA',
 35: 'TRESPASS',
 36: 'VANDALISM',
 37: 'VEHICLE THEFT',
 38: 'WARRANTS',
 39: 'WEAPON LAWS'}

## Lat/Lon map

In [7]:
data['X'] = data['X'].map(lambda x: '{:.2f}'.format(x)).astype(float)
data['Y'] = data['Y'].map(lambda x: '{:.2f}'.format(x)).astype(float)

In [8]:
data

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,year,month,day,hour
0,38,6,4,-122.43,37.77,2015,5,13,23
1,22,6,4,-122.43,37.77,2015,5,13,23
2,22,6,4,-122.42,37.80,2015,5,13,23
3,17,6,4,-122.43,37.80,2015,5,13,23
4,17,6,5,-122.44,37.77,2015,5,13,23
5,17,6,2,-122.40,37.71,2015,5,13,23
6,37,6,2,-122.42,37.73,2015,5,13,23
7,37,6,0,-122.37,37.73,2015,5,13,23
8,17,6,6,-122.51,37.78,2015,5,13,23
9,17,6,1,-122.42,37.81,2015,5,13,23


## Lat/Lon categorical

In [9]:
data['X'] = pd.Categorical(data.X).codes
data['Y'] = pd.Categorical(data.Y).codes

In [10]:
data

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,year,month,day,hour
0,38,6,4,8,6,2015,5,13,23
1,22,6,4,8,6,2015,5,13,23
2,22,6,4,9,9,2015,5,13,23
3,17,6,4,8,9,2015,5,13,23
4,17,6,5,7,6,2015,5,13,23
5,17,6,2,11,0,2015,5,13,23
6,37,6,2,9,2,2015,5,13,23
7,37,6,0,14,2,2015,5,13,23
8,17,6,6,0,7,2015,5,13,23
9,17,6,1,9,10,2015,5,13,23


In [11]:
X_train = data[:len(train)]
y_train = X_train.Category
X_train = X_train.drop(['Category'], axis=1)

X_test = data[len(train):]
X_test = X_test.drop(['Category'], axis=1)

X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.15)
X_train

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,year,month,day,hour
127974,1,2,10,1,2013,9,2,23
455333,4,9,10,8,2008,12,11,8
60614,0,0,11,6,2014,7,25,14
90845,1,4,7,8,2014,3,3,7
332945,6,5,8,7,2010,10,6,0
605882,2,4,8,6,2006,10,7,0
772854,4,5,6,5,2004,5,27,19
366822,6,8,5,3,2010,4,7,10
822823,1,4,9,8,2003,9,29,19
866318,3,2,7,0,2003,2,23,20


In [12]:
d_train = xgb.DMatrix(X_train, label=y_train)
d_cv = xgb.DMatrix(X_cv, label=y_cv)

num_class = len(cates_dict)

params = {'objective': 'multi:softprob', 'eta': 0.4, 'silent': 0,
          'nthread': 4, 'eval_metric': 'mlogloss', 'max_depth': 8,
          'min_child_weight': 1, 'gamma': 0, 'reg_alfa': 0.05,
          'subsample': 0.8, 'colsample_bytree': 0.8,
          'num_class': num_class, 'max_delta_step': 1}
watchlist = [(d_train,'train'), (d_cv, 'eval')]
num_round = 20

# Train XGBoost
bst = xgb.train(params, d_train, num_round, watchlist)

[0]	train-mlogloss:3.36599	eval-mlogloss:3.36894
[1]	train-mlogloss:3.10909	eval-mlogloss:3.11603
[2]	train-mlogloss:2.91603	eval-mlogloss:2.92777
[3]	train-mlogloss:2.77957	eval-mlogloss:2.79571
[4]	train-mlogloss:2.68805	eval-mlogloss:2.70883
[5]	train-mlogloss:2.62383	eval-mlogloss:2.64864
[6]	train-mlogloss:2.57765	eval-mlogloss:2.60651
[7]	train-mlogloss:2.54297	eval-mlogloss:2.5759
[8]	train-mlogloss:2.51632	eval-mlogloss:2.55377
[9]	train-mlogloss:2.49448	eval-mlogloss:2.53624
[10]	train-mlogloss:2.47715	eval-mlogloss:2.52302
[11]	train-mlogloss:2.46301	eval-mlogloss:2.51298
[12]	train-mlogloss:2.45055	eval-mlogloss:2.50475
[13]	train-mlogloss:2.44025	eval-mlogloss:2.49881
[14]	train-mlogloss:2.43119	eval-mlogloss:2.494
[15]	train-mlogloss:2.42313	eval-mlogloss:2.49015
[16]	train-mlogloss:2.41594	eval-mlogloss:2.48713
[17]	train-mlogloss:2.40944	eval-mlogloss:2.48479
[18]	train-mlogloss:2.40293	eval-mlogloss:2.48293
[19]	train-mlogloss:2.39717	eval-mlogloss:2.48128


In [13]:
d_test = xgb.DMatrix(X_test)
y_predict = bst.predict(d_test)

In [14]:
y_test = y_predict.reshape(-1, num_class)

40

In [34]:
labels = list(cates_dict.values())
ans = pd.DataFrame(y_test, columns=labels).drop([-1], axis=1)
ans = ans.applymap(lambda x: '%.4f' % x if x > 0.0001 else '0')

In [35]:
ans.to_csv('data/submit.csv', index_label='Id')