# San-Crime Exercise author by ljs93kr
### Read train data

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import train_test_split

In [3]:
train = pd.read_csv('train.csv', parse_dates=['Dates'])
test = pd.read_csv('test.csv', parse_dates=['Dates'])

In [4]:
train.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414


### test data 에서 사용하지 않는 칼럼 잘라내기

In [5]:
drop_col = ['Descript', 'Resolution']
train = train.drop(drop_col, axis=1)
train.head(3)

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414


### Text -> Binary Array converting

In [6]:
from sklearn import preprocessing

In [7]:
# Convert crime labels to numbers
label_crime = preprocessing.LabelEncoder()
crime = label_crime.fit_transform(train.Category)
np.info(crime)

class:  ndarray
shape:  (878049,)
strides:  (8,)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  True
data pointer: 0x1178ec000
byteorder:  little
byteswap:  False
type: int64


In [8]:
#Get binaried weekdays, districts, and hours
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
days.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 7 columns):
Friday       878049 non-null float64
Monday       878049 non-null float64
Saturday     878049 non-null float64
Sunday       878049 non-null float64
Thursday     878049 non-null float64
Tuesday      878049 non-null float64
Wednesday    878049 non-null float64
dtypes: float64(7)
memory usage: 46.9 MB


In [9]:
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour)
hour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 24 columns):
0     878049 non-null float64
1     878049 non-null float64
2     878049 non-null float64
3     878049 non-null float64
4     878049 non-null float64
5     878049 non-null float64
6     878049 non-null float64
7     878049 non-null float64
8     878049 non-null float64
9     878049 non-null float64
10    878049 non-null float64
11    878049 non-null float64
12    878049 non-null float64
13    878049 non-null float64
14    878049 non-null float64
15    878049 non-null float64
16    878049 non-null float64
17    878049 non-null float64
18    878049 non-null float64
19    878049 non-null float64
20    878049 non-null float64
21    878049 non-null float64
22    878049 non-null float64
23    878049 non-null float64
dtypes: float64(24)
memory usage: 160.8 MB


In [10]:
# Concat new array
train_data = pd.concat([hour, days, district], axis=1)
train_data['crime'] = crime
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 42 columns):
0             878049 non-null float64
1             878049 non-null float64
2             878049 non-null float64
3             878049 non-null float64
4             878049 non-null float64
5             878049 non-null float64
6             878049 non-null float64
7             878049 non-null float64
8             878049 non-null float64
9             878049 non-null float64
10            878049 non-null float64
11            878049 non-null float64
12            878049 non-null float64
13            878049 non-null float64
14            878049 non-null float64
15            878049 non-null float64
16            878049 non-null float64
17            878049 non-null float64
18            878049 non-null float64
19            878049 non-null float64
20            878049 non-null float64
21            878049 non-null float64
22            878049 non-null float64
23            8

### Define Same Process Function for test data

In [11]:
def convert_to_vector(pandas_object):
    days = pd.get_dummies(pandas_object.DayOfWeek)
    district = pd.get_dummies(pandas_object.PdDistrict)
    hour = pandas_object.Dates.dt.hour
    hour = pd.get_dummies(hour)
    
    pandas_data = pd.concat([hour, days, district], axis=1)
    return pandas_data

In [12]:
train_data = convert_to_vector(train)
train_data['crime'] = crime
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 42 columns):
0             878049 non-null float64
1             878049 non-null float64
2             878049 non-null float64
3             878049 non-null float64
4             878049 non-null float64
5             878049 non-null float64
6             878049 non-null float64
7             878049 non-null float64
8             878049 non-null float64
9             878049 non-null float64
10            878049 non-null float64
11            878049 non-null float64
12            878049 non-null float64
13            878049 non-null float64
14            878049 non-null float64
15            878049 non-null float64
16            878049 non-null float64
17            878049 non-null float64
18            878049 non-null float64
19            878049 non-null float64
20            878049 non-null float64
21            878049 non-null float64
22            878049 non-null float64
23            8

In [13]:
# do same process for test data
test_data = convert_to_vector(test)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 41 columns):
0             884262 non-null float64
1             884262 non-null float64
2             884262 non-null float64
3             884262 non-null float64
4             884262 non-null float64
5             884262 non-null float64
6             884262 non-null float64
7             884262 non-null float64
8             884262 non-null float64
9             884262 non-null float64
10            884262 non-null float64
11            884262 non-null float64
12            884262 non-null float64
13            884262 non-null float64
14            884262 non-null float64
15            884262 non-null float64
16            884262 non-null float64
17            884262 non-null float64
18            884262 non-null float64
19            884262 non-null float64
20            884262 non-null float64
21            884262 non-null float64
22            884262 non-null float64
23            8

In [15]:
training, validation = train_test_split(train_data, train_size=.60)
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 526829 entries, 62117 to 861182
Data columns (total 42 columns):
0             526829 non-null float64
1             526829 non-null float64
2             526829 non-null float64
3             526829 non-null float64
4             526829 non-null float64
5             526829 non-null float64
6             526829 non-null float64
7             526829 non-null float64
8             526829 non-null float64
9             526829 non-null float64
10            526829 non-null float64
11            526829 non-null float64
12            526829 non-null float64
13            526829 non-null float64
14            526829 non-null float64
15            526829 non-null float64
16            526829 non-null float64
17            526829 non-null float64
18            526829 non-null float64
19            526829 non-null float64
20            526829 non-null float64
21            526829 non-null float64
22            526829 non-null float64
23         

In [16]:
features = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
 'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

### Using Naive Bayes -> BernoulliNB

In [16]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import log_loss

NB_model= BernoulliNB()
NB_model.fit(training[features], training['crime'])
predicted = np.array(NB_model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)

2.6140311810420531

### if using Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression(C=.01)
LR_model

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### BernoulliNB 모델에 비해서 계산 시간이 엄청나게 걸린다... 거의 30초쯤?

In [64]:
LR_model.fit(training[features], training['crime'])

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [67]:
predicted = np.array(LR_model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)

2.6221642109377497

In [6]:
# category_tab = pd.crosstab(index=train['Category'], columns='count')
# category_tab['count']

Category
ARSON                            1513
ASSAULT                         76876
BAD CHECKS                        406
BRIBERY                           289
BURGLARY                        36755
DISORDERLY CONDUCT               4320
DRIVING UNDER THE INFLUENCE      2268
DRUG/NARCOTIC                   53971
DRUNKENNESS                      4280
EMBEZZLEMENT                     1166
EXTORTION                         256
FAMILY OFFENSES                   491
FORGERY/COUNTERFEITING          10609
FRAUD                           16679
GAMBLING                          146
KIDNAPPING                       2341
LARCENY/THEFT                  174900
LIQUOR LAWS                      1903
LOITERING                        1225
MISSING PERSON                  25989
NON-CRIMINAL                    92304
OTHER OFFENSES                 126182
PORNOGRAPHY/OBSCENE MAT            22
PROSTITUTION                     7484
RECOVERED VEHICLE                3138
ROBBERY                         23000
RUN

In [7]:
# category_tab['count'] / len(train['Category']) * 100

Category
ARSON                           0.172314
ASSAULT                         8.755320
BAD CHECKS                      0.046239
BRIBERY                         0.032914
BURGLARY                        4.185985
DISORDERLY CONDUCT              0.492000
DRIVING UNDER THE INFLUENCE     0.258300
DRUG/NARCOTIC                   6.146696
DRUNKENNESS                     0.487444
EMBEZZLEMENT                    0.132794
EXTORTION                       0.029156
FAMILY OFFENSES                 0.055919
FORGERY/COUNTERFEITING          1.208247
FRAUD                           1.899552
GAMBLING                        0.016628
KIDNAPPING                      0.266614
LARCENY/THEFT                  19.919162
LIQUOR LAWS                     0.216731
LOITERING                       0.139514
MISSING PERSON                  2.959858
NON-CRIMINAL                   10.512397
OTHER OFFENSES                 14.370724
PORNOGRAPHY/OBSCENE MAT         0.002506
PROSTITUTION                    0.852344
RECOVER

### Submission Code

In [70]:
model = BernoulliNB()
model.fit(train_data[features], train_data['crime'])
predicted = model.predict_proba(test_data[features])

In [71]:
label_crime.classes_


array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [72]:
# 이 과정 진짜 오래걸림..
result = pd.DataFrame(predicted, columns=label_crime.classes_)
result.to_csv('using-bernoulliNB.csv', index=True, index_label = 'Id')

In [73]:
result.head(10)

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0.005451,0.136201,0.000127,0.000711,0.037038,0.002288,0.003024,0.038701,0.003837,0.000713,...,0.000197,0.004098,0.000395,0.042298,7e-06,0.006365,0.068189,0.087251,0.043807,0.019875
1,0.005451,0.136201,0.000127,0.000711,0.037038,0.002288,0.003024,0.038701,0.003837,0.000713,...,0.000197,0.004098,0.000395,0.042298,7e-06,0.006365,0.068189,0.087251,0.043807,0.019875
2,0.001434,0.094518,0.000178,0.000171,0.049296,0.004202,0.003837,0.032659,0.005038,0.000807,...,6.5e-05,0.00655,0.000635,0.029117,2e-06,0.006709,0.057947,0.062714,0.039502,0.007129
3,0.002421,0.13136,0.000136,0.000755,0.035027,0.002009,0.004002,0.022246,0.003368,0.000621,...,0.000224,0.004022,0.000829,0.037946,1e-06,0.004583,0.077765,0.127807,0.027799,0.014417
4,0.002421,0.13136,0.000136,0.000755,0.035027,0.002009,0.004002,0.022246,0.003368,0.000621,...,0.000224,0.004022,0.000829,0.037946,1e-06,0.004583,0.077765,0.127807,0.027799,0.014417
5,0.001893,0.099612,0.00019,0.000271,0.045245,0.002351,0.004848,0.017481,0.005772,0.000938,...,0.000183,0.003229,0.00092,0.044159,1e-06,0.005241,0.086093,0.102017,0.021517,0.008329
6,0.002421,0.13136,0.000136,0.000755,0.035027,0.002009,0.004002,0.022246,0.003368,0.000621,...,0.000224,0.004022,0.000829,0.037946,1e-06,0.004583,0.077765,0.127807,0.027799,0.014417
7,0.002421,0.13136,0.000136,0.000755,0.035027,0.002009,0.004002,0.022246,0.003368,0.000621,...,0.000224,0.004022,0.000829,0.037946,1e-06,0.004583,0.077765,0.127807,0.027799,0.014417
8,0.001211,0.115019,0.000129,0.000639,0.025699,0.010666,0.004813,0.060362,0.010406,0.000661,...,0.000251,0.00505,0.000601,0.030088,2e-06,0.007934,0.049275,0.063164,0.052253,0.011228
9,0.001293,0.096548,0.000273,0.000143,0.045293,0.005764,0.002799,0.015389,0.009128,0.001443,...,6.4e-05,0.005649,0.000697,0.03095,2e-06,0.009838,0.058118,0.049925,0.02825,0.005263


### For more improve the score, Using hour data

In [1]:
feature2 = [x for x in range(0, 24)]

In [17]:
features = features + feature2
features

['Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 'BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23]

In [18]:
model = BernoulliNB()
model.fit(train_data[features], train_data['crime'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [20]:
predicted = model.predict_proba(test_data[features])
predicted

array([[  6.92099782e-03,   1.30650586e-01,   2.23360616e-05, ...,
          1.19041100e-01,   3.91560944e-02,   2.27862556e-02],
       [  6.92099782e-03,   1.30650586e-01,   2.23360616e-05, ...,
          1.19041100e-01,   3.91560944e-02,   2.27862556e-02],
       [  1.81097001e-03,   9.02003393e-02,   3.10391571e-05, ...,
          8.51241197e-02,   3.51263391e-02,   8.13166137e-03],
       ..., 
       [  2.68025950e-03,   1.09806573e-01,   1.57461372e-03, ...,
          8.17980664e-02,   2.66547665e-02,   1.35902789e-02],
       [  6.04465553e-03,   1.14027985e-01,   1.48013572e-03, ...,
          5.59274482e-02,   4.20679684e-02,   1.87634931e-02],
       [  2.06397585e-03,   8.20078563e-02,   2.17505984e-03, ...,
          6.43040891e-02,   2.03189022e-02,   7.73265547e-03]])

In [21]:
result = pd.DataFrame(predicted, columns=label_crime.classes_)
result.to_csv('bernoulliNB-usingHour.csv', index=True, index_label = 'Id')

### 정말 2.58로 점수가 향상되었다 ㅎㅎ