In [69]:
# 旧金山犯罪分类预测问题
# https://www.kaggle.com/c/sf-crime
import time

import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB

try:
    # scikit-learn >= 0.18: train_test_split lives in model_selection
    # (the cross_validation module was removed in 0.20).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older scikit-learn releases.
    from sklearn.cross_validation import train_test_split

# Load the Kaggle SF-crime CSV files with pandas, parsing the `Dates` column as datetimes.
# Reminder on `parse_dates` (boolean, list of ints or names, list of lists, or dict; default False):
#   - boolean: True -> parse the index
#   - list of ints or names, e.g. [1, 2, 3] -> parse columns 1, 2 and 3 each as a separate date column
#   - list of lists, e.g. [[1, 3]] -> combine columns 1 and 3 and parse them as a single date column
#   - dict, e.g. {'foo': [1, 3]} -> combine columns 1 and 3 and name the resulting column "foo"
train, test = (
    pd.read_csv(csvPath, parse_dates=['Dates'])
    for csvPath in (
        './data/Kaggle_sf_crime_data/train.csv',
        './data/Kaggle_sf_crime_data/test.csv',
    )
)

In [23]:
# Preview the first row of the raw training frame.
train.head(1)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599


In [24]:
# Dates: 案发日期和时间（时间戳）
# Category: 犯罪类型，比如 Larceny/盗窃罪 等.
# Descript: 对于犯罪更详细的描述
# DayOfWeek: 星期几
# PdDistrict: 所属警区
# Resolution: 处理结果，比如说『逮捕』『逃了』
# Address: 发生街区位置
# X and Y: GPS坐标
# train.csv中的数据时间跨度为12年，包含了90w+的记录。
# 这部分数据，大部分都是『类别』型，比如犯罪类型，比如星期几。

# 我们对特征值进行处理如下：
# 星期一/Monday = 1,0,0,0,...
# 星期二/Tuesday = 0,1,0,0,...
# 星期三/Wednesday = 0,0,1,0,...

# 用pandas的get_dummies()可以直接拿到这样的一个二值化的01向量。scikit-learn的preprocessing模块里还有一个很有用的类LabelEncoder，可以用于对类别编号。


# 对于已有的数据特征，我们打算做下面的粗略变换：
# 用LabelEncoder对犯罪类型做编号；
# 处理时间，在我看来，也许犯罪发生的时间点(小时)是非常重要的，因此我们会用Pandas把这部分数据抽出来；
# 对街区，星期几，时间点用get_dummies()因子化；
# 做一些组合特征，比如把上述三个feature拼在一起，再因子化一下；

In [27]:
# Encode each crime category string as an integer class id.
# The encoder is fitted first and then applied to the same column.
leCrime = preprocessing.LabelEncoder().fit(train.Category)
crime = leCrime.transform(train.Category)

array([37])

In [33]:
# One-hot encode the day-of-week, police-district and hour-of-day features
# of the training data.
days, district = (
    pd.get_dummies(train[columnName]) for columnName in ('DayOfWeek', 'PdDistrict')
)
hour = pd.get_dummies(train.Dates.dt.hour)

In [35]:
# Combine the feature groups column-wise and append the encoded crime label
# as the last column.
trainData = pd.concat([hour, days, district], axis=1).assign(crime=crime)

In [40]:
# Apply the same feature processing to the test data.
#
# The frame is built in a single expression on purpose: `days`, `district`
# and `hour` hold the *training* dummies that `trainData` is assembled from.
# Rebinding them to test-set dummies here would make a later re-run of the
# `trainData` cell silently build the training features from test rows.
testData = pd.concat(
    [
        pd.get_dummies(test.Dates.dt.hour),   # hour-of-day columns
        pd.get_dummies(test.DayOfWeek),       # weekday columns
        pd.get_dummies(test.PdDistrict),      # police-district columns
    ],
    axis=1,
)

In [60]:
# Preview the first row of the engineered test features.
testData.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [47]:
# List each distinct crime category once, in order of first appearance.
train['Category'].drop_duplicates()

0                            WARRANTS
1                      OTHER OFFENSES
3                       LARCENY/THEFT
6                       VEHICLE THEFT
12                          VANDALISM
14                       NON-CRIMINAL
16                            ROBBERY
17                            ASSAULT
38                        WEAPON LAWS
49                           BURGLARY
54                     SUSPICIOUS OCC
61                        DRUNKENNESS
102            FORGERY/COUNTERFEITING
107                     DRUG/NARCOTIC
110                   STOLEN PROPERTY
111                   SECONDARY CODES
123                          TRESPASS
148                    MISSING PERSON
238                             FRAUD
242                        KIDNAPPING
280                           RUNAWAY
351       DRIVING UNDER THE INFLUENCE
375             SEX OFFENSES FORCIBLE
426                      PROSTITUTION
591                DISORDERLY CONDUCT
661                             ARSON
811         

In [48]:
# 我们可以快速地筛出一部分重要的特征，搭建一个baseline系统，再考虑步步优化。
# 比如我们这里简单一点，就只取星期几和街区作为分类器输入特征，我们用scikit-learn
# 中的train_test_split函数拿到训练集和交叉验证集，用朴素贝叶斯和逻辑回归都建立模型，
# 对比一下它们的表现：

In [49]:
# Baseline feature set: only the weekday and police-district indicator columns.
features = (
    # weekday indicators
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # police-district indicators
    + ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
       'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
)

In [50]:
# Split the data into a training set (3/5) and a held-out validation set (2/5).
# A fixed random_state makes the split, and therefore the log-loss figures
# reported below, reproducible across kernel restarts.
training, validation = train_test_split(trainData, train_size=.60, random_state=42)

In [57]:
# Preview the first row of the training split.
training.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,crime
123474,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,20


In [58]:
# Preview the first row of the validation split.
validation.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,crime
194402,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,21


In [52]:
# Naive Bayes baseline: fit on the training split, time the fit, and report
# the multi-class log-loss on the validation split.
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
# predict_proba already returns an ndarray; its columns follow model.classes_.
predicted = model.predict_proba(validation[features])
print("朴素贝叶斯建模耗时 %f 秒" % nbCostTime)
# Pass the classes seen during fitting explicitly: a rare crime type can be
# missing from the validation split, in which case log_loss cannot infer the
# column-to-label mapping from y_true alone and raises.
print("朴素贝叶斯log损失为 %f" % log_loss(validation['crime'], predicted, labels=model.classes_))

朴素贝叶斯建模耗时 1.126662 秒
朴素贝叶斯log损失为 2.614267


In [53]:
# Logistic-regression baseline: fit on the training split, time the fit, and
# report the multi-class log-loss on the validation split.
model = LogisticRegression(C = .01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
# predict_proba already returns an ndarray; its columns follow model.classes_.
predicted = model.predict_proba(validation[features])
print("逻辑回归建模耗时 %f 秒" % lrCostTime)
# The log-loss is computed once (the original also evaluated it a second time
# and discarded the result). Passing labels keeps the probability columns
# aligned with the classes seen during fitting even when a rare crime type is
# missing from the validation split.
print("逻辑回归log损失为 %f" % log_loss(validation['crime'], predicted, labels=model.classes_))

逻辑回归建模耗时 75.901641 秒
逻辑回归log损失为 2.621046


In [61]:
# Preview the encoded label of the first training row.
training['crime'].head(1)

123474    20
Name: crime, dtype: int64

In [62]:
# 目前的特征和参数设定下，朴素贝叶斯的log损失还低一些，另外我们可以明显看到，
# 朴素贝叶斯建模消耗的时间远小于逻辑回归建模

In [None]:
# 考虑到犯罪类型可能和犯罪事件发生的小时时间点相关，我们加入小时时间点特征再次建模

In [63]:
# Second experiment: add the hour of day at which the crime occurred to the
# weekday and police-district features, then re-fit both models.
features = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

# The hour dummies are stored under the integer column names 0..23.
hourFea = list(range(24))
features = features + hourFea

# Split into a training set (3/5) and a held-out validation set (2/5).
# A fixed random_state makes the split, and the log-loss figures printed
# below, reproducible across kernel restarts.
training, validation = train_test_split(trainData, train_size=.60, random_state=42)

# Naive Bayes: fit, time the fit, and report the validation log-loss.
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
# predict_proba already returns an ndarray; its columns follow model.classes_.
predicted = model.predict_proba(validation[features])
print("朴素贝叶斯建模耗时 %f 秒" % nbCostTime)
# labels=model.classes_ keeps the probability columns aligned with the classes
# seen during fitting even if a rare crime type is missing from the
# validation split (log_loss would otherwise raise).
print("朴素贝叶斯log损失为 %f" % log_loss(validation['crime'], predicted, labels=model.classes_))

# Logistic regression: fit, time the fit, and report the validation log-loss.
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = model.predict_proba(validation[features])
print("逻辑回归建模耗时 %f 秒" % lrCostTime)
# Computed once; the original evaluated log_loss a second time and discarded it.
print("逻辑回归log损失为 %f" % log_loss(validation['crime'], predicted, labels=model.classes_))

朴素贝叶斯建模耗时 19.646911 秒
朴素贝叶斯log损失为 2.586421
逻辑回归建模耗时 90.872917 秒
逻辑回归log损失为 2.595009


In [None]:
# 可以看到在这三个类别特征下，朴素贝叶斯相对于逻辑回归，依旧有一定的优势(log损失更小)，同时训练时间很短，
# 这意味着模型虽然简单，但是效果依旧强大。