# Baseline script for San Francisco Crime Classification

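Baseline script: it one-hot encodes the `DayOfWeek` and `PdDistrict` columns, splits `Dates` into numeric components, and scores a Bernoulli Naive Bayes classifier with 6-fold stratified cross-validation on log loss. Hope this helps.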

In [1]:
import numpy as np
import pandas as pd

## Load datasets

In [2]:
# Read the training set (parsing "Dates" as timestamps) and discard the
# columns this baseline does not use, all in one chained expression.
train = pd.read_csv("../data/train.csv", parse_dates=["Dates"]).drop(
    ["Address", "Resolution", "Descript"], axis=1
)

# Quick sanity check of the loaded frame.
print(train.shape)
train.head(3)

(878049, 6)


Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414


## Feature Engineering

### DayOfWeek

In [3]:
# Inspect the distinct weekday labels before encoding them.
print(train["DayOfWeek"].unique())

# One-hot encode the weekday: one boolean column per distinct value.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24 (it now raises AttributeError).
day_of_week_dataframe = pd.get_dummies(train["DayOfWeek"], prefix="DayOfWeek").astype(bool)
print(day_of_week_dataframe.shape)
day_of_week_dataframe.head(3)

['Wednesday' 'Tuesday' 'Monday' 'Sunday' 'Saturday' 'Friday' 'Thursday']
(878049, 7)


Unnamed: 0,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,False,False,False,False,False,False,True
1,False,False,False,False,False,False,True
2,False,False,False,False,False,False,True


In [4]:
# Swap the raw "DayOfWeek" column for its one-hot encoded columns, which are
# appended on the right of the remaining columns.
train = pd.concat([train.drop("DayOfWeek", axis=1), day_of_week_dataframe], axis=1)

print(train.shape)
train.head(3)

(878049, 12)


Unnamed: 0,Dates,Category,PdDistrict,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,-122.425892,37.774599,False,False,False,False,False,False,True
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,-122.425892,37.774599,False,False,False,False,False,False,True
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,-122.424363,37.800414,False,False,False,False,False,False,True


### PdDistrict

In [5]:
# Inspect the distinct police-district labels before encoding them.
print(train["PdDistrict"].unique())

# One-hot encode the district: one boolean column per distinct value.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24 (it now raises AttributeError).
pd_district_dataframe = pd.get_dummies(train["PdDistrict"], prefix="PdDistrict").astype(bool)

print(pd_district_dataframe.shape)
pd_district_dataframe.head(3)

['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
 'TENDERLOIN' 'MISSION' 'SOUTHERN']
(878049, 10)


Unnamed: 0,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,False,False,False,False,True,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False


In [6]:
# Swap the raw "PdDistrict" column for its one-hot encoded columns, which are
# appended on the right of the remaining columns.
train = pd.concat([train.drop("PdDistrict", axis=1), pd_district_dataframe], axis=1)

print(train.shape)
train.head(3)

(878049, 21)


Unnamed: 0,Dates,Category,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,2015-05-13 23:53:00,WARRANTS,-122.425892,37.774599,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2015-05-13 23:53:00,OTHER OFFENSES,-122.425892,37.774599,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,2015-05-13 23:33:00,OTHER OFFENSES,-122.424363,37.800414,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


### Dates

In [7]:
# Expand the timestamp into one "Dates_<component>" column per component,
# added in the order listed here.
for part in ("year", "month", "day", "hour", "minute", "second"):
    train["Dates_" + part] = getattr(train["Dates"].dt, part)

# The raw timestamp is no longer needed once its components are extracted.
train = train.drop("Dates", axis=1)

print(train.shape)
train.head(3)

(878049, 26)


Unnamed: 0,Category,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Dates_year,Dates_month,Dates_day,Dates_hour,Dates_minute,Dates_second
0,WARRANTS,-122.425892,37.774599,False,False,False,False,False,False,True,...,False,False,False,False,2015,5,13,23,53,0
1,OTHER OFFENSES,-122.425892,37.774599,False,False,False,False,False,False,True,...,False,False,False,False,2015,5,13,23,53,0
2,OTHER OFFENSES,-122.424363,37.800414,False,False,False,False,False,False,True,...,False,False,False,False,2015,5,13,23,33,0


## Score

In [8]:
# The target column; every other column of `train` is used as a feature.
label_name = "Category"

# Index.difference yields the remaining column names in sorted order, which is
# why the feature matrix below is ordered alphabetically.
feature_names = train.columns.difference([label_name])

X_train = train.loc[:, feature_names]

print(X_train.shape)
X_train.head(3)

(878049, 25)


Unnamed: 0,Dates_day,Dates_hour,Dates_minute,Dates_month,Dates_second,Dates_year,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,...,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,X,Y
0,13,23,53,5,0,2015,False,False,False,False,...,False,False,True,False,False,False,False,False,-122.425892,37.774599
1,13,23,53,5,0,2015,False,False,False,False,...,False,False,True,False,False,False,False,False,-122.425892,37.774599
2,13,23,33,5,0,2015,False,False,False,False,...,False,False,True,False,False,False,False,False,-122.424363,37.800414


In [9]:
# Target vector: the crime category of every training row.
y_train = train.loc[:, label_name]

print(y_train.shape)
y_train.head(3)

(878049,)


0          WARRANTS
1    OTHER OFFENSES
2    OTHER OFFENSES
Name: Category, dtype: object

### Evaluate using Naive Bayes

In [10]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import cross_val_score, StratifiedKFold

kfold = StratifiedKFold(y_train, n_folds=6)

model = BernoulliNB()
%time score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='log_loss').mean()
score = -1.0 * score

print("Use BernoulliNB. Score = {0:.6f}".format(score))

CPU times: user 56.4 s, sys: 4.8 s, total: 1min 1s
Wall time: 49.9 s
Use BernoulliNB. Score = 2.564320
