# 1. Import

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
from sklearn.cross_validation import train_test_split



In [4]:
from sklearn.metrics import f1_score

# 2. Functions

In [5]:
# DecisionTreeClassifier

def function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test):
    
    # fit
    dec_tree = DecisionTreeClassifier()
    dec_tree = dec_tree.fit(X_train, Y_train)

    # predict
    dec_tree_pred = dec_tree.predict(X_test)
    
    # score
    dec_tree_score = f1_score(Y_test, dec_tree_pred, average=None)
    dec_tree_score_micro = f1_score(Y_test, dec_tree_pred, average='micro')
    
    return dec_tree_score, dec_tree_score_micro

In [6]:
# ExtraTreeClassifier

def function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test):
    
    # fit
    ext_tree = ExtraTreeClassifier()
    ext_tree = ext_tree.fit(X_train, Y_train)

    # predict
    ext_tree_pred = ext_tree.predict(X_test)
    
    # score
    ext_tree_score = f1_score(Y_test, ext_tree_pred, average=None)
    ext_tree_score_micro = f1_score(Y_test, ext_tree_pred, average='micro')
    
    return ext_tree_score, ext_tree_score_micro

In [7]:
# RandomForestClassifier

def function_RandomForestClassifier(X_train, Y_train, X_test, Y_test):
    
    # fit
    ran_for = RandomForestClassifier()
    ran_for = ran_for.fit(X_train, Y_train)

    # predict
    ran_for_pred = ran_for.predict(X_test)
    
    # score
    ran_for_score = f1_score(Y_test, ran_for_pred, average=None)
    ran_for_score_micro = f1_score(Y_test, ran_for_pred, average='micro')
    
    return ran_for_score, ran_for_score_micro

In [8]:
# LGBMClassifier

def function_LGBMClassifier(X_train, Y_train, X_test, Y_test):
    
    # fit
    lgbm = LGBMClassifier()
    lgbm = lgbm.fit(X_train, Y_train)

    # predict
    lgbm_pred = lgbm.predict(X_test)
    
    # score
    lgbm_score = f1_score(Y_test, lgbm_pred, average=None)
    lgbm_score_micro = f1_score(Y_test, lgbm_pred, average='micro')
    
    return lgbm_score, lgbm_score_micro

In [9]:
# BernoulliNB

def function_BernoulliNB(X_train, Y_train, X_test, Y_test):
    
    # fit
    bernoulli = BernoulliNB()
    bernoulli = bernoulli.fit(X_train, Y_train)

    # predict
    bernoulli_pred = bernoulli.predict(X_test)
    
    # score
    bernoulli_score = f1_score(Y_test, bernoulli_pred, average=None)
    bernoulli_score_micro = f1_score(Y_test, bernoulli_pred, average='micro')
    
    return bernoulli_score, bernoulli_score_micro

In [10]:
# KNeighborsClassifier

def function_KNeighborsClassifier(X_train, Y_train, X_test, Y_test):
    
    # fit
    kn = KNeighborsClassifier()
    kn = kn.fit(X_train, Y_train)

    # predict
    kn_pred = kn.predict(X_test)
    
    # score
    kn_score = f1_score(Y_test, kn_pred, average=None)
    kn_score_micro = f1_score(Y_test, kn_pred, average='micro')
    
    return kn_score, kn_score_micro

In [11]:
# GaussianNB

def function_GaussianNB(X_train, Y_train, X_test, Y_test):
    
    # fit
    gaus = GaussianNB()
    gaus = gaus.fit(X_train, Y_train)

    # predict
    gaus_pred = gaus.predict(X_test)
    
    # score
    gaus_score = f1_score(Y_test, gaus_pred, average=None)
    gaus_score_micro = f1_score(Y_test, gaus_pred, average='micro')
    
    return gaus_score, gaus_score_micro

# 3. Data

In [12]:
df = pd.read_csv('crime-boston_1.csv')

In [13]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DAY_OF_WEEK,DISTRICT,HOUR,Lat,Long,MONTH,OCCURRED_ON_DATE,OFFENSE_CODE_GROUP,REPORTING_AREA,SHOOTING,UCR_PART,YEAR,Day,Night,ToNight,ToDay
0,0,0,Sunday,2.0,6,42.346381,-71.103794,7,2012-07-08 06:00:00,Residential Burglary,629.0,0.0,1.0,2012,1,0,16,0
1,1,1,Sunday,1.0,6,42.316841,-71.074585,7,2012-07-08 06:03:00,Aggravated Assault,327.0,1.0,1.0,2012,1,0,16,0
2,2,2,Sunday,2.0,6,42.342841,-71.09699,7,2012-07-08 06:26:00,Robbery,625.0,0.0,1.0,2012,1,0,16,0
3,3,3,Sunday,1.0,6,42.316441,-71.065829,7,2012-07-08 06:56:00,Other,258.0,0.0,1.0,2012,1,0,16,0
4,4,4,Sunday,9.0,7,42.270516,-71.1199,7,2012-07-08 07:15:00,Robbery,496.0,0.0,1.0,2012,1,0,15,0


In [14]:
df = df.drop('Unnamed: 0',1)
df = df.drop('Unnamed: 0.1',1)

In [15]:
df_model = df[[
    'DISTRICT',
    'MONTH',
    'DAY_OF_WEEK',
    'HOUR',
    'UCR_PART',
    'Lat',
    'Long',
    'REPORTING_AREA',
    'Day',
    'Night',
    'ToDay',
    'ToNight'
]]

In [16]:
# DAY_OF_WEEK

df_model['DAY_OF_WEEK'] = df_model['DAY_OF_WEEK'].map({
    'Tuesday':2, 
    'Saturday':6, 
    'Monday':1, 
    'Sunday':7, 
    'Thursday':4, 
    'Wednesday':3,
    'Friday':5
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [17]:
df_model['UCR_PART'].unique()

array([  1.,   2.,   3.,   4.,  nan])

In [18]:
df_model['UCR_PART'].value_counts()

3.0    242191
2.0    192276
1.0    124036
4.0     16630
Name: UCR_PART, dtype: int64

In [19]:
df_model = df_model.loc[(df_model.UCR_PART == 1) | (df_model.UCR_PART == 2) | (df_model.UCR_PART == 3)]

In [20]:
df_model = df_model.dropna()

print(df_model.shape)
df_model.isnull().sum()

(535765, 12)


DISTRICT          0
MONTH             0
DAY_OF_WEEK       0
HOUR              0
UCR_PART          0
Lat               0
Long              0
REPORTING_AREA    0
Day               0
Night             0
ToDay             0
ToNight           0
dtype: int64

In [21]:
x = df_model[[
    'DISTRICT',
    'REPORTING_AREA', 
    'MONTH',
    'DAY_OF_WEEK',
    'HOUR',
    'Lat',
    'Long',
    'Day',
    'Night',
    'ToDay',
    'ToNight'
]]
y = df_model['UCR_PART']

# 4. Split data

In [22]:
# Split dataframe into random train and test subsets

X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y, 
    test_size = 0.1,
    random_state=42
)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(482188, 11) (482188,)
(53577, 11) (53577,)


# 5. Modeling

X - 'DISTRICT', 'REPORTING_AREA', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'Lat', 'Long', 'Day', 'Night', 'ToDay', 'ToNight'

Y - UCR_PART

In [23]:
function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.32470175,  0.42248641,  0.48664049]), 0.42630233122421934)

In [24]:
function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.29587036,  0.40789333,  0.47163113]), 0.40853351251469849)

In [25]:
function_RandomForestClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.31877172,  0.427429  ,  0.50495896]), 0.43822909084121919)

In [26]:
function_LGBMClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.21774295,  0.35460945,  0.59430236]), 0.47259085055154265)

In [27]:
function_BernoulliNB(X_train, Y_train, X_test, Y_test)

  'precision', 'predicted', average, warn_for)


(array([ 0.        ,  0.26398692,  0.58987558]), 0.44061817570972617)

In [28]:
function_KNeighborsClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.31590751,  0.41177696,  0.45375251]), 0.40543516807585345)

In [29]:
function_GaussianNB(X_train, Y_train, X_test, Y_test)

(array([ 0.24142313,  0.05317789,  0.56371384]), 0.40188887022416336)