# 1. Import

In [None]:
import numpy as np
import pandas as pd

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier

from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [63]:
try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import train_test_split

In [64]:
from sklearn.metrics import f1_score

# 2. Functions

In [65]:
# DecisionTreeClassifier

def function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test, random_state=None):
    """Fit a DecisionTreeClassifier and score it with F1 on the held-out split.

    Parameters
    ----------
    X_train, Y_train
        Features and labels passed to ``fit``.
    X_test, Y_test
        Features passed to ``predict`` and the true labels the predictions
        are scored against.
    random_state : optional
        Forwarded to the ``DecisionTreeClassifier`` constructor. The default
        ``None`` leaves the estimator unseeded, exactly as before; pass an
        int to make the fitted tree (and so the scores) repeatable.

    Returns
    -------
    tuple
        ``(per_class_f1, micro_f1)``: the ``f1_score`` result with
        ``average=None`` followed by the result with ``average='micro'``.
    """
    # fit
    dec_tree = DecisionTreeClassifier(random_state=random_state)
    dec_tree = dec_tree.fit(X_train, Y_train)

    # predict
    dec_tree_pred = dec_tree.predict(X_test)

    # score
    dec_tree_score = f1_score(Y_test, dec_tree_pred, average=None)
    dec_tree_score_micro = f1_score(Y_test, dec_tree_pred, average='micro')

    return dec_tree_score, dec_tree_score_micro

In [66]:
# ExtraTreeClassifier

def function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test, random_state=None):
    """Fit an ExtraTreeClassifier and score it with F1 on the held-out split.

    Parameters
    ----------
    X_train, Y_train
        Features and labels passed to ``fit``.
    X_test, Y_test
        Features passed to ``predict`` and the true labels the predictions
        are scored against.
    random_state : optional
        Forwarded to the ``ExtraTreeClassifier`` constructor. The default
        ``None`` leaves the estimator unseeded, exactly as before; pass an
        int to make the fitted tree (and so the scores) repeatable.

    Returns
    -------
    tuple
        ``(per_class_f1, micro_f1)``: the ``f1_score`` result with
        ``average=None`` followed by the result with ``average='micro'``.
    """
    # fit
    ext_tree = ExtraTreeClassifier(random_state=random_state)
    ext_tree = ext_tree.fit(X_train, Y_train)

    # predict
    ext_tree_pred = ext_tree.predict(X_test)

    # score
    ext_tree_score = f1_score(Y_test, ext_tree_pred, average=None)
    ext_tree_score_micro = f1_score(Y_test, ext_tree_pred, average='micro')

    return ext_tree_score, ext_tree_score_micro

In [67]:
# RandomForestClassifier

def function_RandomForestClassifier(X_train, Y_train, X_test, Y_test, random_state=None):
    """Fit a RandomForestClassifier and score it with F1 on the held-out split.

    Parameters
    ----------
    X_train, Y_train
        Features and labels passed to ``fit``.
    X_test, Y_test
        Features passed to ``predict`` and the true labels the predictions
        are scored against.
    random_state : optional
        Forwarded to the ``RandomForestClassifier`` constructor. The default
        ``None`` leaves the estimator unseeded, exactly as before; pass an
        int to make the fitted forest (and so the scores) repeatable.

    Returns
    -------
    tuple
        ``(per_class_f1, micro_f1)``: the ``f1_score`` result with
        ``average=None`` followed by the result with ``average='micro'``.
    """
    # fit
    ran_for = RandomForestClassifier(random_state=random_state)
    ran_for = ran_for.fit(X_train, Y_train)

    # predict
    ran_for_pred = ran_for.predict(X_test)

    # score
    ran_for_score = f1_score(Y_test, ran_for_pred, average=None)
    ran_for_score_micro = f1_score(Y_test, ran_for_pred, average='micro')

    return ran_for_score, ran_for_score_micro

In [68]:
# LGBMClassifier

def function_LGBMClassifier(X_train, Y_train, X_test, Y_test):
    """Train an LGBMClassifier built with no arguments and report its F1 scores.

    The model is fitted on ``(X_train, Y_train)``, asked to predict
    ``X_test``, and the predictions are compared against ``Y_test``.

    Returns a 2-tuple: ``f1_score(..., average=None)`` first, then
    ``f1_score(..., average='micro')``.
    """
    model = LGBMClassifier().fit(X_train, Y_train)
    predicted = model.predict(X_test)

    return (
        f1_score(Y_test, predicted, average=None),
        f1_score(Y_test, predicted, average='micro'),
    )

In [69]:
# BernoulliNB

def function_BernoulliNB(X_train, Y_train, X_test, Y_test):
    """Train a BernoulliNB built with no arguments and report its F1 scores.

    Fits on the training pair, predicts the test features, then scores the
    predictions against ``Y_test`` twice.

    Returns ``(per_class_f1, micro_f1)``, i.e. the ``average=None`` result
    followed by the ``average='micro'`` result of ``f1_score``.
    """
    fitted = BernoulliNB().fit(X_train, Y_train)
    y_hat = fitted.predict(X_test)

    # Evaluate in the same order as the returned tuple: per-class, then micro.
    per_class_f1, micro_f1 = (
        f1_score(Y_test, y_hat, average=mode) for mode in (None, 'micro')
    )
    return per_class_f1, micro_f1

In [70]:
# KNeighborsClassifier

def function_KNeighborsClassifier(X_train, Y_train, X_test, Y_test):
    """Train a KNeighborsClassifier built with no arguments and report F1 scores.

    Fits on ``(X_train, Y_train)``, predicts ``X_test`` and scores the
    predictions against ``Y_test``.

    Returns ``(per_class_f1, micro_f1)``: ``f1_score`` with ``average=None``
    and with ``average='micro'``, in that order.
    """
    classifier = KNeighborsClassifier()
    classifier = classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)

    scores = []
    for averaging in (None, 'micro'):
        scores.append(f1_score(Y_test, predictions, average=averaging))
    return tuple(scores)

In [71]:
# GaussianNB

def function_GaussianNB(X_train, Y_train, X_test, Y_test):
    """Train a GaussianNB built with no arguments and report its F1 scores.

    Fits on the training pair, predicts the test features and scores the
    predictions against ``Y_test``.

    Returns ``(per_class_f1, micro_f1)``: ``f1_score`` with ``average=None``
    and with ``average='micro'``, in that order.
    """
    estimator = GaussianNB().fit(X_train, Y_train)
    guesses = estimator.predict(X_test)

    per_class_f1 = f1_score(Y_test, guesses, average=None)
    micro_f1 = f1_score(Y_test, guesses, average='micro')
    return per_class_f1, micro_f1

# 3. Data

In [72]:
# Load the crime records from 'crimes.csv' (path is relative to the working directory).
df = pd.read_csv('crimes.csv')

In [73]:
# Inspect the column names of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'DAY_OF_WEEK', 'DISTRICT', 'HOUR', 'Lat', 'Long', 'MONTH',
       'OCCURRED_ON_DATE', 'OFFENSE_CODE_GROUP', 'REPORTING_AREA', 'SHOOTING',
       'UCR_PART', 'YEAR', 'Day', 'Night', 'ToNight', 'ToDay', 'DayNumber',
       'precipitation', 'temperature', 'dewpoint', 'humidity', 'wind'],
      dtype='object')

In [74]:
# Keep only the columns used for modelling: the 16 features plus the target
# 'UCR_PART'. Compared with the loaded frame this leaves out 'Unnamed: 0',
# 'OCCURRED_ON_DATE', 'OFFENSE_CODE_GROUP', 'SHOOTING', 'YEAR' and 'DayNumber'.
df = df[[
    'DAY_OF_WEEK', 
    'DISTRICT', 
    'HOUR', 
    'Lat', 
    'Long', 
    'MONTH',
    'REPORTING_AREA',
    'UCR_PART',
    'Day', 
    'Night', 
    'ToNight', 
    'ToDay', 
    'precipitation', 
    'temperature', 
    'dewpoint', 
    'humidity', 
    'wind'
]]

In [75]:
# Confirm the reduced column set after the selection above.
df.columns

Index(['DAY_OF_WEEK', 'DISTRICT', 'HOUR', 'Lat', 'Long', 'MONTH',
       'REPORTING_AREA', 'UCR_PART', 'Day', 'Night', 'ToNight', 'ToDay',
       'precipitation', 'temperature', 'dewpoint', 'humidity', 'wind'],
      dtype='object')

In [76]:
# DAY_OF_WEEK: replace weekday names with numbers, Monday=1 through Sunday=7.

df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].map({
    day_name: day_number
    for day_number, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
})

# Display the distinct codes present after the mapping.
df['DAY_OF_WEEK'].unique()

array([7, 1, 2, 3, 5, 4, 6])

In [77]:
# List the distinct UCR_PART values present before any filtering.
df.UCR_PART.unique()

array([ 1.,  2.,  3.,  4.])

In [78]:
# Keep only the rows whose UCR_PART is 1, 2 or 3.
df = df.loc[df.UCR_PART.isin([1, 2, 3])]

In [79]:
# Count the missing values in each column.
df.isnull().sum()

DAY_OF_WEEK         0
DISTRICT           94
HOUR                0
Lat               733
Long              733
MONTH               0
REPORTING_AREA    589
UCR_PART            0
Day                 0
Night               0
ToNight             0
ToDay               0
precipitation       0
temperature         0
dewpoint            0
humidity            0
wind                0
dtype: int64

In [80]:
# Discard the rows that contain missing values (DISTRICT, Lat, Long and
# REPORTING_AREA are the columns with nulls in the count above).
df = df.dropna()

In [81]:
# Feature matrix: every remaining column except the target 'UCR_PART'.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it as a positional argument to DataFrame.drop.
x = df.drop('UCR_PART', axis=1)

In [82]:
# Confirm that no missing values remain in the feature matrix.
x.isnull().sum()

DAY_OF_WEEK       0
DISTRICT          0
HOUR              0
Lat               0
Long              0
MONTH             0
REPORTING_AREA    0
Day               0
Night             0
ToNight           0
ToDay             0
precipitation     0
temperature       0
dewpoint          0
humidity          0
wind              0
dtype: int64

In [83]:
# Target vector: the UCR_PART column of the cleaned frame.
y = df.UCR_PART

# 4. Split data

In [84]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.1, random_state=42)

# Report (features shape, labels shape) for the train split, then the test split.
print(*(part.shape for part in (X_train, Y_train)))
print(*(part.shape for part in (X_test, Y_test)))

(234279, 16) (234279,)
(26031, 16) (26031,)


# 5. Modeling 

X -> 'DAY_OF_WEEK', 
    'DISTRICT', 
    'HOUR', 
    'Lat', 
    'Long', 
    'MONTH',
    'REPORTING_AREA',
    'Day', 
    'Night', 
    'ToNight', 
    'ToDay', 
    'precipitation', 
    'temperature', 
    'dewpoint', 
    'humidity', 
    'wind'
    
(16 features; 'UCR_PART' is not part of X because it is the target.)

Y -> 'UCR_PART'

In [85]:
# Decision tree: displays (per-class F1 array, micro-averaged F1) on the test split.
function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.33415343,  0.43678956,  0.40043443]), 0.39748761092543505)

In [86]:
# Extra tree: displays (per-class F1 array, micro-averaged F1) on the test split.
function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.31425815,  0.4189911 ,  0.38706516]), 0.38062310322308018)

In [87]:
# Random forest: displays (per-class F1 array, micro-averaged F1) on the test split.
function_RandomForestClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.35501773,  0.46726799,  0.39408419]), 0.41458261303830046)

In [88]:
# LightGBM: displays (per-class F1 array, micro-averaged F1) on the test split.
function_LGBMClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.30267247,  0.53409974,  0.39327731]), 0.44612193154316004)

In [89]:
# Bernoulli naive Bayes: displays (per-class F1 array, micro-averaged F1) on the test split.
# NOTE(review): the recorded output shows a metric warning and an F1 of 0 for the first
# class — presumably that class is never predicted; confirm before relying on this score.
function_BernoulliNB(X_train, Y_train, X_test, Y_test)

  'precision', 'predicted', average, warn_for)


(array([ 0.        ,  0.45718402,  0.42622651]), 0.38630863201567367)

In [90]:
# k-nearest neighbours: displays (per-class F1 array, micro-averaged F1) on the test split.
function_KNeighborsClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.33083322,  0.44188209,  0.33144512]), 0.37689677691982637)

In [91]:
# Gaussian naive Bayes: displays (per-class F1 array, micro-averaged F1) on the test split.
function_GaussianNB(X_train, Y_train, X_test, Y_test)

(array([ 0.38760984,  0.29073707,  0.34658524]), 0.34585686297107293)