# Imports

In [77]:
import pandas as pd
import numpy as np

In [78]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. Fall back to the old
# location only when running on a release that predates the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [80]:
from sklearn.metrics import f1_score

# Data

In [81]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('labels-dataset.csv')

In [82]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written into the CSV).
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it positionally,
# and errors='ignore' lets this cell be re-run after the column is already gone.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [83]:
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,DAY_OF_WEEK,DISTRICT,HOUR,MONTH,Day,Night,ToNight,ToDay,temperatureMin,temperatureMax,...,snow,Motor_Vehicle_Accident_Response,Larceny,Medical_Assistance,Simple_Assault,Violations,Investigate_Person,Vandalism,Drug_Violation,Larceny_From_Motor_Vehicle
0,7.0,4.0,7.0,7.0,1.0,0.0,15.0,0.0,30.0,45.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,7.0,7.0,7.0,7.0,1.0,0.0,15.0,0.0,30.0,45.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.0,3.0,8.0,7.0,1.0,0.0,14.0,0.0,30.0,45.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,7.0,9.0,8.0,7.0,1.0,0.0,14.0,0.0,30.0,45.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,4.0,9.0,7.0,1.0,0.0,13.0,0.0,30.0,45.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [84]:
# Count the missing values in each column.
df.isnull().sum()

DAY_OF_WEEK                            0
DISTRICT                               0
HOUR                                   0
MONTH                                  0
Day                                    0
Night                                  0
ToNight                                0
ToDay                                  0
temperatureMin                         0
temperatureMax                         0
temperatureDifference                  0
precipitation                      26959
snow                               21268
Motor_Vehicle_Accident_Response        0
Larceny                                0
Medical_Assistance                     0
Simple_Assault                         0
Violations                             0
Investigate_Person                     0
Vandalism                              0
Drug_Violation                         0
Larceny_From_Motor_Vehicle             0
dtype: int64

In [85]:
# Replace every NaN in the frame with 0.
# NOTE(review): this treats a missing value as "none recorded" — confirm that is the
# intended meaning for the columns that contain nulls.
df = df.fillna(0)

# Features (X) & labels (Y)

In [86]:
# List the available column names; the feature and label selections are made from these.
df.columns

Index(['DAY_OF_WEEK', 'DISTRICT', 'HOUR', 'MONTH', 'Day', 'Night', 'ToNight',
       'ToDay', 'temperatureMin', 'temperatureMax', 'temperatureDifference',
       'precipitation', 'snow', 'Motor_Vehicle_Accident_Response', 'Larceny',
       'Medical_Assistance', 'Simple_Assault', 'Violations',
       'Investigate_Person', 'Vandalism', 'Drug_Violation',
       'Larceny_From_Motor_Vehicle'],
      dtype='object')

In [87]:
# Feature matrix: the 13 input columns handed to the classifiers.
X = df.loc[:, [
    'DAY_OF_WEEK', 'DISTRICT', 'HOUR', 'MONTH',
    'Day', 'Night', 'ToNight', 'ToDay',
    'temperatureMin', 'temperatureMax', 'temperatureDifference',
    'precipitation', 'snow',
]]

In [88]:
# Target matrix: the 9 label columns, kept together so one model predicts all of them.
Y = df.loc[:, [
    'Motor_Vehicle_Accident_Response', 'Larceny', 'Medical_Assistance',
    'Simple_Assault', 'Violations', 'Investigate_Person',
    'Vandalism', 'Drug_Violation', 'Larceny_From_Motor_Vehicle',
]]

In [89]:
# Hold out 10% of the rows as a test set; the fixed seed makes the random split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Show (rows, columns) of the feature and label frames: train first, then test.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(200634, 13) (200634, 9)
(22293, 13) (22293, 9)


# Functions

In [90]:
# DecisionTreeClassifier

def function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test, random_state=42):
    """Fit a decision tree on the training split and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and the matching label columns.
    X_test, Y_test
        Held-out features and label columns used only for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``DecisionTreeClassifier`` so repeated runs give the
        same tree and the same scores. Pass ``None`` for an unseeded estimator.

    Returns
    -------
    tuple
        ``(per_label_f1, micro_f1)``: the F1 score of each label column
        (``average=None``) and the micro-averaged F1 over all labels.
    """
    # Seed the estimator so the reported scores are reproducible across runs.
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, Y_train)

    predictions = model.predict(X_test)

    per_label_f1 = f1_score(Y_test, predictions, average=None)
    micro_f1 = f1_score(Y_test, predictions, average='micro')

    return per_label_f1, micro_f1

In [91]:
# ExtraTreeClassifier

def function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test, random_state=42):
    """Fit an extremely randomized tree on the training split and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and the matching label columns.
    X_test, Y_test
        Held-out features and label columns used only for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``ExtraTreeClassifier`` so repeated runs give the
        same tree and the same scores. Pass ``None`` for an unseeded estimator.

    Returns
    -------
    tuple
        ``(per_label_f1, micro_f1)``: the F1 score of each label column
        (``average=None``) and the micro-averaged F1 over all labels.
    """
    # Seed the estimator so the reported scores are reproducible across runs.
    model = ExtraTreeClassifier(random_state=random_state)
    model.fit(X_train, Y_train)

    predictions = model.predict(X_test)

    per_label_f1 = f1_score(Y_test, predictions, average=None)
    micro_f1 = f1_score(Y_test, predictions, average='micro')

    return per_label_f1, micro_f1

In [92]:
# RandomForestClassifier

def function_RandomForestClassifier(X_train, Y_train, X_test, Y_test, random_state=42):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and the matching label columns.
    X_test, Y_test
        Held-out features and label columns used only for scoring.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so repeated runs give the
        same forest and the same scores. Pass ``None`` for an unseeded estimator.

    Returns
    -------
    tuple
        ``(per_label_f1, micro_f1)``: the F1 score of each label column
        (``average=None``) and the micro-averaged F1 over all labels.
    """
    # Seed the estimator so the reported scores are reproducible across runs.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, Y_train)

    predictions = model.predict(X_test)

    per_label_f1 = f1_score(Y_test, predictions, average=None)
    micro_f1 = f1_score(Y_test, predictions, average='micro')

    return per_label_f1, micro_f1

# Modeling

In [93]:
# Train and score the decision tree; displays (per-label F1 array, micro-averaged F1).
function_DecisionTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.22440692,  0.27793035,  0.1789813 ,  0.17692546,  0.16399623,
         0.1410694 ,  0.15760187,  0.16473616,  0.13214905]),
 0.18867610324729392)

In [94]:
# Train and score the extra tree; displays (per-label F1 array, micro-averaged F1).
function_ExtraTreeClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.2238902 ,  0.26960352,  0.17035196,  0.16111372,  0.15727392,
         0.14005323,  0.13292061,  0.16644533,  0.12967914]),
 0.18175968572202653)

In [95]:
# Train and score the random forest; displays (per-label F1 array, micro-averaged F1).
function_RandomForestClassifier(X_train, Y_train, X_test, Y_test)

(array([ 0.10781303,  0.19610369,  0.05658747,  0.06330472,  0.0519263 ,
         0.04043364,  0.04434447,  0.05642633,  0.03782506]),
 0.085606826091837881)