# read csv

In [1]:
import pandas as pd

import numpy as np

df = pd.read_csv('train.csv')

# clean data

In [2]:
df = pd.concat([df, pd.get_dummies(df['DayOfWeek'], prefix='DayOfWeek')], axis=1)

df = pd.concat([df, pd.get_dummies(df['PdDistrict'], prefix='PdDistrict')], axis=1)

In [3]:
df = pd.concat([df, pd.get_dummies(df.Dates.map(lambda x: pd.to_datetime(x).hour), prefix="hour")], axis=1)

df = pd.concat([df, pd.get_dummies(df.Dates.map(lambda x: pd.to_datetime(x).month), prefix="month")], axis=1)

df = pd.concat([df, pd.get_dummies(df.Dates.map(lambda x: pd.to_datetime(x).year), prefix="year")], axis=1)

In [4]:
columns = df.columns.values[7:]

x = df[columns]
y = df['Category']

#  grid search

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV

parameter_grid = {
    'max_depth': [1,2,3,4,5,6,7,8,9]
}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameter_grid, cv=3, verbose=3)

In [6]:
grid_search.fit(x, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=1 .....................................................
[CV] ............................ max_depth=1, score=0.199181 -   3.2s
[CV] max_depth=1 .....................................................
[CV] ............................ max_depth=1, score=0.157861 -   3.2s
[CV] max_depth=1 .....................................................
[CV] ............................ max_depth=1, score=0.162245 -   3.2s
[CV] max_depth=2 .....................................................
[CV] ............................ max_depth=2, score=0.202847 -   3.6s
[CV] max_depth=2 .....................................................
[CV] ............................ max_depth=2, score=0.116044 -   3.6s
[CV] max_depth=2 .....................................................
[CV] ............................ max_depth=2, score=0.166082 -   3.6s
[CV] max_depth=3 .....................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.4min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [7]:
sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)

[mean: 0.14586, std: 0.04248, params: {'max_depth': 7},
 mean: 0.14748, std: 0.04357, params: {'max_depth': 8},
 mean: 0.14894, std: 0.04365, params: {'max_depth': 6},
 mean: 0.14924, std: 0.04531, params: {'max_depth': 9},
 mean: 0.15109, std: 0.04289, params: {'max_depth': 5},
 mean: 0.15720, std: 0.03591, params: {'max_depth': 4},
 mean: 0.16166, std: 0.03557, params: {'max_depth': 2},
 mean: 0.17194, std: 0.03116, params: {'max_depth': 3},
 mean: 0.17310, std: 0.01853, params: {'max_depth': 1}]

# decision tree model fit

In [8]:
x_data = x.values
y_data = y.values

In [9]:
model = DecisionTreeClassifier(max_depth=7)
model.fit(x_data, y_data)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

# make result

In [10]:
test_df = pd.read_csv('test.csv')

In [11]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['DayOfWeek'], prefix='DayOfWeek')], axis=1)
test_df = pd.concat([test_df, pd.get_dummies(test_df['PdDistrict'], prefix='PdDistrict')], axis=1)
test_df = pd.concat([test_df, pd.get_dummies(test_df.Dates.map(lambda x: pd.to_datetime(x).hour), prefix="hour")], axis=1)
test_df = pd.concat([test_df, pd.get_dummies(test_df.Dates.map(lambda x: pd.to_datetime(x).month), prefix="month")], axis=1)
test_df = pd.concat([test_df, pd.get_dummies(test_df.Dates.map(lambda x: pd.to_datetime(x).year), prefix="year")], axis=1)

In [12]:
test_values = test_df[test_df.columns.values[5:]].values

In [13]:
output = model.predict_proba(test_values)

In [14]:
b = model.classes_
columns = np.append(['Id'], b)

test_data = test_df.values

result = np.c_[test_data[:,0].astype(int), output.astype(float)]


df_result = pd.DataFrame(result[:,:],columns=columns)

df_result['Id'] = df_result['Id'].astype(int)

df_result.to_csv('first.csv', index=False)