In [1]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz


In [2]:
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.model_selection import cross_val_predict
from collections import Counter
import dask.array as da
import dask.dataframe

In [4]:
# Load the police incident records into a DataFrame.
# The path uses a forward slash: the previous 'Datasets\P...' literal relied on
# the invalid escape sequence '\P' (a warning today, an error in future Python
# versions) and only resolved on Windows. Forward slashes work on every platform.
df_sf = pd.read_csv('Datasets/Police_Department_Incidents.csv')

In [5]:
# Preprocessing: discard every row that has at least one missing value,
# then display how many nulls are left in each column.
df_sf = df_sf.dropna(axis=0, how='any')
df_sf.isna().sum()


IncidntNum    0
Category      0
Descript      0
DayOfWeek     0
Date          0
Time          0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
Location      0
PdId          0
dtype: int64

In [6]:
# Build the working frame from df_sf without the IncidntNum column,
# then print a structural summary of it.
df_coor = df_sf.drop('IncidntNum', axis=1)
df_coor.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150499 entries, 0 to 150499
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Category    150499 non-null  object 
 1   Descript    150499 non-null  object 
 2   DayOfWeek   150499 non-null  object 
 3   Date        150499 non-null  object 
 4   Time        150499 non-null  object 
 5   PdDistrict  150499 non-null  object 
 6   Resolution  150499 non-null  object 
 7   Address     150499 non-null  object 
 8   X           150499 non-null  float64
 9   Y           150499 non-null  float64
 10  Location    150499 non-null  object 
 11  PdId        150499 non-null  float64
dtypes: float64(3), object(9)
memory usage: 14.9+ MB


In [7]:
# Names of every column that is not float64; these are the columns that get
# transformed into integers by the encoding step.
sel = df_coor.select_dtypes(exclude='float64').columns
sel

Index(['Category', 'Descript', 'DayOfWeek', 'Date', 'Time', 'PdDistrict',
       'Resolution', 'Address', 'Location'],
      dtype='object')

In [8]:
# Integer-encode the selected columns, keeping one fitted LabelEncoder per
# column name in `encode`.
encode = defaultdict(preprocessing.LabelEncoder)


def _encode_column(column):
    """Cast one column to str and label-encode it with that column's own encoder."""
    return encode[column.name].fit_transform(column.astype(str))


df_coor[sel] = df_coor[sel].apply(_encode_column)

In [9]:
# Drop the outliers: every row whose Y or X equals the column maximum in df_sf.
# Both conditions are combined into one mask and removed in a single drop, so a
# row that holds the maximum of *both* columns is only requested once (two
# sequential drops raised KeyError in that case). errors='ignore' additionally
# makes re-running this cell a no-op instead of a KeyError.
outlier_mask = (df_sf['Y'] == df_sf['Y'].max()) | (df_sf['X'] == df_sf['X'].max())
df_coor.drop(index=df_sf.index[outlier_mask], errors='ignore', inplace=True)

In [10]:
# Predictor columns, listed explicitly so the feature order is fixed.
feature_cols = [
    'Category', 'Descript', 'DayOfWeek', 'Date', 'Time',
    'Resolution', 'Address', 'X', 'Y', 'Location', 'PdId',
]
X = df_coor.loc[:, feature_cols]

# Target: the police district column.
y = df_coor.loc[:, 'PdDistrict']

In [None]:
# Start a Dask LocalCluster with its default arguments and show it as the
# cell output.
# NOTE(review): this import lives in its own cell rather than with the other
# imports at the top of the notebook — consider moving it there.
from dask.distributed import LocalCluster
cluster = LocalCluster()
cluster

In [None]:
# Connect a Dask Client to the `cluster` object and show it as the cell output.
# NOTE(review): none of the cells visible here hand this client to the
# scikit-learn grid searches — confirm it is actually used, otherwise the
# searches run without Dask.
from dask.distributed import Client
client = Client(cluster)
client

In [11]:
# Utility function to report best scores
def report(results, rank_metric='score', n_top=3):
    """
    Print the best-ranked candidates of a hyper-parameter search for one metric.

    For every rank from 1 to ``n_top`` all candidates holding that rank are
    printed (ties therefore produce several entries for the same rank), each
    with its mean validation score, twice its standard deviation and its
    parameter setting.

    :param results: the cv_results_ data structure from the optimisation
        algorithm; it must provide "rank_test_<metric>", "mean_test_<metric>",
        "std_test_<metric>" and "params". The values may be NumPy arrays or
        plain sequences such as lists.
    :param rank_metric: name of the metric to report results for
    :param n_top: the number of top results to report
    :raises KeyError: if ``results`` holds no ranking for ``rank_metric``
    """
    prefix = "rank_test_"
    rank_key = prefix + rank_metric
    if rank_key not in results:
        # Fail with a message that names the metrics that *are* present; the
        # usual cause is calling report() with the default 'score' on a
        # multi-metric search.
        known = sorted(str(key)[len(prefix):] for key in results
                       if str(key).startswith(prefix))
        raise KeyError("no results for metric {0!r}; available metrics: {1}"
                       .format(rank_metric, known))

    # Coerce to arrays so the element-wise comparison below also works for
    # plain lists (a list compared to an int is a single False, which used to
    # make the report silently print no candidates at all).
    ranks = np.asarray(results[rank_key])
    means = results["mean_test_" + rank_metric]
    stds = results["std_test_" + rank_metric]
    params = results['params']

    print("\nModels ranked according to", rank_metric)
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.2f} (+/- {1:.2f})".format(
                  means[candidate],
                  stds[candidate]*2))
            print("Params: {0}".format(params[candidate]))
            print("")

In [None]:
# NOTE(review): n_iter_search is not used by this cell — presumably meant for a
# randomized search elsewhere; confirm before removing.
n_iter_search = 11
n_folds = 5

# instantiating the model
# A fixed random_state keeps the search reproducible: the tree draws random
# feature subsets whenever max_features is below the number of features, and
# splitter="random" picks random thresholds, so unseeded runs rank the
# candidates differently each time.
model = DecisionTreeClassifier(random_state=42)

# defining parameter grid
param_grid = {"max_depth": [2, 4, 6, 8, None],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 6, 8, 10],
              "criterion": ["gini", "entropy"],
              "splitter": ["best", "random"]}

# run grid search (exhaustive over param_grid, n_folds-fold cross-validation,
# scored with the estimator's default scorer)
print("\n> STARTING GRID SEARCH ...")
grid_search = GridSearchCV(model, param_grid=param_grid, cv=n_folds)

# time only the fit itself
start_time = time()
grid_search.fit(X, y)
end_time = time()

print("> GRID SEARCH COMPLETE")

print("\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (end_time - start_time, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)


> STARTING GRID SEARCH ...


In [None]:
# instantiating the model
# A fixed random_state keeps the search reproducible: the tree draws random
# feature subsets whenever max_features is below the number of features, and
# splitter="random" picks random thresholds, so unseeded runs rank the
# candidates differently each time.
model = DecisionTreeClassifier(random_state=42)

# defining parameter grid
param_grid = {"max_depth": [2, 4, 6, 8, None],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 6, 8, 10],
              "criterion": ["gini", "entropy"],
              "splitter": ["best", "random"]}

# defining multiple metrics for scoring
# zero_division=0 scores a class with no predicted/true samples as 0 instead
# of emitting a warning for it.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'prec_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'rec_macro': make_scorer(recall_score, average='macro', zero_division=0)
}

# run grid search; with several metrics, `refit` names the one used to pick
# (and refit) the best estimator
print("\n> STARTING GRID SEARCH ...")
n_folds = 10
grid_search = GridSearchCV(model, param_grid=param_grid, cv=n_folds, scoring=scoring, refit='prec_macro')
start = time()
grid_search.fit(X, y)
# capture the elapsed time right after the fit so it measures the search only
elapsed = time() - start

print("> GRID SEARCH COMPLETE")
print("\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (elapsed, len(grid_search.cv_results_['params'])))

# Get the best model according to each of the 3 metrics used
report(grid_search.cv_results_, 'accuracy', n_top=1)
report(grid_search.cv_results_, 'prec_macro', n_top=1)
report(grid_search.cv_results_, 'rec_macro', n_top=1)

# best_score_ only exists when the search was refit on a metric; catch just the
# resulting AttributeError so that real errors (and Ctrl-C) are not swallowed.
try:
    print("Best score: %0.2f " % (grid_search.best_score_))
except AttributeError:
    print("Best score not available (refit set to False presumably)")