In [1]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz


In [2]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.model_selection import cross_val_predict
from collections import Counter
import dask.array as da
import dask.dataframe

In [4]:
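# Now let's do the same, but with Dask.
# NOTE: the dask_ml imports of train_test_split, StandardScaler and preprocessing
# below rebind the names imported earlier from scikit-learn.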
from dask.distributed import LocalCluster
from dask.distributed import Client
import joblib
import dask.dataframe
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import StandardScaler
from dask_ml import preprocessing
from sklearn.metrics import plot_confusion_matrix

In [5]:
# Load the raw police-incident records.
# A forward slash is used as the separator: the previous 'Datasets\P...' relied
# on '\P' being an invalid (and therefore untouched) escape sequence, which
# triggers a warning on current Python and only resolves as a path on Windows.
df_sf = pd.read_csv('Datasets/Police_Department_Incidents.csv')

In [6]:
# Discard every row containing at least one missing value, then show the
# per-column null count to confirm nothing is left.
df_sf = df_sf.dropna(axis=0, how='any')
df_sf.isna().sum()

IncidntNum    0
Category      0
Descript      0
DayOfWeek     0
Date          0
Time          0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
Location      0
PdId          0
dtype: int64

In [7]:
# Work on a copy of the cleaned frame without the 'IncidntNum' column and
# print a summary of the remaining columns and dtypes.
df_coor = df_sf.drop(labels='IncidntNum', axis='columns')
df_coor.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150499 entries, 0 to 150499
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Category    150499 non-null  object 
 1   Descript    150499 non-null  object 
 2   DayOfWeek   150499 non-null  object 
 3   Date        150499 non-null  object 
 4   Time        150499 non-null  object 
 5   PdDistrict  150499 non-null  object 
 6   Resolution  150499 non-null  object 
 7   Address     150499 non-null  object 
 8   X           150499 non-null  float64
 9   Y           150499 non-null  float64
 10  Location    150499 non-null  object 
 11  PdId        150499 non-null  float64
dtypes: float64(3), object(9)
memory usage: 14.9+ MB


In [8]:
# Names of all columns that are not float64; these are the ones that get
# label-encoded in the next step.
sel = df_coor.select_dtypes(exclude='float64').columns
sel

Index(['Category', 'Descript', 'DayOfWeek', 'Date', 'Time', 'PdDistrict',
       'Resolution', 'Address', 'Location'],
      dtype='object')

In [9]:
# Label-encode every selected column in place.
# `encode` keeps one LabelEncoder per column name (created lazily by the
# defaultdict), so the fitted encoders remain available after this cell.
encode = defaultdict(preprocessing.LabelEncoder)
for column_name in sel:
    # Values are cast to str first so each column is encoded from a uniform type.
    column_as_text = df_coor[column_name].astype(str)
    df_coor[column_name] = encode[column_name].fit_transform(column_as_text)

In [10]:
# Separate the class from the features: X holds the predictor columns,
# y holds the target column 'PdDistrict'.
X = df_coor.loc[:, ['Date', 'Time', 'Address', 'X', 'Y', 'Location', 'Category']]
y = df_coor.loc[:, 'PdDistrict']

In [11]:
# Start a local Dask cluster and display its summary widget.
# LocalCluster is already imported together with the other Dask imports near
# the top of the notebook, so it is not re-imported here.
cluster = LocalCluster()
cluster

VBox(children=(HTML(value='<h2>LocalCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [12]:
# Connect a client to the local cluster and display the connection summary.
# Client is already imported together with the other Dask imports near the top
# of the notebook, so it is not re-imported here.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:50180  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 15.88 GiB


In [13]:
%%time
#K-Nearest Neighbor Classifier scaled up?
with joblib.parallel_backend('dask'):
    predicting_district = KNeighborsClassifier(n_neighbors=5)
    predicting_district.fit(X, y)

    scores = cross_validate(predicting_district, X, y, cv=6, return_train_score=True, return_estimator=True)
    y_pred = cross_val_predict(predicting_district, X, y, cv=3)

    #Printing results
    pprint.pprint(scores)
    print()
    print('\n')
    print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
    print('\n')
    print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
    print('\n')
    kung_fu_sion = confusion_matrix(y, y_pred)
    print(kung_fu_sion)
    report = classification_report(y, y_pred)
    print('\n')
    print(report)
    
    

{'estimator': [KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier()],
 'fit_time': array([1.02012801, 0.7781589 , 0.92677641, 0.83481002, 0.82204223,
       0.97402716]),
 'score_time': array([ 6.80327129,  8.05861831,  8.86705136, 11.7331419 ,  9.78627133,
       15.48223591]),
 'test_score': array([0.82518737, 0.82522027, 0.80257545, 0.79994419, 0.80082127,
       0.8017781 ]),
 'train_score': array([0.89204641, 0.89211903, 0.89464662, 0.89496555, 0.89660809,
       0.89707055])}



Accuracy (Training): 0.89 (+/- 0.00)


Accuracy (Testing):  0.81 (+/- 0.02)


[[10496     0  1275  1229    20   239    59    98   887     0]
 [    0 15960     0     0   942     0    25   717     0    22]
 [ 1684     0  8715    45     0    28     0     6  1116     0]
 [  849     0    51 17517    57   625    27   115   262     0]
 [   14  1238  

In [14]:
%%time
#K-Nearest Neighbor Classifier scaled up?
with joblib.parallel_backend('dask'):
    predicting_district = KNeighborsClassifier(n_neighbors=10)
    predicting_district.fit(X, y)

    scores = cross_validate(predicting_district, X, y, cv=6, return_train_score=True, return_estimator=True)
    y_pred = cross_val_predict(predicting_district, X, y, cv=3)

    #Printing results
    pprint.pprint(scores)
    print()
    print('\n')
    print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
    print('\n')
    print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
    print('\n')
    kung_fu_sion = confusion_matrix(y, y_pred)
    print(kung_fu_sion)
    report = classification_report(y, y_pred)
    print('\n')
    print(report)
    
    

{'estimator': [KNeighborsClassifier(n_neighbors=10),
               KNeighborsClassifier(n_neighbors=10),
               KNeighborsClassifier(n_neighbors=10),
               KNeighborsClassifier(n_neighbors=10),
               KNeighborsClassifier(n_neighbors=10),
               KNeighborsClassifier(n_neighbors=10)],
 'fit_time': array([2.53101254, 1.02164674, 1.38714767, 1.59076238, 1.01818132,
       1.02609062]),
 'score_time': array([12.7243185 , 10.17077279, 10.36648846, 13.43763018, 13.33828211,
       13.59990597]),
 'test_score': array([0.81326742, 0.80943268, 0.79057529, 0.78722641, 0.78595064,
       0.78048878]),
 'train_score': array([0.84611889, 0.84529885, 0.84987561, 0.8487992 , 0.85057728,
       0.85069688])}



Accuracy (Training): 0.85 (+/- 0.00)


Accuracy (Testing):  0.79 (+/- 0.02)


[[10289     0  1235  1539    15   250    68   111   796     0]
 [    0 16088     0     0   817     0    17   721     0    23]
 [ 1820     0  8703    40     0    23     0     5  1003  

In [15]:
%%time
#K-Nearest Neighbor Classifier scaled up
#with repeated stratfied cross validation
with joblib.parallel_backend('dask'):
    predicting_district_cv = KNeighborsClassifier(n_neighbors=5)
    predicting_district_cv.fit(X, y)
    rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
    scores = cross_validate(predicting_district_cv, X, y, cv=rkf, return_train_score=True, return_estimator=True)
    y_pred = cross_val_predict(predicting_district_cv, X, y, cv=3)

    #Printing results
    pprint.pprint(scores)
    print()
    print('\n')
    print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
    print('\n')
    print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
    print('\n')
    kung_fu_sion = confusion_matrix(y, y_pred)
    print(kung_fu_sion)
    report = classification_report(y, y_pred)
    print('\n')
    print(report)
    
    

{'estimator': [KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier(),
               KNeighborsClassifier()],
 'fit_time': array([0.93506908, 0.69387627, 1.80346608, 0.89725184, 0.92529893,
       0.92114329, 0.74655318, 0.67215276]),
 'score_time': array([10.49958825, 12.30703712, 21.81250763, 14.2313087 , 13.04472756,
       13.85384321, 13.31379747, 13.78076673]),
 'test_score': array([0.83237209, 0.83205316, 0.83099003, 0.83061344, 0.83146844,
       0.82857143, 0.83465781, 0.82968318]),
 'train_score': array([0.88772437, 0.88796357, 0.88777752, 0.8872381 , 0.88751174,
       0.88832681, 0.8880876 , 0.8877608 ])}



Accuracy (Training): 0.89 (+/- 0.00)


Accuracy (Testing):  0.83 (+/- 0.00)


[[10496     0  1275  1229    20   239    59    98   887     0]
 [    0 15960     0     0   

In [None]:
%%time
with joblib.parallel_backend('dask'):
    fig, ax = plt.subplots(figsize=(10, 10))
    plot_confusion_matrix(predicting_district, X, y, ax=ax)
    plt.show()

In [None]:
%%time
with joblib.parallel_backend('dask'):
    dt_pred_distr = DecisionTreeClassifier()
    dt_pred_distr.fit(X,y)
    #rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
    scores = cross_validate(dt_pred_distr, X, y, cv=4, return_train_score=True, return_estimator=True)
    dt_y_pred = cross_val_predict(dt_pred_distr, X, y, cv=3)

    pprint.pprint(scores)
    print()
    print('\n')
    print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
    print('\n')
    print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
    print('\n')
    print(confusion_matrix(y, dt_y_pred))
    print('\n')
    report = classification_report(y, dt_y_pred)
    print(report)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
with joblib.parallel_backend('dask'):
    
    rf_pred_distr = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_pred_distr.fit(X,y)
    rkf = RepeatedStratifiedKFold(n_splits=4,  n_repeats=2)
    scores = cross_validate(rf_pred_distr, X, y, cv=rkf, return_train_score=True, return_estimator=True)
    rf_y_pred = cross_val_predict(rf_pred_distr, X, y, cv=3)

    pprint.pprint(scores)
    print()
    print('\n')
    print("Accuracy (Training): %0.2f (+/- %0.2f)" % (scores['train_score'].mean(), scores['train_score'].std() * 2))
    print('\n')
    print("Accuracy (Testing):  %0.2f (+/- %0.2f)" % (scores['test_score'].mean(), scores['test_score'].std() * 2))
    print('\n')
    print(confusion_matrix(y, rf_y_pred))
    print('\n')
    report = classification_report(y, rf_y_pred)
    print(report)