In [34]:
# Data, Datasets & Utils
import pandas as pd
from pandas.plotting import scatter_matrix
import pprint
import numpy as np
from time import time
from numpy import log2 as log

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plot 
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz


In [35]:
%matplotlib inline

In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold 
from sklearn.model_selection import cross_val_predict

In [37]:
#loading data file
df_sf = pd.read_csv('Datasets\Police_Department_Incidents.csv')

In [38]:
#preprocessing
#handling unique missing value by dropping it
df_sf = df_sf.dropna()
df_sf.isnull().sum() 

IncidntNum    0
Category      0
Descript      0
DayOfWeek     0
Date          0
Time          0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
Location      0
PdId          0
dtype: int64

In [39]:
#selecting object values to tranform into int 
sel = df_sf.select_dtypes(exclude=['float64']).columns 
sel 

Index(['IncidntNum', 'Category', 'Descript', 'DayOfWeek', 'Date', 'Time',
       'PdDistrict', 'Resolution', 'Address', 'Location'],
      dtype='object')

In [40]:
encode = defaultdict(preprocessing.LabelEncoder)
df_sf[sel] = df_sf[sel].apply(lambda x: encode[x.name].fit_transform(x.astype(str)))
df_sf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150499 entries, 0 to 150499
Data columns (total 13 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   IncidntNum  150499 non-null  int32  
 1   Category    150499 non-null  int32  
 2   Descript    150499 non-null  int32  
 3   DayOfWeek   150499 non-null  int32  
 4   Date        150499 non-null  int32  
 5   Time        150499 non-null  int32  
 6   PdDistrict  150499 non-null  int32  
 7   Resolution  150499 non-null  int32  
 8   Address     150499 non-null  int32  
 9   X           150499 non-null  float64
 10  Y           150499 non-null  float64
 11  Location    150499 non-null  int32  
 12  PdId        150499 non-null  float64
dtypes: float64(3), int32(10)
memory usage: 10.3 MB


In [41]:
#feature selection
df_sf = df_sf.drop(columns='IncidntNum')

In [42]:
df_sf.columns

Index(['Category', 'Descript', 'DayOfWeek', 'Date', 'Time', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y', 'Location', 'PdId'],
      dtype='object')

In [43]:
df_sf.describe()

Unnamed: 0,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
count,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0,150499.0
mean,19.338653,369.139071,2.960292,180.615745,607.564934,4.35872,6.635074,7085.819673,-122.423599,37.768921,11530.24147,16164400000000.0
std,10.218805,181.796489,2.019448,105.981944,386.541916,2.781658,4.006033,4316.813492,0.02621,0.023637,5050.469311,553598300000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-122.513642,37.707922,0.0,1135120000000.0
25%,16.0,269.0,1.0,88.0,299.0,2.0,0.0,3296.0,-122.434036,37.756486,8303.0,16032800000000.0
50%,20.0,359.0,3.0,180.0,554.0,4.0,9.0,7039.0,-122.416903,37.775421,12617.0,16065400000000.0
75%,24.0,467.0,5.0,272.0,856.0,7.0,9.0,9741.0,-122.406605,37.785063,15261.0,16097600000000.0
max,38.0,725.0,6.0,365.0,1438.0,9.0,13.0,16128.0,-122.365565,37.819975,19384.0,99100900000000.0


In [44]:
X = df_sf[['Descript', 'DayOfWeek', 'Date', 'Time', 'PdDistrict', 
           'Resolution', 'Address', 'X', 'Y', 'Location', 'PdId']]
y = df_sf['Category']

In [45]:
y.value_counts()

16    40408
21    19599
20    17866
1     13577
35     8589
36     6419
37     5914
4      5802
32     5782
19     4338
7      4243
25     3299
13     2635
27     1841
34     1812
38     1658
28      940
30      882
24      736
5       658
23      641
12      619
8       465
6       378
0       286
15      257
9       168
17      156
26      140
31       69
3        66
10       60
11       53
18       42
29       40
2        34
14       20
22        4
33        3
Name: Category, dtype: int64

In [46]:
model = DecisionTreeClassifier()
# Instantiating (setting up) RFE
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(), scoring='accuracy')
start_time = time()
rfecv.fit(X, y)
end_time = time()
# Printing out the results
print("RFECV took %.2f seconds to identify %d features" % 
      ((end_time - start_time), rfecv.n_features_))
print(rfecv.ranking_)




RFECV took 26.82 seconds to identify 1 features
[ 1 10  6  7 11  3  8  4  9  5  2]


In [47]:
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df

Unnamed: 0,feature,rank
0,Descript,1
10,PdId,2
5,Resolution,3
7,X,4
9,Location,5
2,Date,6
3,Time,7
6,Address,8
8,Y,9
1,DayOfWeek,10


In [56]:
#now lets try PCA (principal component analysis) for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components = 11) # 2D PCA for the plot
X_reduced = pd.DataFrame(pca.fit_transform(X))

In [57]:
list(X_reduced.columns) 
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X_reduced.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df

Unnamed: 0,feature,rank
0,0,1
10,10,2
5,5,3
7,7,4
9,9,5
2,2,6
3,3,7
6,6,8
8,8,9
1,1,10


In [None]:
#for district

In [58]:
X_2 = df_sf[['Descript', 'DayOfWeek', 'Date', 'Time', 'Category', 
            'Resolution', 'Address', 'X', 'Y', 'Location', 'PdId']]
y_2 = df_sf['PdDistrict']

In [59]:
y_2.value_counts()

7    28445
4    20100
3    19503
1    17666
0    14303
2    11594
8    11325
9     9942
6     8922
5     8699
Name: PdDistrict, dtype: int64

In [60]:
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(), scoring='accuracy')
start_time = time()
rfecv.fit(X_2, y_2)
end_time = time()
# Printing out the results
print("RFECV took %.2f seconds to identify %d features" % 
      ((end_time - start_time), rfecv.n_features_))
print(rfecv.ranking_)

RFECV took 28.76 seconds to identify 3 features
[6 7 5 4 8 9 3 1 1 1 2]


In [61]:
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X_2.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df

Unnamed: 0,feature,rank
7,X,1
8,Y,1
9,Location,1
10,PdId,2
6,Address,3
3,Time,4
2,Date,5
0,Descript,6
1,DayOfWeek,7
4,Category,8


In [82]:
#now lets try PCA (principal component analysis) for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components = 11) # 2D PCA for the plot
X_reduced = pd.DataFrame(pca.fit_transform(X_2))

In [83]:
list(X_reduced.columns) 
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X_reduced.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df

Unnamed: 0,feature,rank
7,7,1
8,8,1
9,9,1
10,10,2
6,6,3
3,3,4
2,2,5
0,0,6
1,1,7
4,4,8


In [None]:
#for location

In [78]:
X_3 = df_sf[['Descript', 'DayOfWeek', 'Date', 'Time', 'Category', 
            'Resolution', 'Address', 'X', 'Y', 'PdDistrict', 'PdId']]
y_3 = df_sf['Location']

In [79]:
y_3.value_counts()

12617    3536
8303      625
15041     624
15261     513
10437     494
         ... 
9644        1
10706       1
4250        1
16735       1
7508        1
Name: Location, Length: 19385, dtype: int64

In [None]:
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(), scoring='accuracy')
start_time = time()
rfecv.fit(X_2, y_2)
end_time = time()
# Printing out the results
print("RFECV took %.2f seconds to identify %d features" % 
      ((end_time - start_time), rfecv.n_features_))
print(rfecv.ranking_)

In [None]:
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X_2.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df

In [None]:
#now lets try PCA (principal component analysis) for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components = 11) # 2D PCA for the plot
X_reduced = pd.DataFrame(pca.fit_transform(X))

In [None]:
list(X_reduced.columns) 
feature_ranking_df = np.array([])
feature_ranking_df = pd.DataFrame()
feature_ranking_df['feature'] = X_reduced.columns
feature_ranking_df['rank'] = rfecv.ranking_
feature_ranking_df = feature_ranking_df.sort_values(by='rank', ascending=True)
feature_ranking_df