# Package Import

In [29]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter 
from imblearn.under_sampling import (ClusterCentroids, RandomUnderSampler,
                                     NearMiss,
                                     InstanceHardnessThreshold,
                                     CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours,
                                     AllKNN,
                                     NeighbourhoodCleaningRule,
                                     OneSidedSelection)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

# Data Cleaning

In [None]:
# read raw data
csv = pd.read_csv('NYPD_Arrests_Data__Historic_.csv')
csv.head(3)

In [None]:
# Drop na drop duplicates
data = csv[['ARREST_DATE','PD_DESC','OFNS_DESC','LAW_CAT_CD', 'ARREST_BORO','AGE_GROUP','PERP_SEX','PERP_RACE']].drop_duplicates().dropna()
data = data.reset_index(drop=True)


In [None]:
data['LAW_CAT_CD'].unique()
#Level of offense: felony, misdemeanor, violation
# Because data description did not explain I, we only analyze F, M, V in our project

# drop I
# Get names of indexes for which column CAT_CD is I
indexNames = data[ data['LAW_CAT_CD'] == 'I' ].index
 
# Delete these row indexes from dataFrame
data.drop(indexNames , inplace=True)

In [None]:
data = data.reset_index(drop=True)

In [None]:
data['ARREST_BORO'].unique()
#Borough of arrest. B(Bronx), S(Staten Island), K(Brooklyn), M(Manhattan), Q(Queens)

In [None]:
data['OFNS_DESC'].unique()

In [None]:
# extract year and month
data['YEAR'] = pd.DatetimeIndex(data['ARREST_DATE']).year
data['MONTH'] = pd.DatetimeIndex(data['ARREST_DATE']).month

In [None]:
# keep only the data from 2008.1 to 2011.7
data3 = data.loc[(data['YEAR'] == 2008)|(data['YEAR'] == 2009)|(data['YEAR'] == 2010)]
data4 = data.loc[(data['YEAR'] == 2011)]
data5 = data4.loc[(data4['MONTH'].isin([1,2,3,4,5,6,7]))]
data2= pd.concat([data3,data5])
data2=data2.reset_index(drop=True)
# save the cleaned data
data2.to_csv('NYCCrime_TimeModified.csv')

# Modeling

In [2]:
data2 = pd.read_csv('NYCCrime_TimeModified.csv')

In [3]:
data2

Unnamed: 0.1,Unnamed: 0,ARREST_DATE,PD_DESC,OFNS_DESC,LAW_CAT_CD,ARREST_BORO,AGE_GROUP,PERP_SEX,PERP_RACE,YEAR,MONTH
0,0,12/31/2010,"IMPERSONATION 2, PUBLIC SERVANT",FRAUDS,M,S,18-24,M,BLACK,2010,12
1,1,12/31/2010,"MARIJUANA, POSSESSION 4 & 5",DANGEROUS DRUGS,M,S,25-44,F,ASIAN / PACIFIC ISLANDER,2010,12
2,2,12/31/2010,"FORGERY,ETC.,UNCLASSIFIED-FELONY",FORGERY,F,Q,18-24,F,BLACK,2010,12
3,3,12/31/2010,"CONTROLLED SUBSTANCE, POSSESSION 7",DANGEROUS DRUGS,M,Q,45-64,M,BLACK,2010,12
4,4,12/31/2010,"LOITERING,GAMBLING,OTHER","LOITERING/GAMBLING (CARDS, DICE, ETC)",V,Q,25-44,M,WHITE HISPANIC,2010,12
...,...,...,...,...,...,...,...,...,...,...,...
958155,958155,01/01/2011,"STOLEN PROPERTY 3,POSSESSION",POSSESSION OF STOLEN PROPERTY 5,M,M,18-24,F,BLACK,2011,1
958156,958156,01/01/2011,"CONTEMPT,CRIMINAL",OFFENSES AGAINST PUBLIC ADMINISTRATION,M,B,25-44,M,WHITE HISPANIC,2011,1
958157,958157,01/01/2011,"CONTROLLED SUBSTANCE, POSSESSION 7",DANGEROUS DRUGS,M,K,18-24,M,WHITE,2011,1
958158,958158,01/01/2011,"ASSAULT 2,1,PEACE OFFICER",FELONY ASSAULT,F,B,25-44,M,WHITE HISPANIC,2011,1


In [6]:
Counter(data2['LAW_CAT_CD'])

Counter({'M': 630596, 'F': 272879, 'V': 54685})

In [7]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958160 entries, 0 to 958159
Data columns (total 11 columns):
Unnamed: 0     958160 non-null int64
ARREST_DATE    958160 non-null object
PD_DESC        958160 non-null object
OFNS_DESC      958160 non-null object
LAW_CAT_CD     958160 non-null object
ARREST_BORO    958160 non-null object
AGE_GROUP      958160 non-null object
PERP_SEX       958160 non-null object
PERP_RACE      958160 non-null object
YEAR           958160 non-null int64
MONTH          958160 non-null int64
dtypes: int64(3), object(8)
memory usage: 80.4+ MB


In [8]:
def labeling(org_label):
    if org_label == 'V':
        return 0
    elif org_label == 'M':
        return 1
    else:
        return 2
    
##Level of offense: felony (2), misdemeanor (1), violation (0)

In [9]:
data2['LAW_CAT_LABEL'] = data2['LAW_CAT_CD'].apply(labeling)

In [10]:
# Get feature feature space by dropping useless feature
to_drop = ['ARREST_DATE','PD_DESC','OFNS_DESC','LAW_CAT_CD','YEAR','MONTH','LAW_CAT_LABEL','Unnamed: 0']
X = data2.drop(to_drop, axis=1)
y = data2['LAW_CAT_LABEL']

In [12]:
# factors
X['ARREST_BORO'] = preprocessing.LabelEncoder().fit_transform(X['ARREST_BORO'])
X['AGE_GROUP'] = preprocessing.LabelEncoder().fit_transform(X['AGE_GROUP'])
X['PERP_SEX'] = preprocessing.LabelEncoder().fit_transform(X['PERP_SEX'])
X['PERP_RACE'] = preprocessing.LabelEncoder().fit_transform(X['PERP_RACE'])

In [13]:
y

0         1
1         1
2         2
3         1
4         0
         ..
958155    1
958156    1
958157    1
958158    2
958159    1
Name: LAW_CAT_LABEL, Length: 958160, dtype: int64

In [14]:
X.duplicated()
# X has duplication

0         False
1         False
2         False
3         False
4         False
          ...  
958155     True
958156     True
958157     True
958158     True
958159     True
Length: 958160, dtype: bool

In [15]:
# Splite data into training and testing
# Reserve 25% for testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)


In [16]:
Counter(y_train)

Counter({1: 473118, 2: 204531, 0: 40971})

In [17]:
# Random Forest
classifier_RF = RandomForestClassifier()

In [18]:
# fit RF model
classifier_RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
# Prediction of test data
classifier_RF.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [20]:
# Accuracy of test data
classifier_RF.score(X_test, y_test)

0.6573975118978042

In [21]:
# get precision, recall, fscore
y_pred = classifier_RF.predict(X_test)
precision_recall_fscore_support(y_test, y_pred,labels=[0,1,2])

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.        , 0.65741123, 0.        ]),
 array([0.        , 0.99996825, 0.        ]),
 array([0.        , 0.79328888, 0.        ]),
 array([ 13714, 157478,  68348], dtype=int64))

# Undersampling

In [23]:
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))

[(0, 40971), (1, 40971), (2, 40971)]


In [24]:
# fit model after undersamling 
classifier_RF.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
# Prediction of test data
classifier_RF.predict(X_test)

array([2, 2, 0, ..., 1, 1, 1], dtype=int64)

In [26]:
# Accuracy of test data
classifier_RF.score(X_test, y_test)

0.34529514903565167

In [27]:
y_pred = classifier_RF.predict(X_test)
precision_recall_fscore_support(y_test, y_pred,labels=[0,1,2])

(array([0.08561496, 0.71857806, 0.3235592 ]),
 array([0.63570074, 0.34002845, 0.29916018]),
 array([0.15090616, 0.46162003, 0.3108817 ]),
 array([ 13714, 157478,  68348], dtype=int64))

# Gradient Boosting Decision Tree

In [30]:
# set learning rate list
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=3, random_state=0)
    gb_clf.fit(X_resampled, y_resampled)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_resampled, y_resampled)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.420
Accuracy score (validation): 0.316
Learning rate:  0.075
Accuracy score (training): 0.420
Accuracy score (validation): 0.306
Learning rate:  0.1
Accuracy score (training): 0.420
Accuracy score (validation): 0.308
Learning rate:  0.25
Accuracy score (training): 0.427
Accuracy score (validation): 0.355
Learning rate:  0.5
Accuracy score (training): 0.427
Accuracy score (validation): 0.350
Learning rate:  0.75
Accuracy score (training): 0.427
Accuracy score (validation): 0.344
Learning rate:  1
Accuracy score (training): 0.428
Accuracy score (validation): 0.343


In [31]:
# select the learning rate with the highest accuracy to fit the GBDT model, lr is 0.25
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.25, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_resampled, y_resampled)
predictions = gb_clf2.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[ 9737  2038  1939]
 [74428 55740 27310]
 [34359 19762 14227]]
Classification Report
              precision    recall  f1-score   support

           0       0.08      0.71      0.15     13714
           1       0.72      0.35      0.47    157478
           2       0.33      0.21      0.25     68348

    accuracy                           0.33    239540
   macro avg       0.38      0.42      0.29    239540
weighted avg       0.57      0.33      0.39    239540

