In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#--------------------------------------------------#

# Load the raw crime records from CSV.
df = pd.read_csv('/content/drive/MyDrive/BDA/MCI_2014_to_2017.csv', sep=',')

# Columns kept for modelling: the target ('MCI') plus the nine predictors.
col_list = ['occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofyear', 'occurrencedayofweek', 'occurrencehour', 'MCI', 'Division', 'Hood_ID', 'premisetype']

# Take an explicit copy of the filtered rows. The columns of df2 are
# overwritten below, and assigning into a selection of `df` without copying
# triggers pandas' SettingWithCopyWarning.
df2 = df.loc[df['occurrenceyear'] > 2013, col_list].copy()  # Drop "stale" crimes before 2014

# Integer-encode the target. crime_var[0] holds the codes and crime_var[1] the
# matching labels, which are kept so the reports can print class names.
crime_var = pd.factorize(df2['MCI'])
df2['MCI'] = crime_var[0]
definition_list_MCI = crime_var[1]

independent_vars = ['premisetype', 'occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofweek', 'occurrencehour', 'occurrencedayofyear', 'Division', 'Hood_ID']

# Replace every predictor with integer codes.
# NOTE(review): pd.factorize numbers values in order of first appearance, so
# the codes of numeric columns (year, hour, ...) do not follow their natural
# order - confirm this is intended for the numerically encoded model.
for var in independent_vars:
    encoded_var = pd.factorize(df2[var])
    df2[var] = encoded_var[0]

# Feature matrix (column order follows col_list without 'MCI') and target.
X = df2.drop(['MCI'], axis=1).values
y = df2['MCI'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21)

# Dense one-hot encoding of the integer codes. The `sparse` keyword was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try
# the current spelling first and fall back for older installations.
try:
    binary_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    binary_encoder = OneHotEncoder(sparse=False)

# The encoder is fitted on the full matrix, so every category present in the
# data gets a column in both splits.
encoded_X = binary_encoder.fit_transform(X)

# Same test_size and random_state as above, so the one-hot split selects the
# same rows as the numeric split.
X_train_OH, X_test_OH, y_train_OH, y_test_OH = train_test_split(encoded_X, y, test_size=0.25, random_state=21)

#--------------------------------------------------#

#3) MODELING AND TESTING:

def _fit_and_report(model, title, train_X, train_y, test_X, test_y):
    """Fit ``model`` on the training split and print its test-set metrics.

    Prints the title, the accuracy, the confusion matrix and the per-class
    classification report (labelled with ``definition_list_MCI``).

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``; it is
            fitted in place.
        title: Heading printed before the metrics.
        train_X: Training feature matrix.
        train_y: Training labels.
        test_X: Held-out feature matrix.
        test_y: Held-out labels.

    Returns:
        The predictions produced for ``test_X``.
    """
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    print(title)
    print("Accuracy:", accuracy_score(test_y, predictions))
    print("Confusion Matrix:\n", confusion_matrix(test_y, predictions))
    # zero_division=0 reports 0.0 for a class that is never predicted, the same
    # value the default produces, without emitting an UndefinedMetricWarning
    # for every affected metric.
    print("Classification Report:\n", classification_report(test_y, predictions, target_names=definition_list_MCI, zero_division=0))
    return predictions


# Random forest on the integer-coded features.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
y_pred = _fit_and_report(classifier, "Numeric Encoded Model w/ SKLEARN:", X_train, y_train, X_test, y_test)

# Same forest configuration on the one-hot encoded features.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
y_pred_OH = _fit_and_report(classifier, "One Hot Encoded Model w/ SKLEARN:", X_train_OH, y_train_OH, X_test_OH, y_test_OH)

# Integer-coded features again, this time with balanced class weights.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42, class_weight='balanced')
y_pred = _fit_and_report(classifier, "Balanced Class Weight Model w/ SKLEARN:", X_train, y_train, X_test, y_test)

# Gradient boosting on the one-hot encoded features.
grad_class = GradientBoostingClassifier(learning_rate=0.1, n_estimators=10, random_state=42)
y_pred_OH = _fit_and_report(grad_class, "Gradient Boosting Classifier:", X_train_OH, y_train_OH, X_test_OH, y_test_OH)




Numeric Encoded Model w/ SKLEARN:
Accuracy: 0.6337963247495544
Confusion Matrix:
 [[15289  1451   484    34   448]
 [ 3472  3151    28    22   126]
 [ 2004   191  1185     7   226]
 [  695   227    28    18    62]
 [ 1769   454   173    16   982]]
Classification Report:
                  precision    recall  f1-score   support

        Assault       0.66      0.86      0.75     17706
Break and Enter       0.58      0.46      0.51      6799
        Robbery       0.62      0.33      0.43      3613
     Theft Over       0.19      0.02      0.03      1030
     Auto Theft       0.53      0.29      0.37      3394

       accuracy                           0.63     32542
      macro avg       0.52      0.39      0.42     32542
   weighted avg       0.61      0.63      0.60     32542

One Hot Encoded Model w/ SKLEARN:
Accuracy: 0.6495298383627313
Confusion Matrix:
 [[15692  1246   379    15   374]
 [ 3344  3286    27    19   123]
 [ 2133   153  1149     2   176]
 [  727   204    27    17    55

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
