In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [2]:
pd.set_option('display.max_columns', 100)

#### Created .csv file with added features of Date, CrimeRate, CrimeCategory (low, lowmed, med, high)
#### Note, time for date is simply center of the 'time block'

# Need to unzip the CrimeData.csv.zip file

In [3]:
data = pd.read_csv('data/CrimeData.csv')

#### Using pd.get_dummies to perform the onehotencoding

In [4]:
data2 = pd.get_dummies(data, prefix=['wkday'], columns=['Weekday'])
data2 = pd.get_dummies(data2, prefix=['tod'], columns=['tod'])

In [5]:
data2.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,offensegroup,apparent_temp,cloud_cover,dew_point,humidity,percip_intensity,percip_probability,pressure,temperature,uv_index,visibility,wind_bearing,wind_gust,wind_speed,TotalPop,PerCapitaIncome,MedianHouseholdInc,MedianAge,HousingUnits,Date,CrimeRate,CrimeCategory,BlockGroup,wkday_0.0,wkday_1.0,wkday_2.0,wkday_3.0,wkday_4.0,wkday_5.0,wkday_6.0,tod_Afternoon,tod_Early Morning,tod_Evening,tod_Midnight,tod_Morning,tod_Night
0,0,2013,1,10,1,44.43,0.34,27.77,0.52,0.0,0.0,1031.65,44.43,0.0,5.77,330.0,2.49,0.74,1240.0,109147.0,104083.0,37.0,743.0,2013-01-10 04:30:00,80.645161,Med,1001,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,2013,1,30,1,57.28,0.98,55.38,0.93,0.1323,0.99,996.1,57.28,0.0,3.57,110.0,4.92,1.33,1240.0,109147.0,104083.0,37.0,743.0,2013-01-30 22:30:00,80.645161,Med,1001,0,0,1,0,0,0,0,0,0,0,0,0,1
2,2,2013,2,4,1,32.68,0.91,18.64,0.56,0.0,0.0,1015.17,32.68,0.0,6.67,161.0,1.22,0.36,1240.0,109147.0,104083.0,37.0,743.0,2013-02-04 17:30:00,80.645161,Med,1001,1,0,0,0,0,0,0,0,0,1,0,0,0
3,3,2013,2,14,1,43.85,0.0,28.66,0.55,0.0053,0.22,1016.32,43.85,4.0,6.54,305.0,6.58,2.49,1240.0,109147.0,104083.0,37.0,743.0,2013-02-14 15:00:00,80.645161,Med,1001,0,0,0,1,0,0,0,1,0,0,0,0,0
4,4,2013,3,8,1,44.84,0.72,27.55,0.44,0.0,0.0,1020.88,48.26,1.0,6.4,339.0,16.9,7.55,1240.0,109147.0,104083.0,37.0,743.0,2013-03-08 15:00:00,80.645161,Med,1001,0,0,0,0,1,0,0,1,0,0,0,0,0


#### Labelencoding the crime categories for use in sklearn functions
#### Note, y is used as the target.  For the purposes of this notebook, only looking at the categorical target

In [6]:
le = preprocessing.LabelEncoder()
le.fit(data['CrimeCategory'])
y = le.transform(data2['CrimeCategory']) 
y

array([3, 3, 3, ..., 2, 2, 2])

#### Features to be used in the model

In [7]:
X = data2[[#'offensegroup',
           'BlockGroup',
           #'year',
           #'month',
           #'day',
           #'apparent_temp',
           #'cloud_cover',
           #'dew_point',
           #'humidity',
           #'CrimeCategory',
           'percip_intensity',
           #'percip_probability',
           #'pressure',
           'temperature',
           'uv_index',
           #'visibility',
           #'wind_bearing',
           #'wind_gust',
           'wind_speed',
           #'TotalPop',
           #'PerCapitaIncome',
           #'MedianHouseholdInc',
           #'MedianAge',
           #'HousingUnits',         
           'tod_Evening',
           'tod_Midnight',
           'tod_Morning',
           'tod_Night',
           'tod_Afternoon',
           'tod_Early Morning',
           'wkday_0.0',
           'wkday_1.0',
           'wkday_2.0',
           'wkday_3.0',
           'wkday_4.0',
           'wkday_5.0',
           'wkday_6.0']]

#### Creating the split for training and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#### Running Random Forest Classifier
#### Outputting accuracy score, as well as confusion matrix
## Now running without census data - producing a much lower accuracy score

In [9]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print('Random Forest model')
print('Accuracy score = %0.3f' % accuracy_score(expected, predicted))
confusion_matrix(expected, predicted)



Random Forest model
Accuracy score = 0.427


array([[ 3384,   776,  3332,  1863],
       [ 1049,  3482,  2993,  1056],
       [ 2587,  2255, 10017,  2765],
       [ 2371,  1338,  4633,  3211]])

#### Save the model to disk for later use

In [10]:
filename = 'RandomForestClassifier.sav'
joblib.dump(model, filename)

['RandomForestClassifier.sav']

#### Load the model from disk

In [11]:
filename = 'RandomForestClassifier.sav'
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, y_test)
print(result)

0.4265155374426898


#### Run the model on a single instance
#### Note, this instance is from the 'X' subset above
#### Result should produce a 0, 1, 2, or 3 - aligning to Crime Rate Categories of Low, Low/Med, Med, or High

In [13]:
result = model.predict(X.iloc[[500]])
result[0]

3