### 資料前處理

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pyspark.mllib.regression import LabeledPoint
import matplotlib.pyplot as plt

In [2]:
# 1 read data
df_2015 = pd.DataFrame(pd.read_csv('the-counted-2015.csv'))
df_2016 = pd.DataFrame(pd.read_csv('the-counted-2016.csv'))
df = pd.concat([df_2015, df_2016], ignore_index = True)

In [3]:
# 2 處理資料
df = df.drop(['uid', 'name', 'day', 'year', 'streetaddress'], 1)
df = df.drop(df.index[np.where((df['raceethnicity'] == 'Unknown') | (df['age'] == 'Unknown') | (df['armed'] == 'Unknown') | (df['lawenforcementagency'] == 'Unknown'))])

# replace data to int
df = df.replace(['Male', 'Female', 'Non-conforming', '40s'], ['0', '1', '0', '40'])
df['month'] = df['month'].replace(list(Counter(df['month'])), np.arange(len(Counter(df['month']))) + 1)
df['city'] = df['city'].replace(list(Counter(df['city'])), np.arange(len(Counter(df['city']))))
df['state'] = df['state'].replace(list(Counter(df['state'])), np.arange(len(Counter(df['state']))))
df['classification'] = df['classification'].replace(list(Counter(df['classification'])), np.arange(len(Counter(df['classification']))))
df['lawenforcementagency'] = df['lawenforcementagency'].replace(list(Counter(df['lawenforcementagency'])), np.arange(len(Counter(df['lawenforcementagency']))))
df['armed'] = df['armed'].replace(list(Counter(df['armed'])), np.arange(len(Counter(df['armed']))))

# target value
cls = df['raceethnicity']
df = df.drop(['raceethnicity'], 1)
df.insert(loc = 0, column = 'raceethnicity', value = cls)
df['raceethnicity'] = df['raceethnicity'].replace(list(Counter(df['raceethnicity'])), np.arange(len(Counter(df['raceethnicity']))))

# convert to spark dataframe
dfSpark = spark.createDataFrame(df)

In [4]:
df

Unnamed: 0,raceethnicity,age,gender,month,city,state,classification,lawenforcementagency,armed
0,0,22,0,1,0,0,0,0,0
1,1,47,0,1,1,1,1,1,1
2,1,19,0,1,2,2,2,2,0
3,2,23,0,1,3,3,1,3,0
4,3,53,0,1,4,4,1,4,1
5,1,32,0,1,5,5,1,5,2
6,2,22,0,1,6,6,1,6,1
7,2,39,0,1,7,7,1,7,3
8,1,25,0,1,8,5,1,8,4
9,0,26,0,1,9,5,3,9,0


In [5]:
dfSpark

DataFrame[raceethnicity: bigint, age: string, gender: string, month: bigint, city: bigint, state: bigint, classification: bigint, lawenforcementagency: bigint, armed: bigint]

In [6]:
# 3 split data
train, test = dfSpark.randomSplit([0.75, 0.25])
train = train.rdd.map(lambda x: LabeledPoint(x[0], x[1:])) # convert to labeled RDD
test = test.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
#print(type(train), train.count())

### 方法一：Decision Tree

In [7]:
# 4-1 Decision Tree
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import MulticlassMetrics
# train a model
DT_model = DecisionTree.trainClassifier(train, numClasses = 7, categoricalFeaturesInfo = {}, impurity = 'gini', maxDepth = 5, maxBins = 32)
# evaluate model and compute test error
DT_predictions = DT_model.predict(test.map(lambda x: x.features))
DT_labelsAndPredictions = test.map(lambda x: x.label).zip(DT_predictions)
DT_metrics = MulticlassMetrics(DT_labelsAndPredictions)
print('Accuracy = %s' % DT_metrics.accuracy)
print('Decision Tree Classification Model:')
print(DT_model.toDebugString())

Accuracy = 0.5456238361266295
Decision Tree Classification Model:
DecisionTreeModel classifier of depth 5 with 59 nodes
  If (feature 0 <= 41.0)
   If (feature 4 <= 18.0)
    If (feature 4 <= 4.0)
     If (feature 4 <= 0.0)
      If (feature 0 <= 27.0)
       Predict: 0.0
      Else (feature 0 > 27.0)
       Predict: 1.0
     Else (feature 4 > 0.0)
      If (feature 6 <= 190.0)
       Predict: 1.0
      Else (feature 6 > 190.0)
       Predict: 1.0
    Else (feature 4 > 4.0)
     If (feature 4 <= 5.0)
      If (feature 6 <= 190.0)
       Predict: 2.0
      Else (feature 6 > 190.0)
       Predict: 1.0
     Else (feature 4 > 5.0)
      If (feature 4 <= 12.0)
       Predict: 1.0
      Else (feature 4 > 12.0)
       Predict: 1.0
   Else (feature 4 > 18.0)
    If (feature 3 <= 181.0)
     If (feature 3 <= 85.0)
      If (feature 0 <= 26.0)
       Predict: 0.0
      Else (feature 0 > 26.0)
       Predict: 0.0
     Else (feature 3 > 85.0)
      If (feature 4 <= 35.0)
       Predict: 0.0
      

### 方法一：Decision Tree（using pyspark.ml）

#### 資料前處理

In [8]:
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

In [10]:
df.to_csv('out_police.csv')
# Read CSV
spark_df = spark.read.csv('out_police.csv', header=True)
# Drop Unused Column
spark_df = spark_df.drop(spark_df.columns[0])
# Convert columns types
def convertColumn(df, colNames, newType):
    for name in colNames:
        df = df.withColumn(name, df[name].astype(newType))
    return df

spark_df = convertColumn(spark_df, spark_df.columns[1:], IntegerType())
# Vectorize Features
vecAssembler = VectorAssembler(inputCols=['age','gender','month','city','state','classification','lawenforcementagency','armed'], outputCol="features")
spark_df = vecAssembler.transform(spark_df)
# Encode Labels
stringIndexer = StringIndexer(inputCol='raceethnicity', outputCol='label', handleInvalid='error')
model = stringIndexer.fit(spark_df)
td = model.transform(spark_df)
# Split Data
train, test = td.randomSplit(weights=[0.75, 0.25])

#### 訓練Model

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier
# Training
dt = DecisionTreeClassifier(maxDepth=6, featuresCol='features', labelCol='label')
model = dt.fit(train)
print(model.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_49bcb5a56689699c5a58) of depth 6 with 103 nodes
  If (feature 0 <= 41.0)
   If (feature 4 <= 18.0)
    If (feature 3 <= 186.0)
     If (feature 4 <= 7.0)
      If (feature 4 <= 3.0)
       If (feature 7 <= 3.0)
        Predict: 0.0
       Else (feature 7 > 3.0)
        Predict: 0.0
      Else (feature 4 > 3.0)
       If (feature 0 <= 23.0)
        Predict: 2.0
       Else (feature 0 > 23.0)
        Predict: 2.0
     Else (feature 4 > 7.0)
      If (feature 2 <= 2.0)
       If (feature 3 <= 146.0)
        Predict: 0.0
       Else (feature 3 > 146.0)
        Predict: 1.0
      Else (feature 2 > 2.0)
       If (feature 1 <= 0.0)
        Predict: 1.0
       Else (feature 1 > 0.0)
        Predict: 0.0
    Else (feature 3 > 186.0)
     If (feature 4 <= 12.0)
      If (feature 4 <= 7.0)
       If (feature 4 <= 4.0)
        Predict: 0.0
       Else (feature 4 > 4.0)
        Predict: 0.0
      Else (feature 4 > 7.0)
       If (feature 

In [12]:
# Features Importances
model.featureImportances

SparseVector(8, {0: 0.2362, 1: 0.0137, 2: 0.0764, 3: 0.211, 4: 0.3643, 5: 0.0235, 6: 0.0217, 7: 0.0532})

#### 測試Model

In [13]:
# Testing
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
prediction = model.transform(test)
metric = evaluator.evaluate(prediction)
print('Decision Tree Accuracy:', metric * 100, '%')

Decision Tree Accuracy: 52.06286836935167 %


#### 調整超參數

In [14]:
# Tuning
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
grid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 21]).build()
tvs = TrainValidationSplit(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator, trainRatio=0.3)
tvsModel = tvs.fit(train)
print('Tuned Decision Tree Accuracy:', evaluator.evaluate(tvsModel.transform(test)) * 100, '%')

Tuned Decision Tree Accuracy: 52.455795677799614 %


### 方法二：Random Forest

In [15]:
# 4-2 Random Forest
from pyspark.mllib.tree import RandomForest, RandomForestModel
# train a model
RF_model = RandomForest.trainClassifier(train, numClasses = 7, categoricalFeaturesInfo = {},\
                                     numTrees = 10, featureSubsetStrategy = 'auto', impurity = 'gini', maxDepth = 4, maxBins = 32)
# evaluate model and compute test error
RF_predictions = RF_model.predict(test.map(lambda x: x.features))
RF_labelsAndPredictions = test.map(lambda x: x.label).zip(RF_predictions)
RF_metrics = MulticlassMetrics(RF_labelsAndPredictions)
print('Accuracy = %s' % RF_metrics.accuracy)
print('Random Forest Classification Model:')
print(RF_model.toDebugString())

AssertionError: the data should be RDD of LabeledPoint

### 方法三：Naive Bayes

In [16]:
# 4-3 Naive Bayes
# train a model
NB_model = NaiveBayes.train(train, 1.0)
# make prediction and accuracy
NB_predictionAndLabels = test.map(lambda p: (NB_model.predict(p.features), p.label))
NB_accuracy = 1.0 * NB_predictionAndLabels.filter(lambda (x, v): x == v).count() / test.count()
print('Accuracy = {}'.format(NB_accuracy))

SyntaxError: invalid syntax (<ipython-input-16-cfd4944f918b>, line 6)

### 方法三：Gradient Boosted Trees

In [17]:
# Gradient-Boosted Trees
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
# train a model
GBT_model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo = {}, numIterations = 100)
# evaluate model and compute test error
GBT_predictions = GBT_model.predict(test.map(lambda x: x.features))
GBT_labelsAndPredictions = test.map(lambda lp: lp.label).zip(GBT_predictions)
GBT_metrics = MulticlassMetrics(GBT_labelsAndPredictions)
print('Accuracy = %s' % GBT_metrics.accuracy)
print('Learned classification GBT model:')
print(GBT_model.toDebugString())

AssertionError: the data should be RDD of LabeledPoint

### 方法四：Multilayer Perceptron

In [28]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
# Training
# Input Layer: 8 (features), Hidden Layer: 12 & 3, Output Layer: 7 (label)
mlp = MultilayerPerceptronClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', maxIter=100, layers=[8, 12, 3, 7], solver='gd')
model = mlp.fit(train)
model.weights

DenseVector([-0.7947, 0.2644, 0.1687, -0.003, -0.8, -0.3186, -0.5332, -0.6875, 0.622, 0.187, -0.0053, -0.3149, 0.3494, -0.0279, -0.3729, -0.5595, -0.761, 0.6962, 0.0336, 0.2336, -0.8372, 0.0031, 0.736, 0.0566, -0.6806, 0.1415, 0.0595, 0.0401, -0.3021, 0.546, 0.433, -0.8363, 0.6478, 0.5327, -0.3254, 0.0129, 0.3969, 0.7003, 0.5147, 0.7365, -0.8119, -0.7529, -0.1358, 0.7862, -0.4078, 0.4195, 0.1544, 0.0658, -0.3542, 0.036, -0.4295, -0.4769, -0.4192, -0.8333, -0.6212, 0.5187, -0.5452, 0.6955, 0.1465, -0.3619, -0.5296, -0.1954, 0.2036, -0.1685, -0.2158, -0.0635, 0.3703, -0.6913, -0.753, 0.1671, -0.6528, 0.1887, -0.7331, -0.4326, -0.6428, 0.1578, 0.0187, 0.5103, -0.2622, 0.7054, -0.5331, 0.5268, 0.5179, -0.6358, -0.2656, 0.7124, -0.0805, 0.2158, -0.5013, 0.077, -0.3856, 0.2305, -0.7508, 0.3228, -0.0838, 0.0767, -0.1616, 0.7033, -0.6882, -0.7715, -0.5523, 0.6135, 0.1901, -0.5022, 0.5926, 0.7525, 0.2548, -0.7233, -0.4642, -0.5651, -0.1963, 0.5437, -0.4572, 0.4388, 0.0888, -0.2819, 0.5583, 0.25

In [29]:
# Testing
print('Multilayer Perceptron Accuracy:', evaluator.evaluate(model.transform(test)) * 100, '%')

Multilayer Perceptron Accuracy: 49.50884086444008 %
