In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from pyspark.mllib.regression import LabeledPoint
import matplotlib.pyplot as plt

In [3]:
# 1 read data
df_2015 = pd.DataFrame(pd.read_csv('file:/media/sf_Hadoop/data/the-counted-2015.csv'))
df_2016 = pd.DataFrame(pd.read_csv('file:/media/sf_Hadoop/data/the-counted-2016.csv'))
df = pd.concat([df_2015, df_2016], ignore_index = True)

In [4]:
# 2 處理資料
df = df.drop(['uid', 'name', 'day', 'year', 'streetaddress'], 1)
df = df.drop(df.index[np.where((df['raceethnicity'] == 'Unknown') | (df['age'] == 'Unknown') | (df['armed'] == 'Unknown') | (df['lawenforcementagency'] == 'Unknown'))])
# replace data to int
df = df.replace(['Male', 'Female', 'Non-conforming', '40s'], ['0', '1', '0', '40'])
df['month'] = df['month'].replace(list(Counter(df['month'])), np.arange(len(Counter(df['month']))) + 1)
df['city'] = df['city'].replace(list(Counter(df['city'])), np.arange(len(Counter(df['city']))))
df['state'] = df['state'].replace(list(Counter(df['state'])), np.arange(len(Counter(df['state']))))
df['classification'] = df['classification'].replace(list(Counter(df['classification'])), np.arange(len(Counter(df['classification']))))
df['lawenforcementagency'] = df['lawenforcementagency'].replace(list(Counter(df['lawenforcementagency'])), np.arange(len(Counter(df['lawenforcementagency']))))
df['armed'] = df['armed'].replace(list(Counter(df['armed'])), np.arange(len(Counter(df['armed']))))
# target value
cls = df['raceethnicity']
df = df.drop(['raceethnicity'], 1)
df.insert(loc = 0, column = 'raceethnicity', value = cls)
#df.to_csv('out_police.csv')
# convert to spark dataframe
dfSpark = spark.createDataFrame(df)

In [5]:
# 3 split data
train, test = dfSpark.randomSplit([0.75, 0.25])
train = train.rdd.map(lambda x: LabeledPoint(x[1], x[2:])) # convert to labeled RDD
test = test.rdd.map(lambda x: LabeledPoint(x[1], x[2:]))
#print(type(train), train.count())

In [14]:
# 4 GradientBoostedTree (GBT)
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# Train a GradientBoostedTrees model
model = GradientBoostedTrees.trainClassifier(train,categoricalFeaturesInfo={}, numIterations=10)
# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())
print('Accuracy = ' + str(1-testErr))
print('Learned classification GBT model:')
print(model.toDebugString())

Accuracy = 0.0
Learned classification GBT model:
TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 4 <= 3.0)
     If (feature 5 <= 1164.0)
      If (feature 3 <= 41.0)
       Predict: 71.38407494145198
      Else (feature 3 > 41.0)
       Predict: 78.47058823529412
     Else (feature 5 > 1164.0)
      If (feature 4 <= 1.0)
       Predict: 88.12820512820512
      Else (feature 4 > 1.0)
       Predict: 64.2
    Else (feature 4 > 3.0)
     If (feature 1 <= 11.0)
      If (feature 2 <= 1137.0)
       Predict: 84.0909090909091
      Else (feature 2 > 1137.0)
       Predict: 132.33333333333334
     Else (feature 1 > 11.0)
      If (feature 3 <= 29.0)
       Predict: 163.66666666666666
      Else (feature 3 > 29.0)
       Predict: 55.0
  Tree 1:
    Predict: 0.0
  Tree 2:
    Predict: 0.0
  Tree 3:
    Predict: 0.0
  Tree 4:
    Predict: 0.0
  Tree 5:
    Predict: 0.0
  Tree 6:
    Predict: 0.0
  Tree 7:
    Predict: 0.0
  Tree 8:
    Predict: 0.0
  Tree 9:
    Predict: 0.

In [13]:
df

Unnamed: 0,raceethnicity,age,gender,month,city,state,classification,lawenforcementagency,armed
0,Black,22,0,6,350,28,1,723,1
1,White,47,0,6,876,49,0,672,5
2,White,19,0,6,334,5,4,578,1
3,Hispanic/Latino,23,0,6,989,44,0,412,1
4,Asian/Pacific Islander,53,0,6,921,0,0,1171,5
5,White,32,0,6,663,23,0,651,4
6,Hispanic/Latino,22,0,6,809,32,0,637,5
7,Hispanic/Latino,39,0,6,200,22,0,1161,2
8,White,25,0,6,1077,23,0,159,6
9,Black,26,0,6,680,23,3,1014,1
