In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pyspark.mllib.regression import LabeledPoint
import matplotlib.pyplot as plt

In [2]:
# 1 read data
df_2015 = pd.DataFrame(pd.read_csv('file:/media/sf_Ubuntu/the-counted-2015.csv'))
df_2016 = pd.DataFrame(pd.read_csv('file:/media/sf_Ubuntu/the-counted-2016.csv'))
df = pd.concat([df_2015, df_2016], ignore_index = True)

In [3]:
# 2 處理資料
df = df.drop(['uid', 'name', 'day', 'year', 'streetaddress'], 1)
df = df.drop(df.index[np.where((df['raceethnicity'] == 'Unknown') | (df['age'] == 'Unknown') | (df['armed'] == 'Unknown') | (df['lawenforcementagency'] == 'Unknown'))])

# replace data to int
df = df.replace(['Male', 'Female', 'Non-conforming', '40s'], ['0', '1', '0', '40'])
df['month'] = df['month'].replace(list(Counter(df['month'])), np.arange(len(Counter(df['month']))) + 1)
df['city'] = df['city'].replace(list(Counter(df['city'])), np.arange(len(Counter(df['city']))))
df['state'] = df['state'].replace(list(Counter(df['state'])), np.arange(len(Counter(df['state']))))
df['classification'] = df['classification'].replace(list(Counter(df['classification'])), np.arange(len(Counter(df['classification']))))
df['lawenforcementagency'] = df['lawenforcementagency'].replace(list(Counter(df['lawenforcementagency'])), np.arange(len(Counter(df['lawenforcementagency']))))
df['armed'] = df['armed'].replace(list(Counter(df['armed'])), np.arange(len(Counter(df['armed']))))

# target value
cls = df['raceethnicity']
df = df.drop(['raceethnicity'], 1)
df.insert(loc = 0, column = 'raceethnicity', value = cls)
df['raceethnicity'] = df['raceethnicity'].replace(list(Counter(df['raceethnicity'])), np.arange(len(Counter(df['raceethnicity']))))
#df.to_csv('out_police.csv')

# convert to spark dataframe
dfSpark = spark.createDataFrame(df)

In [4]:
df

Unnamed: 0,raceethnicity,age,gender,month,city,state,classification,lawenforcementagency,armed
0,2,22,0,6,350,28,1,723,1
1,5,47,0,6,876,49,0,672,5
2,5,19,0,6,334,5,4,578,1
3,3,23,0,6,989,44,0,412,1
4,0,53,0,6,921,0,0,1171,5
5,5,32,0,6,663,23,0,651,4
6,3,22,0,6,809,32,0,637,5
7,3,39,0,6,200,22,0,1161,2
8,5,25,0,6,1077,23,0,159,6
9,2,26,0,6,680,23,3,1014,1


In [5]:
dfSpark

DataFrame[raceethnicity: bigint, age: string, gender: string, month: bigint, city: bigint, state: bigint, classification: bigint, lawenforcementagency: bigint, armed: bigint]

In [6]:
# 3 split data
train, test = dfSpark.randomSplit([0.75, 0.25])
train = train.rdd.map(lambda x: LabeledPoint(x[0], x[1:])) # convert to labeled RDD
test = test.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
#print(type(train), train.count())

In [8]:
# 4-1 Decision Tree
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import MulticlassMetrics
# train a model
DT_model = DecisionTree.trainClassifier(train, numClasses = 7, categoricalFeaturesInfo = {}, impurity = 'gini', maxDepth = 5, maxBins = 32)
# evaluate model and compute test error
DT_predictions = DT_model.predict(test.map(lambda x: x.features))
DT_labelsAndPredictions = test.map(lambda x: x.label).zip(DT_predictions)
DT_metrics = MulticlassMetrics(DT_labelsAndPredictions)
print('Accuracy = %s' % DT_metrics.accuracy)
print('Decision Tree Classification Model:')
print(DT_model.toDebugString())

Accuracy = 0.518590998043
Decision Tree Classification Model:
DecisionTreeModel classifier of depth 5 with 59 nodes
  If (feature 0 <= 41.0)
   If (feature 4 <= 34.0)
    If (feature 4 <= 31.0)
     If (feature 0 <= 25.0)
      If (feature 4 <= 22.0)
       Predict: 2.0
      Else (feature 4 > 22.0)
       Predict: 3.0
     Else (feature 0 > 25.0)
      If (feature 4 <= 21.0)
       Predict: 5.0
      Else (feature 4 > 21.0)
       Predict: 5.0
    Else (feature 4 > 31.0)
     If (feature 0 <= 28.0)
      If (feature 0 <= 26.0)
       Predict: 5.0
      Else (feature 0 > 26.0)
       Predict: 3.0
     Else (feature 0 > 28.0)
      If (feature 6 <= 490.0)
       Predict: 5.0
      Else (feature 6 > 490.0)
       Predict: 5.0
   Else (feature 4 > 34.0)
    If (feature 0 <= 28.0)
     If (feature 2 <= 9.0)
      If (feature 6 <= 417.0)
       Predict: 5.0
      Else (feature 6 > 417.0)
       Predict: 2.0
     Else (feature 2 > 9.0)
      If (feature 3 <= 366.0)
       Predict: 2.0
      

In [27]:
# 4-2 Random Forest
from pyspark.mllib.tree import RandomForest, RandomForestModel
# train a model
RF_model = RandomForest.trainClassifier(train, numClasses = 7, categoricalFeaturesInfo = {},\
                                     numTrees = 10, featureSubsetStrategy = 'auto', impurity = 'gini', maxDepth = 4, maxBins = 32)
# evaluate model and compute test error
RF_predictions = RF_model.predict(test.map(lambda x: x.features))
RF_labelsAndPredictions = test.map(lambda x: x.label).zip(RF_predictions)
RF_metrics = MulticlassMetrics(RF_labelsAndPredictions)
print('Accuracy = %s' % RF_metrics.accuracy)
print('Random Forest Classification Model:')
print(RF_model.toDebugString())

Accuracy = 0.555772994129
Random Forest Classification Model:
TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 0 <= 28.0)
     If (feature 4 <= 34.0)
      If (feature 4 <= 21.0)
       If (feature 3 <= 116.0)
        Predict: 5.0
       Else (feature 3 > 116.0)
        Predict: 2.0
      Else (feature 4 > 21.0)
       If (feature 2 <= 3.0)
        Predict: 3.0
       Else (feature 2 > 3.0)
        Predict: 3.0
     Else (feature 4 > 34.0)
      If (feature 3 <= 634.0)
       If (feature 3 <= 261.0)
        Predict: 2.0
       Else (feature 3 > 261.0)
        Predict: 5.0
      Else (feature 3 > 634.0)
       If (feature 2 <= 9.0)
        Predict: 2.0
       Else (feature 2 > 9.0)
        Predict: 5.0
    Else (feature 0 > 28.0)
     If (feature 5 <= 3.0)
      If (feature 0 <= 45.0)
       If (feature 4 <= 31.0)
        Predict: 5.0
       Else (feature 4 > 31.0)
        Predict: 5.0
      Else (feature 0 > 45.0)
       If (feature 6 <= 1141.0)
        Predict: 5.

In [48]:
# 4-3 Naive Bayes
# train a model
NB_model = NaiveBayes.train(train, 1.0)
# make prediction and accuracy
NB_predictionAndLabels = test.map(lambda p: (NB_model.predict(p.features), p.label))
NB_accuracy = 1.0 * NB_predictionAndLabels.filter(lambda (x, v): x == v).count() / test.count()
print('Accuracy = {}'.format(NB_accuracy))

ValueError: `data` should be an RDD of LabeledPoint

In [39]:
#from sklearn.datasets import dump_svmlight_file
#from sklearn.datasets import load_svmlight_file
#X = df.iloc[:, 1:9]
#y = df.iloc[:, 0]
#dump_svmlight_file(X, y, 'smvlight.dat', zero_based = True, multilabel = False)
#inputData = load_svmlight_file('smvlight.dat')

In [47]:
#from pyspark.ml.classification import LogisticRegression, OneVsRest
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#inputData = spark.read.format('libsvm').load('file:/media/sf_Ubuntu/sample_libsvm_data.txt')
#inputData
# generate the train/test split.
#(train, test) = inputData.randomSplit([0.8, 0.2])
# instantiate the base classifier
#lr = LogisticRegression(maxIter = 10, tol = 1E-6, fitIntercept = True)
# instantiate the One Vs Rest Classifier
#ovr = OneVsRest(classifier = lr)
# train a model
#ovrModel = ovr.fit(train)
# evaluate model and compute test error
#OVR_predictions = ovrModel.transform(test)
#OVR__metrics = MulticlassMetrics(OVR_predictions)
#print('Accuracy = %s' % OVR__metrics.accuracy)

DataFrame[label: double, features: vector]

In [15]:
# Gradient-Boosted Trees
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
# train a model
GBT_model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo = {}, numIterations = 100)
# evaluate model and compute test error
GBT_predictions = GBT_model.predict(test.map(lambda x: x.features))
GBT_labelsAndPredictions = test.map(lambda lp: lp.label).zip(GBT_predictions)
GBT_metrics = MulticlassMetrics(GBT_labelsAndPredictions)
print('Accuracy = %s' % GBT_metrics.accuracy)
print('Learned classification GBT model:')
print(GBT_model.toDebugString())

Accuracy = 0.0
Learned classification GBT model:
TreeEnsembleModel classifier with 100 trees

  Tree 0:
    If (feature 0 <= 41.0)
     If (feature 0 <= 28.0)
      If (feature 6 <= 126.0)
       Predict: 6.777777777777778
      Else (feature 6 > 126.0)
       Predict: 5.399026763990268
     Else (feature 0 > 28.0)
      If (feature 4 <= 31.0)
       Predict: 6.11358574610245
      Else (feature 4 > 31.0)
       Predict: 7.402684563758389
    Else (feature 0 > 41.0)
     If (feature 7 <= 3.0)
      If (feature 4 <= 28.0)
       Predict: 6.362637362637362
      Else (feature 4 > 28.0)
       Predict: 7.705882352941177
     Else (feature 7 > 3.0)
      If (feature 4 <= 2.0)
       Predict: 5.5
      Else (feature 4 > 2.0)
       Predict: 7.761904761904762
  Tree 1:
    If (feature 4 <= 5.0)
     If (feature 3 <= 116.0)
      If (feature 0 <= 47.0)
       Predict: -3.999965979210685
      Else (feature 0 > 47.0)
       Predict: 0.0
     Else (feature 3 > 116.0)
      If (feature 6 <= 126.