# 方法四：Multilayer Perceptron

## PySpark版

### 資料前處理

In [1]:
from pyspark.sql import DataFrame

In [2]:
# Read CSV
spark_df = spark.read.csv('out_police.csv', header=True)
# Drop Unused Column
spark_df = spark_df.drop(spark_df.columns[0])

In [3]:
from pyspark.sql.types import *

In [4]:
# Convert columns types
def convertColumn(df, colNames, newType):
    for name in colNames:
        df = df.withColumn(name, df[name].astype(newType))
    return df


spark_df = convertColumn(spark_df, spark_df.columns[1:], IntegerType())

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# Vectorize Features
vecAssembler = VectorAssembler(inputCols=spark_df.columns[1:], outputCol="features")
spark_df = vecAssembler.transform(spark_df)

In [7]:
from pyspark.ml.feature import StringIndexer

In [8]:
# Encode Labels
stringIndexer = StringIndexer(inputCol='raceethnicity', outputCol='label', handleInvalid='error')
model = stringIndexer.fit(spark_df)
td = model.transform(spark_df)

In [9]:
# Split Data
train, test = td.randomSplit(weights=[0.75, 0.25])

### 訓練Model

In [10]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [11]:
# Training
mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, layers=[8, 12, 3, 7], solver='gd')
model = mlp.fit(train)

In [12]:
# Input Layer: 8 (features), Hidden Layer: 12 & 3, Output Layer: 7 (label)
model.layers

[8, 12, 3, 7]

In [13]:
model.weights

DenseVector([-0.7952, 0.2686, 0.177, -0.0032, -0.8001, -0.3183, -0.5332, -0.6912, 0.6253, 0.187, -0.0041, -0.3147, 0.3494, -0.0279, -0.3729, -0.5595, -0.761, 0.6962, 0.0336, 0.2336, -0.8372, 0.0031, 0.736, 0.0566, -0.6806, 0.1422, 0.0598, 0.0403, -0.3021, 0.546, 0.433, -0.8364, 0.6478, 0.5327, -0.3254, 0.0129, 0.3963, 0.7009, 0.5205, 0.7365, -0.8119, -0.7506, -0.1358, 0.7848, -0.4064, 0.4195, 0.1545, 0.0644, -0.354, 0.0367, -0.428, -0.4769, -0.4192, -0.8333, -0.6212, 0.5178, -0.5443, 0.6955, 0.1465, -0.3619, -0.5296, -0.1953, 0.2037, -0.1685, -0.2158, -0.0635, 0.3703, -0.6914, -0.753, 0.1671, -0.6528, 0.1887, -0.733, -0.4281, -0.6369, 0.1578, 0.0167, 0.5138, -0.2622, 0.704, -0.5315, 0.5268, 0.518, -0.636, -0.2656, 0.7127, -0.0801, 0.2159, -0.5013, 0.077, -0.3856, 0.2305, -0.7507, 0.3228, -0.084, 0.0767, -0.1616, 0.7034, -0.688, -0.7715, -0.5523, 0.6135, 0.1901, -0.5022, 0.5926, 0.7525, 0.2548, -0.7233, -0.4635, -0.5636, -0.1982, 0.5346, -0.4553, 0.4572, 0.0864, -0.2842, 0.5582, 0.2361,

### 測試Model

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
print('Accuracy:', evaluator.evaluate(model.transform(test)) * 100, '%')

Accuracy: 53.2520325203252 %
