In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook under the app name 'random_forest'.
session_builder = SparkSession.builder.appName('random_forest')
spark = session_builder.getOrCreate()

In [2]:
# Load cars.csv: the first row supplies the column names and column types are inferred.
df = spark.read.csv(
    'cars.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Report the dataset dimensions as a (row count, column count) tuple.
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(5207, 6)


In [4]:
# Show the describe() summary rows restricted to the five feature columns
# (the label column 故障 is left out of this view).
summary_cols = ['summary', '地区', '驾驶员年龄', '驾龄', '每年保养次数', '汽车类型']
df.describe().select(*summary_cols).show()

+-------+------------------+-----------------+-----------------+------------------+------------------+
|summary|              地区|       驾驶员年龄|             驾龄|      每年保养次数|          汽车类型|
+-------+------------------+-----------------+-----------------+------------------+------------------+
|  count|              5207|             5207|             5207|              5207|              5207|
|   mean| 4.156135970808527|28.91357787593624|8.799404647589784|1.3578836182062608|2.4536201267524484|
| stddev|0.9395952913420921|6.860914622077383|7.303439479306463|1.4217206625193524|0.8772697645768409|
|    min|                 1|             17.5|              0.5|               0.0|                 1|
|    max|                 5|             42.0|             23.0|               5.5|                 4|
+-------+------------------+-----------------+-----------------+------------------+------------------+



In [5]:
# Print the inferred column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- 地区: integer (nullable = true)
 |-- 驾驶员年龄: double (nullable = true)
 |-- 驾龄: double (nullable = true)
 |-- 每年保养次数: double (nullable = true)
 |-- 汽车类型: integer (nullable = true)
 |-- 故障: integer (nullable = true)



In [6]:
# Count rows per (汽车类型 = car type, 故障 = fault) pair and list them in ascending order
# of car type, then fault, then count.
car_type_fault_counts = df.groupBy('汽车类型', '故障').count()
car_type_fault_counts.orderBy('汽车类型', '故障', 'count', ascending=True).show()

+--------+----+-----+
|汽车类型|故障|count|
+--------+----+-----+
|       1|   0|  558|
|       1|   1|  238|
|       2|   0| 1334|
|       2|   1|  481|
|       3|   0| 1577|
|       3|   1|  457|
|       4|   0|  484|
|       4|   1|   78|
+--------+----+-----+



In [7]:
# Count rows per (每年保养次数 = maintenance visits per year, 故障 = fault) pair and list them
# in ascending order of maintenance count, then fault, then count.
maintenance_fault_counts = df.groupBy('每年保养次数', '故障').count()
maintenance_fault_counts.orderBy('每年保养次数', '故障', 'count', ascending=True).show()

+------------+----+-----+
|每年保养次数|故障|count|
+------------+----+-----+
|         0.0|   0| 1752|
|         0.0|   1|  285|
|         1.0|   0|  680|
|         1.0|   1|  271|
|         2.0|   0|  810|
|         2.0|   1|  374|
|         3.0|   0|  422|
|         3.0|   1|  205|
|         4.0|   0|  172|
|         4.0|   1|   77|
|         5.5|   0|  117|
|         5.5|   1|   42|
+------------+----+-----+



In [8]:
from pyspark.ml.feature import VectorAssembler  # VectorAssembler: a feature transformer that merges multiple columns into a single vector column

In [9]:
# Configure the assembler that packs the five input columns into one "features" vector column.
feature_cols = ['地区', '驾驶员年龄', '驾龄', '每年保养次数', '汽车类型']
df_vec = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [10]:
# Append the assembled "features" vector column to df.
# The cell reassigns df from itself, so a second execution would hand the assembler a frame
# that already contains its output column, and the transform would fail. The guard makes
# re-running the cell a no-op; on a fresh kernel the behaviour is unchanged.
if "features" not in df.columns:
    df = df_vec.transform(df)

In [11]:
# Print the schema again to confirm the assembled "features" vector column is now present.
df.printSchema()

root
 |-- 地区: integer (nullable = true)
 |-- 驾驶员年龄: double (nullable = true)
 |-- 驾龄: double (nullable = true)
 |-- 每年保养次数: double (nullable = true)
 |-- 汽车类型: integer (nullable = true)
 |-- 故障: integer (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
# Keep only the columns needed for model training: the feature vector and the 故障 (fault) label.
model_df = df.select('features', '故障')

In [13]:
# Split the modelling frame into training and test sets at roughly 7:3.
# randomSplit samples rows at random; without a seed every run produces a different
# partition, so the fitted model and the reported metric cannot be reproduced.
# A fixed seed pins the split (the value itself is arbitrary).
train_df, test_df = model_df.randomSplit([0.7, 0.3], seed=42)

In [14]:
from pyspark.ml.classification import RandomForestClassifier

In [15]:
# Preview the first 10 rows of the training split (feature vector and fault label).
train_df.show(10)

+--------------------+----+
|            features|故障|
+--------------------+----+
|[1.0,22.0,2.5,0.0...|   1|
|[1.0,22.0,2.5,1.0...|   1|
|[1.0,22.0,2.5,1.0...|   1|
|[1.0,22.0,2.5,1.0...|   0|
|[1.0,22.0,2.5,1.0...|   0|
|[1.0,22.0,2.5,1.0...|   1|
|[1.0,22.0,2.5,1.0...|   1|
|[1.0,27.0,2.5,0.0...|   0|
|[1.0,27.0,2.5,0.0...|   1|
|[1.0,27.0,2.5,0.0...|   1|
+--------------------+----+
only showing top 10 rows



In [16]:
# Train a 30-tree random forest that predicts the 故障 (fault) label from the "features" vector.
# Random forests sample rows and candidate features at random while growing trees; an explicit
# seed keeps the fitted model (and the feature importances and metric derived from it)
# reproducible across kernel restarts. The seed value itself is arbitrary.
rf_classifier = RandomForestClassifier(
    featuresCol='features',
    labelCol='故障',
    numTrees=30,
    seed=42,
)
rf_model = rf_classifier.fit(train_df)

In [17]:
# Apply the fitted forest to the held-out test split; the result is displayed and evaluated below.
rf_pred=rf_model.transform(test_df)

In [19]:
# Print the fitted forest's feature-importance vector, prefixed with a fixed label.
importance_label = '评估每个属性的重要性:'
print('{}{}'.format(importance_label, rf_model.featureImportances))

评估每个属性的重要性:(5,[0,1,2,3,4],[0.5850305699488729,0.033799370247371376,0.23301277206058152,0.0795059584230612,0.06865132932011303])


In [21]:
# Display the first 10 prediction rows without truncating the wide vector columns.
rf_pred.show(10, truncate=False)

+-----------------------+----+---------------------------------------+----------------------------------------+----------+
|features               |故障|rawPrediction                          |probability                             |prediction|
+-----------------------+----+---------------------------------------+----------------------------------------+----------+
|[1.0,17.5,0.5,0.0,2.0] |0   |[18.783571907130153,11.216428092869846]|[0.6261190635710051,0.37388093642899484]|0.0       |
|[1.0,22.0,2.5,0.0,1.0] |1   |[13.564496325738299,16.435503674261703]|[0.45214987752460994,0.5478501224753901]|1.0       |
|[1.0,27.0,6.0,1.0,2.0] |0   |[12.825942561920609,17.17405743807939] |[0.42753141873068695,0.572468581269313] |1.0       |
|[1.0,27.0,6.0,1.0,3.0] |1   |[12.272550916929765,17.727449083070226]|[0.4090850305643256,0.5909149694356743] |1.0       |
|[1.0,27.0,6.0,2.0,2.0] |1   |[13.251959727315288,16.748040272684708]|[0.4417319909105097,0.5582680090894904] |1.0       |
|[1.0,27.0,9.0,1.0

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator # Evaluator for binary classification; it expects two input columns: the raw prediction and the label

In [23]:
# Score the test-set predictions against the 故障 (fault) label.
# No metricName is passed, so the evaluator's default metric is used.
fault_evaluator = BinaryClassificationEvaluator(labelCol='故障')
rf_bin = fault_evaluator.evaluate(rf_pred)

In [30]:
# Print the binary-classification evaluation score, prefixed with a fixed label.
metric_label = '使用二元分类评估其器的评估结果:'
print('{}{}'.format(metric_label, rf_bin))

使用二元分类评估其器的评估结果:0.734692458171


In [31]:
# Bare expression: the notebook displays the importance vector's repr as the cell output.
rf_model.featureImportances

SparseVector(5, {0: 0.585, 1: 0.0338, 2: 0.233, 3: 0.0795, 4: 0.0687})

In [43]:
# Read the ML attribute metadata attached to the "features" column; it records
# which source column sits at each index of the assembled vector.
features_field = df.schema["features"]
import_feature = features_field.metadata["ml_attr"]["attrs"]

In [48]:
import json

# Dump the feature-index metadata as JSON, keeping non-ASCII column names readable
# (ensure_ascii=False) instead of \uXXXX escapes.
# Uses the print() call form like the rest of the notebook and omits the Python 2-only
# `encoding` keyword: 'utf-8' is already its default on Python 2, and json.dumps rejects
# the argument on Python 3.
print(json.dumps(import_feature, ensure_ascii=False))

{"numeric": [{"name": "地区", "idx": 0}, {"name": "驾驶员年龄", "idx": 1}, {"name": "驾龄", "idx": 2}, {"name": "每年保养次数", "idx": 3}, {"name": "汽车类型", "idx": 4}]}


{u'numeric': [{u'idx': 0, u'name': u'\u5730\u533a'},
  {u'idx': 1, u'name': u'\u9a7e\u9a76\u5458\u5e74\u9f84'},
  {u'idx': 2, u'name': u'\u9a7e\u9f84'},
  {u'idx': 3, u'name': u'\u6bcf\u5e74\u4fdd\u517b\u6b21\u6570'},
  {u'idx': 4, u'name': u'\u6c7d\u8f66\u7c7b\u578b'}]}