In [None]:
# Import packages
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train and evaluate a random-forest classifier that predicts the
# 'ArrivalDelayGroups30' column from the flight / weather / aircraft columns.
# Relies on an existing SparkSession bound to the name `spark`.

# Read the data. Parquet files carry their own schema, so the CSV-only
# reader options (header / inferSchema) are not passed here.
data = spark.read.parquet('withWeatherDelayencodever2')

# Input columns, in the order they are packed into the feature vector.
# NOTE(review): VectorAssembler is given columns such as 'Origin', 'Dest' and
# 'Tail Number' — presumably already numerically encoded upstream; confirm.
FEATURE_COLUMNS = [
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
    'CRSDepTime', 'CRSArrTime', 'Distance',
    'AWND', 'PRCP', 'TMAX', 'TMIN', 'WSF2', 'WSF5', 'SNOW',
    'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09',
    'WT10', 'WT11', 'WT18',
    'MFR Year', 'Origin', 'Dest', 'Tail Number', 'Manufacturer Name', 'Model',
    'Airline_AA', 'Airline_AS', 'Airline_B6', 'Airline_DL', 'Airline_F9',
    'Airline_G4', 'Airline_HA', 'Airline_NK', 'Airline_UA', 'Airline_VX',
    'Airline_WN',
    'TypeOfAircraft_Balloon', 'TypeOfAircraft_Fixed Wing Multi-Engine',
    'TypeOfAircraft_Fixed Wing Single-Engine', 'TypeOfAircraft_Rotorcraft',
    'TypeOfEngine_4 Cycle', 'TypeOfEngine_None', 'TypeOfEngine_Reciprocating',
    'TypeOfEngine_Turbo-fan', 'TypeOfEngine_Turbo-jet',
    'TypeOfEngine_Turbo-prop', 'TypeOfEngine_Turbo-shaft',
]

# Fixed seed so the train/test split (and therefore the reported accuracy)
# is repeatable between runs.
SPLIT_SEED = 42

# Merge the feature columns into a single vector column, 'indexedFeatures'.
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS,
                            outputCol='indexedFeatures')
# Despite its name, `featureIndexer` is the assembled DataFrame, not an indexer.
featureIndexer = assembler.transform(data)
data = featureIndexer.select(['indexedFeatures', 'ArrivalDelayGroups30'])

# Re-encode ArrivalDelayGroups30 as label indices in 'indexedLabel'.
# The indexer is fitted on the full dataset, before the train/test split.
labelIndexer = StringIndexer(inputCol='ArrivalDelayGroups30',
                             outputCol='indexedLabel').fit(data)
data = labelIndexer.transform(data)
data = data.select(['indexedFeatures', 'indexedLabel'])

# 80% training / 20% test split.
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Random-forest classifier reading the feature vector and the indexed label.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures", numTrees=20)

# Map numeric predictions back to the original label values in
# 'predictedLabel'.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain the stages into one pipeline.
# NOTE(review): `data` has already been run through labelIndexer above and
# 'ArrivalDelayGroups30' was dropped, so this stage has no input column left
# on trainingData / testData. Spark's StringIndexerModel appears to skip
# itself when its input column is absent — verify for the Spark version in
# use. The stage is kept so the layout of model.stages is unchanged; confirm
# whether it should stay in the pipeline.
pipeline = Pipeline(stages=[labelIndexer, rf, labelConverter])

# Fit the pipeline on the training data.
model = pipeline.fit(trainingData)

# Predict on the held-out test data.
predictions = model.transform(testData)

# Evaluate with accuracy (not r2) and print the result.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy is = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

# Fetch the fitted random-forest model. Fitted stages sit at the same
# positions as the pipeline's stages, so the position of `rf` is looked up
# rather than hard-coded (index 2 is the IndexToString converter, not the
# forest).
rfModel = model.stages[pipeline.getStages().index(rf)]
print(rfModel)  # summary only