In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

In [None]:
# Read the raw training CSV. No header option is set, so Spark assigns the
# positional column names _c0, _c1, ... and infers each column's type.
trainingDFAllCol = (
    spark.read.format("csv")
    .options(sep=",", inferSchema="true")
    .load("training.csv")
)

# Keep only the two columns the model needs, under the names the pipeline expects.
trainingDFPre = trainingDFAllCol.selectExpr("_c0 as label", "_c5 as text")
# Spark ML estimators need a numeric label column; cast it to float explicitly.
# NOTE(review): the set of label values is not visible here -- confirm they are
# the 0/1 values a binary classifier and binary metrics expect.
label_as_float = trainingDFPre["label"].cast("float")
trainingDF = trainingDFPre.withColumn("label", label_as_float)

In [None]:
# 80/20 train/hold-out split; the fixed seed keeps the split repeatable across runs.
trainingDFSplit = trainingDF.randomSplit(weights=[0.8, 0.2], seed=2)

In [None]:
# Feature extraction: split the raw text into words, then hash the words into
# a term-frequency vector stored in the "features" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Candidate classifiers. Only `lr` is wired into the pipeline below; `rfc` and
# `dt` are configured but not used by it.
# regParam is set explicitly to 0.3 (Spark's built-in default for regParam is 0.0).
lr = LogisticRegression(
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)
rfc = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=128,
    seed=42,
)
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=2,
)

# Three-stage pipeline: tokenizer -> hashingTF -> logistic regression.
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit every pipeline stage on the training portion (first element of the split).
model = pipeline.fit(dataset=trainingDFSplit[0])

In [None]:
# Persist the fitted pipeline. A plain save() raises if the target directory
# already exists, which makes this cell fail on every re-run; overwrite() keeps
# the notebook runnable end to end.
# NOTE(review): the destination is an absolute, machine-specific path -- consider
# moving it to a configurable constant.
model.write().overwrite().save("file:/home/student/Desktop/twitch-big-data-project/models/lr_regParam0.3")

In [None]:
# Score the held-out split with the fitted pipeline.
prediction = model.transform(trainingDFSplit[1])

# Hard (prediction, label) pairs, kept under the original name for any later use.
predictionAndLabels = prediction.select("prediction", "label").rdd

# BinaryClassificationMetrics expects (score, label) pairs and ranks rows by the
# score to trace the ROC curve. Feeding it thresholded predictions collapses the
# curve to a single operating point, so the reported area is not a ranking AUC.
# Use the probability of the last class index as the score instead; for 0/1
# labels this is P(label == 1).
# NOTE(review): assumes the highest label value marks the positive class -- confirm.
scoreAndLabels = prediction.select("probability", "label").rdd.map(
    lambda row: (float(row["probability"][-1]), float(row["label"]))
)
metrics = BinaryClassificationMetrics(scoreAndLabels)
print("Area under ROC = %s" % metrics.areaUnderROC)