In [1]:
from pyspark.sql import SparkSession

In [2]:
# Attach to an already-running Spark session if one exists; otherwise start one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [7]:
# Load the white-wine quality data. The file is semicolon-delimited with a
# header row, and column types are inferred by Spark instead of being declared.
df = spark.read.csv(
    "winequality-white.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [8]:
# Print the inferred column names and types to confirm the CSV parsed as expected.
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# Use every column except the target as a model input. The label is excluded by
# name rather than by position, so a reordered input file cannot silently leak
# `quality` into the feature vector (or drop a real feature instead).
train_cols = [col_name for col_name in df.columns if col_name != 'quality']

In [11]:
# Display the selected feature column names.
train_cols

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [12]:
# Combine the individual feature columns into a single vector column named
# 'features', which is the input column the estimators below are configured with.
vectorAssembler = VectorAssembler(
    inputCols=train_cols,
    outputCol='features',
)

In [13]:
# Apply the assembler to the raw DataFrame, adding the 'features' vector column.
v_df = vectorAssembler.transform(df)

In [14]:
# Keep only the assembled input vector and the target column.
v_df = v_df.select('features', 'quality')

In [15]:
# Preview the first three assembled rows.
v_df.show(3)

+--------------------+-------+
|            features|quality|
+--------------------+-------+
|[7.0,0.27,0.36,20...|      6|
|[6.3,0.3,0.34,1.6...|      6|
|[8.1,0.28,0.4,6.9...|      6|
+--------------------+-------+
only showing top 3 rows



In [16]:
# Hold out roughly 20% of the rows for evaluation. randomSplit is stochastic, so
# a fixed seed is supplied to make the split -- and every metric derived from
# it -- repeatable across kernel restarts (for the same input data and
# partitioning).
(train_df, test_df) = v_df.randomSplit([0.8, 0.2], seed=42)

In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
# Linear regression that treats the integer quality score as a continuous
# target; all other hyper-parameters are left at their defaults.
lr = LinearRegression(
    labelCol='quality',
    featuresCol='features',
)

In [19]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_df)

In [20]:
# Error metrics reported by the fitted model's summary, i.e. measured on the
# data the model was trained on -- not on the held-out split.
trainingSummary = lr_model.summary
print(
    "RMSE: %f" % trainingSummary.rootMeanSquaredError,
    "MSE: %f" % trainingSummary.meanSquaredError,
    "MAE: %f" % trainingSummary.meanAbsoluteError,
    sep="\n",
)

RMSE: 0.747592
MSE: 0.558893
MAE: 0.581527


In [24]:
# Score the held-out rows with the fitted regression.
lr_predictions = lr_model.transform(test_df)

# Show a few predictions next to the true quality score.
lr_predictions.select(["prediction", "quality", "features"]).show(5)

+------------------+-------+--------------------+
|        prediction|quality|            features|
+------------------+-------+--------------------+
| 6.734581909917637|      8|[4.4,0.32,0.39,4....|
|  5.32580751318082|      5|[4.5,0.19,0.21,0....|
|5.7592699342519325|      5|[4.7,0.335,0.14,1...|
| 6.204351982645363|      6|[4.7,0.785,0.0,3....|
| 6.389218154059137|      7|[4.8,0.17,0.28,2....|
+------------------+-------+--------------------+
only showing top 5 rows



In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that scores the regression predictions against the true quality
# score using root-mean-squared error.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="quality",
    metricName="rmse",
)

# The evaluator is configured with metricName="rmse", so the printed value is
# labelled RMSE (not MSE) to match what is actually computed.
print("RMSE on test data = %g" % lr_evaluator.evaluate(lr_predictions))

MSE on test data = 0.762249


In [27]:
from pyspark.ml.classification import DecisionTreeClassifier

In [28]:
# Decision-tree classifier that uses the quality score as the class label;
# all other hyper-parameters are left at their defaults.
dtc = DecisionTreeClassifier(
    labelCol='quality',
    featuresCol='features',
)

In [29]:
# Fit the decision tree on the same training split used for the regression.
dtc_model = dtc.fit(train_df)

In [30]:
# Echo the training DataFrame's repr (column names and types) as a sanity check.
train_df

DataFrame[features: vector, quality: int]

In [31]:
# Peek at the first row of the held-out split.
test_df.head()

Row(features=DenseVector([4.4, 0.32, 0.39, 4.3, 0.03, 31.0, 127.0, 0.989, 3.46, 0.36, 12.8]), quality=8)

In [34]:
# Classify the held-out rows with the fitted decision tree.
dtc_prediction = dtc_model.transform(test_df)

In [35]:
# Compare predicted and actual classes; show() with no argument prints the
# first 20 rows.
dtc_prediction.select(["prediction", "quality", "features"]).show()

+----------+-------+--------------------+
|prediction|quality|            features|
+----------+-------+--------------------+
|       7.0|      8|[4.4,0.32,0.39,4....|
|       5.0|      5|[4.5,0.19,0.21,0....|
|       6.0|      5|[4.7,0.335,0.14,1...|
|       7.0|      6|[4.7,0.785,0.0,3....|
|       6.0|      7|[4.8,0.17,0.28,2....|
|       6.0|      6|[4.9,0.47,0.17,1....|
|       6.0|      7|[5.0,0.17,0.56,1....|
|       6.0|      6|[5.0,0.235,0.27,1...|
|       6.0|      5|[5.0,0.24,0.19,5....|
|       5.0|      6|[5.0,0.31,0.0,6.4...|
|       5.0|      6|[5.0,0.33,0.16,1....|
|       5.0|      6|[5.0,0.33,0.16,1....|
|       6.0|      6|[5.0,0.33,0.18,4....|
|       6.0|      6|[5.0,0.33,0.23,11...|
|       6.0|      6|[5.0,0.44,0.04,18...|
|       6.0|      6|[5.1,0.165,0.22,5...|
|       6.0|      5|[5.1,0.21,0.28,1....|
|       6.0|      5|[5.1,0.23,0.18,1....|
|       6.0|      7|[5.1,0.26,0.33,1....|
|       6.0|      6|[5.1,0.29,0.28,8....|
+----------+-------+--------------

In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [39]:
dtс_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="quality", metricName="accuracy")

In [40]:
dtс_evaluator.evaluate(dtc_prediction)

0.5118110236220472