<a href="https://colab.research.google.com/github/Tanuhlik/BigData/blob/master/White_wine_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Point JAVA_HOME and SPARK_HOME at the installed JDK and the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.3-bin-hadoop2.7",
})

In [0]:
import findspark

# findspark.init() has to run before pyspark is imported: it locates the Spark
# installation (via SPARK_HOME) and makes pyspark importable.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a session on a local master using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# Load the wine-quality CSV: the first row is used as the header and column types are inferred.
# NOTE(review): the path is relative and nothing above downloads the file —
# it has to be placed in the working directory before this cell runs.
df = spark.read.csv("winequality-white.csv", header=True, inferSchema=True)

In [5]:
# Show the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [6]:
# Preview the first three rows.
df.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+

In [0]:
# Drop the columns that are not used as model features, in a single call.
df = df.drop(
    'citric acid',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
)

In [15]:
# Check which columns remain after the drop.
df.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [0]:
# Discard every row that contains a null value.
df = df.na.drop()

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Use every column except the label as a feature. Selecting by name instead of by
# position keeps 'quality' out of the feature list even if the set or order of
# columns changes.
train_cols = [col_name for col_name in df.columns if col_name != 'quality']

In [24]:
# Display the selected feature column names.
train_cols

['fixed acidity',
 'volatile acidity',
 'residual sugar',
 'density',
 'pH',
 'alcohol']

In [0]:
# Assembler that packs the feature columns into one vector column called 'features'.
vector = VectorAssembler(
    inputCols=train_cols,
    outputCol='features',
)

In [0]:
# Append the assembled 'features' vector column to the DataFrame.
v_df = vector.transform(df)

In [0]:
# Keep only the feature vector and the label column.
v_df = v_df.select('features', 'quality')

In [28]:
# Preview the first three rows of the features/label DataFrame.
v_df.show(3)

+--------------------+-------+
|            features|quality|
+--------------------+-------+
|[7.0,0.27,20.7,1....|      6|
|[6.3,0.3,1.6,0.99...|      6|
|[8.1,0.28,6.9,0.9...|      6|
+--------------------+-------+
only showing top 3 rows



In [0]:
# 80/20 train/test split. A fixed seed makes the split reproducible, so the metrics
# computed from it do not change from one run of the notebook to the next.
train_df, test_df = v_df.randomSplit([0.8, 0.2], seed=42)

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression estimator: predicts 'quality' from the 'features' vector column.
lr = LinearRegression(featuresCol='features', labelCol='quality')

In [0]:
# Fit the model on the training split.
lr_model = lr.fit(train_df)

In [0]:
# Summary object of the fitted model; the metrics printed below are read from it.
trainingSummary = lr_model.summary

In [34]:
# Root mean squared error reported by the model summary.
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

RMSE: 0.752171


In [35]:
# Mean squared error reported by the model summary.
print("MSE: %f" % trainingSummary.meanSquaredError)

MSE: 0.565761


In [36]:
# Mean absolute error reported by the model summary.
print("MAE: %f" % trainingSummary.meanAbsoluteError)

MAE: 0.589120


In [37]:
# Score the held-out test split with the fitted model.
lr_predictions = lr_model.transform(test_df)
# Show five predictions next to the true label and the input features.
lr_predictions.select("prediction","quality","features").show(5)

+-----------------+-------+--------------------+
|       prediction|quality|            features|
+-----------------+-------+--------------------+
|6.599428561855973|      6|[4.7,0.145,1.0,0....|
|6.662878012000661|      7|[4.8,0.13,1.2,0.9...|
|6.974324408599443|      7|[4.8,0.21,10.2,0....|
|7.046987334216624|      8|[4.9,0.33,1.2,0.9...|
|5.531607118427814|      5|[4.9,0.345,1.0,0....|
+-----------------+-------+--------------------+
only showing top 5 rows



In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Evaluator that scores the 'prediction' column against the 'quality' label using RMSE.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="quality",
    metricName="rmse",
)

In [40]:
# lr_evaluator is configured with metricName="rmse", so the value printed here is the
# root mean squared error on the test split (the label previously said "MSE").
print("RMSE on test data = %g" % lr_evaluator.evaluate(lr_predictions))

MSE on test data = 0.770628
