# Spark SQL
### Working on the Iris Dataset

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Small hand-written Iris sample: three measurements per species.
# Every record is flattened to
# (sepal_length, sepal_width, petal_length, petal_width, species).
iris_data = [
    (sepal_length, sepal_width, petal_length, petal_width, species)
    for species, measurements in [
        (
            "Iris-setosa",
            [(5.1, 3.5, 1.4, 0.2), (4.9, 3.0, 1.4, 0.2), (4.7, 3.2, 1.3, 0.2)],
        ),
        (
            "Iris-versicolor",
            [(7.0, 3.2, 4.7, 1.4), (6.4, 3.2, 4.5, 1.5), (6.9, 3.1, 4.9, 1.5)],
        ),
        (
            "Iris-virginica",
            [(5.9, 3.0, 5.1, 1.8), (6.8, 3.0, 5.5, 2.1), (6.7, 3.1, 5.6, 2.4)],
        ),
    ]
    for sepal_length, sepal_width, petal_length, petal_width in measurements
]

In [None]:
# Obtain the Spark session for the Iris section, registered as "Myapp".
spark = (
    SparkSession.builder
    .appName("Myapp")
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [None]:
# Column names, listed in the same order as the fields of each iris_data tuple.
columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
]

# Turn the in-memory records into a Spark DataFrame with those column names.
df = spark.createDataFrame(iris_data, schema=columns)

In [None]:
# Print the Iris DataFrame as a text table.
df.show()

+------------+-----------+------------+-----------+---------------+
|sepal_length|sepal_width|petal_length|petal_width|        species|
+------------+-----------+------------+-----------+---------------+
|         5.1|        3.5|         1.4|        0.2|    Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|    Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|    Iris-setosa|
|         7.0|        3.2|         4.7|        1.4|Iris-versicolor|
|         6.4|        3.2|         4.5|        1.5|Iris-versicolor|
|         6.9|        3.1|         4.9|        1.5|Iris-versicolor|
|         5.9|        3.0|         5.1|        1.8| Iris-virginica|
|         6.8|        3.0|         5.5|        2.1| Iris-virginica|
|         6.7|        3.1|         5.6|        2.4| Iris-virginica|
+------------+-----------+------------+-----------+---------------+



In [None]:
# Keep only the rows whose species column equals "Iris-setosa", then print them.
(
    df
    .filter(col("species") == "Iris-setosa")
    .show()
)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+



In [None]:
# Shut down the Spark session now that the Iris section is finished.
spark.stop()

### Working on the Wine Quality Dataset

In [None]:
# Obtain a session for the wine-quality section, registered as "Second App".
spark = (
    SparkSession.builder
    .appName("Second App")
    .getOrCreate()
)

# Bare last expression: the notebook renders the session object as cell output.
spark

In [None]:
# Load the red-wine CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("winequality-red.csv")
)

In [None]:
# Print the wine-quality DataFrame as a text table.
df.show()

+-------------+----------------+-----------+--------------+-------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|          chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+-------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|              0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|              0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|              0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
|         11.2|            0.28|       0

In [None]:
# Keep only the wines whose quality column is 7 or higher, then print them.
(
    df
    .filter(col("quality") >= 7)
    .show()
)

+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|           chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+--------------------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.3|            0.65|        0.0|           1.2|               0.065|               15.0|                21.0| 0.9946|3.39|     0.47|   10.0|      7|
|          7.8|            0.58|       0.02|           2.0|               0.073|                9.0|                18.0| 0.9968|3.36|     0.57|    9.5|      7|
|          8.5|            0.28|       0.56|           1.8|               0.092|               35.0|               103.0| 0.9969| 3.3|     0.75|   10.5|      7|
|          8.1|            0.38|  

In [None]:
# Shut down the Spark session used for the wine-quality section.
spark.stop()