In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
spark = SparkSession.builder.appName("Data EDA").getOrCreate()

In [0]:
df = spark.read.csv("/FileStore/tables/iris-1.csv",header=True,inferSchema=True)

In [0]:
df.printSchema()

root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)



In [0]:
df.groupby("variety").count().show()

+----------+-----+
|   variety|count|
+----------+-----+
| Virginica|   50|
|    Setosa|   50|
|Versicolor|   50|
+----------+-----+



In [0]:
df.show(2)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 2 rows



In [0]:
df = df.withColumnRenamed("sepal.width", "sepal_width").withColumnRenamed("sepal.length", "sepal_length").withColumnRenamed("petal.length", "petal_length").withColumnRenamed("petal.width", "petal_width")
df.groupby("variety").agg({"sepal_length":"avg","sepal_width":"avg"}).show()

+----------+------------------+-----------------+
|   variety|  avg(sepal_width)|avg(sepal_length)|
+----------+------------------+-----------------+
| Virginica|2.9739999999999998|6.587999999999998|
|    Setosa| 3.428000000000001|5.005999999999999|
|Versicolor|2.7700000000000005|            5.936|
+----------+------------------+-----------------+



In [0]:
df.filter(df["petal_length"] > 1.5).show()

+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|
+------------+-----------+------------+-----------+----------+
|         5.4|        3.9|         1.7|        0.4|    Setosa|
|         4.8|        3.4|         1.6|        0.2|    Setosa|
|         5.7|        3.8|         1.7|        0.3|    Setosa|
|         5.4|        3.4|         1.7|        0.2|    Setosa|
|         5.1|        3.3|         1.7|        0.5|    Setosa|
|         4.8|        3.4|         1.9|        0.2|    Setosa|
|         5.0|        3.0|         1.6|        0.2|    Setosa|
|         5.0|        3.4|         1.6|        0.4|    Setosa|
|         4.7|        3.2|         1.6|        0.2|    Setosa|
|         4.8|        3.1|         1.6|        0.2|    Setosa|
|         5.0|        3.5|         1.6|        0.6|    Setosa|
|         5.1|        3.8|         1.9|        0.4|    Setosa|
|         5.1|        3.8|         1.6|        0.2|    

In [0]:
df = df.withColumn("sepal_area",round(df["sepal_length"] * df["sepal_width"],2))

In [0]:
df.show(5)

+------------+-----------+------------+-----------+-------+----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|sepal_area|
+------------+-----------+------------+-----------+-------+----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|     17.85|
|         4.9|        3.0|         1.4|        0.2| Setosa|      14.7|
|         4.7|        3.2|         1.3|        0.2| Setosa|     15.04|
|         4.6|        3.1|         1.5|        0.2| Setosa|     14.26|
|         5.0|        3.6|         1.4|        0.2| Setosa|      18.0|
+------------+-----------+------------+-----------+-------+----------+
only showing top 5 rows



In [0]:
df.createOrReplaceTempView("iris_table")

In [0]:
spark.sql("SELECT variety,MAX(sepal_width) as max_sepal_width,MIN(sepal_width) as max_sepal_width FROM iris_table GROUP BY variety ").show()

+----------+---------------+---------------+
|   variety|max_sepal_width|max_sepal_width|
+----------+---------------+---------------+
| Virginica|            3.8|            2.2|
|    Setosa|            4.4|            2.3|
|Versicolor|            3.4|            2.0|
+----------+---------------+---------------+



In [0]:
spark.sql("SELECT VARIANCE(sepal_length), VARIANCE(sepal_width), VARIANCE(petal_length), VARIANCE(petal_width) FROM iris_table GROUP BY variety").show()

+----------------------+---------------------+----------------------+---------------------+
|variance(sepal_length)|variance(sepal_width)|variance(petal_length)|variance(petal_width)|
+----------------------+---------------------+----------------------+---------------------+
|    0.4043428571428571|  0.10400408163265307|    0.3045877551020408|   0.0754326530612245|
|   0.12424897959183674|  0.14368979591836734|  0.030159183673469394|  0.01110612244897959|
|    0.2664326530612246|  0.09846938775510206|   0.22081632653061223|  0.03910612244897961|
+----------------------+---------------------+----------------------+---------------------+



In [0]:
spark.sql("""SELECT * FROM iris_table WHERE sepal_length > (SELECT AVG(sepal_length) FROM iris_table)""").show(10)

+------------+-----------+------------+-----------+----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|sepal_area|
+------------+-----------+------------+-----------+----------+----------+
|         7.0|        3.2|         4.7|        1.4|Versicolor|      22.4|
|         6.4|        3.2|         4.5|        1.5|Versicolor|     20.48|
|         6.9|        3.1|         4.9|        1.5|Versicolor|     21.39|
|         6.5|        2.8|         4.6|        1.5|Versicolor|      18.2|
|         6.3|        3.3|         4.7|        1.6|Versicolor|     20.79|
|         6.6|        2.9|         4.6|        1.3|Versicolor|     19.14|
|         5.9|        3.0|         4.2|        1.5|Versicolor|      17.7|
|         6.0|        2.2|         4.0|        1.0|Versicolor|      13.2|
|         6.1|        2.9|         4.7|        1.4|Versicolor|     17.69|
|         6.7|        3.1|         4.4|        1.4|Versicolor|     20.77|
+------------+-----------+------------

In [0]:
spark.sql("""SELECT variety,AVG(petal_length * petal_width) as avg_petal_area FROM iris_table GROUP BY variety""").show()

+----------+-------------------+
|   variety|     avg_petal_area|
+----------+-------------------+
| Virginica| 11.296199999999994|
|    Setosa|0.36560000000000026|
|Versicolor|             5.7204|
+----------+-------------------+



In [0]:
spark.sql("""SELECT variety,percentile_approx(sepal_width, 0.5) AS median_petal_area FROM iris_table GROUP BY variety""").show()

+----------+-----------------+
|   variety|median_petal_area|
+----------+-----------------+
| Virginica|              3.0|
|    Setosa|              3.4|
|Versicolor|              2.8|
+----------+-----------------+

