In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs against the "local" master,
# then keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext
# Bare last expression: the notebook displays the SparkContext.
sc

In [4]:
# Location of the Iris CSV, relative to the notebook's working directory.
csv_path = r"./spark-data/Iris.csv"

# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

In [5]:
# Preview the loaded data; n=20 and truncate=True are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [6]:
# Count the rows of the loaded DataFrame; the bare expression is shown as the cell's output.
df.count()

150

In [9]:
# Inspect the schema of df (column names, data types, nullability) as the cell's output.
df.schema

StructType([StructField('Id', IntegerType(), True), StructField('SepalLengthCm', DoubleType(), True), StructField('SepalWidthCm', DoubleType(), True), StructField('PetalLengthCm', DoubleType(), True), StructField('PetalWidthCm', DoubleType(), True), StructField('Species', StringType(), True)])

In [18]:
from pyspark.sql import functions as f

# Per-species averages of two derived measurements:
#   SepalCircleCm = (SepalLengthCm + SepalWidthCm) * 2
#   PetalCircleCm = (PetalLengthCm + PetalWidthCm) * 2
#
# The DataFrame is assigned first and displayed in a separate statement:
# DataFrame.show() returns None, so ending the chain with .show() would
# leave df_analysis bound to None instead of the aggregated result.
df_analysis = (
    df.withColumn('SepalCircleCm', (f.col('SepalLengthCm') + f.col('SepalWidthCm')) * 2)
      .withColumn('PetalCircleCm', (f.col('PetalLengthCm') + f.col('PetalWidthCm')) * 2)
      .groupBy(f.col('Species'))
      .agg(
          f.avg('SepalCircleCm').alias("AverageSepalCircleCm"),
          f.avg('PetalCircleCm').alias("AveragePetalCircleCm"),
      )
)
df_analysis.show()

+---------------+--------------------+--------------------+
|        Species|AverageSepalCircleCm|AveragePetalCircleCm|
+---------------+--------------------+--------------------+
| Iris-virginica|              19.124|  15.155999999999995|
|    Iris-setosa|              16.848|   3.415999999999999|
|Iris-versicolor|  17.412000000000003|  11.171999999999999|
+---------------+--------------------+--------------------+



In [19]:
# Register the Iris CSV as the temp view "Iris" so it can be queried with spark.sql.
# inferSchema is enabled so the view's columns get the same inferred types as df;
# without it the CSV reader returns every column as a string.
# csv_path is the path variable assigned in the data-loading cell.
(
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
    .createOrReplaceTempView("Iris")
)

In [20]:
# Query the "Iris" temp view through Spark SQL and display only the first five rows.
spark.sql("SELECT * FROM Iris").show(n=5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows

