In [None]:
import $ivy.`org.apache.spark::spark-sql:3.5.0`

In [None]:
import org.apache.log4j.{Level, Logger}
// Set the level of the "org" logger to WARN (presumably to quiet Spark's INFO output in the notebook).
Logger.getLogger("org").setLevel(Level.WARN)

## Start

In [None]:
import org.apache.spark.sql._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.functions._

In [None]:
// Notebook-wide Spark session: local master, reusing an already-running session if one exists.
val spark = SparkSession.builder()
  .appName("Dataframe API")
  .master("local[*]")
  .getOrCreate()

import spark.implicits._

In [None]:
// Print the version string reported by the active session.
println(s"spark.version == ${spark.version}")

## 1 - Создание DataFrame

### fromRDD

In [None]:
// Column names and sample (StudentID, Course) pairs; `data` is reused by later cells.
val columns = Seq("StudentID", "Course")
val data = Seq(
  ("1", "Spark"),
  ("2", "Scala"),
  ("3", "Java")
)

// Distribute the local pairs as an RDD, then convert it to a DataFrame with explicit column names.
val fromRDD: DataFrame = {
  val pairsRdd = spark.sparkContext.parallelize(data)
  pairsRdd.toDF(columns: _*)
}

fromRDD.show()

### fromList

In [None]:
// Convert the local Seq of tuples directly to a DataFrame; no column names are supplied here.
val fromList: DataFrame = data.toDF()

fromList.show()

### createDataFrame

In [None]:
// Same local data, this time handed to the session's createDataFrame factory.
val createDataFrame: DataFrame = spark.createDataFrame(data)

createDataFrame.show()

### withSchema

In [None]:
// DataFrame built from an RDD of Rows plus an explicitly declared schema.
val withSchema: DataFrame = {
  // Two nullable columns: an integer id and a string course name.
  val studentSchema = StructType(Seq(
    StructField("StudentID", IntegerType, nullable = true),
    StructField("Course", StringType, nullable = true)
  ))

  // Row values are listed in the same order as the schema fields.
  val rows = Seq(Row(1, "Spark"), Row(2, "Scala"))
  val rowRdd = spark.sparkContext.parallelize(rows)

  spark.createDataFrame(rowRdd, studentSchema)
}

withSchema.show()

In [None]:
// Print the schema of the explicitly-typed DataFrame.
withSchema.printSchema()

In [None]:
// List each column name paired with its data type.
withSchema.dtypes

### fromFile

In [None]:
// Load the customer dataset from a JSON file (path is relative to the kernel's working directory).
val fromFile: DataFrame = spark.read.json("data/customer_data.json")

// Preview five rows with full, untruncated cell contents.
fromFile.show(numRows = 5, truncate = false)

## 2 - Операции

In [None]:
// Name under which the file-backed DataFrame is used by the remaining cells.
val customerDf: DataFrame = fromFile

### basicOperations

In [None]:
// Print the schema of the customer DataFrame.
customerDf.printSchema()

In [None]:
// Fetch the first row of the customer DataFrame.
customerDf.head()

#### Select

In [None]:
// Select columns by plain string names.
customerDf
  .select("Birthdate", "Country")
  .show()

In [None]:
// Select a column through the col(...) function, which yields a Column object.
customerDf
  .select(col("Country"))
  .show()

In [None]:
// Select a column through the `$"..."` interpolator provided by spark.implicits._.
// This replaces the symbol literal 'Country: symbol literals are deprecated in Scala 2.13
// and removed in Scala 3, while `$"..."` works on every Scala version Spark supports.
customerDf.select($"Country").show()

In [None]:
// Select with a SQL expression string, exposing Birthdate under the alias Date.
customerDf
  .selectExpr("Birthdate as Date")
  .show()

In [None]:
// Add a column named Flag whose value is the literal true on every row.
customerDf
  .withColumn("Flag", lit(true))
  .show()

In [None]:
// Rename the Birthdate column to Date.
customerDf
  .withColumnRenamed("Birthdate", "Date")
  .show()

#### Filter

In [None]:
// Filter with a SQL predicate string: keep rows whose Country equals 'Norway'.
customerDf
  .filter("Country = 'Norway'")
  .show()

In [None]:
// Filter with a Column expression: keep rows whose Country differs from "Iceland".
// `$"Country"` replaces the symbol literal 'Country, which is deprecated in Scala 2.13
// and removed in Scala 3. Output is shown without truncating cell contents.
customerDf
  .where($"Country" =!= "Iceland")
  .show(truncate = false)

#### Sort

In [None]:
// Sort by CustomerID in descending order.
// `$"CustomerID"` replaces the symbol literal 'CustomerID, which is deprecated in Scala 2.13
// and removed in Scala 3.
customerDf
  .sort($"CustomerID".desc)
  .show()

In [None]:
// Order by CustomerID, passing the column as a plain string name.
customerDf
  .orderBy("CustomerID")
  .show()

#### Repartition

In [None]:
// Report the partition count of a copy repartitioned with repartition(5); customerDf itself is not reassigned.
println(s"Num partitions: ${customerDf.repartition(5).rdd.getNumPartitions}")

In [None]:
// Report the partition count of customerDf as loaded.
println(s"Num partitions: ${customerDf.rdd.getNumPartitions}")


In [None]:
// Repartition with repartition(5, col("Country")) and keep the result for the following cell.
val repartitionedDf = customerDf.repartition(5, col("Country"))
// Report how many partitions the repartitioned DataFrame has.
println(s"New num partitions: ${repartitionedDf.rdd.getNumPartitions}")

In [None]:
// Report the partition count after coalesce(1) is applied to the repartitioned DataFrame.
println(s"Num partitions after coalesce: ${repartitionedDf.coalesce(1).rdd.getNumPartitions}")

### functions

In [None]:
// Show five raw Birthdate values, for comparison with the formatted version.
customerDf.select("Birthdate").show(5)

In [None]:
// Format Birthdate with the pattern yyyy-MM-dd and expose the result under the alias bd.
customerDf
  .select(date_format(col("Birthdate"), "yyyy-MM-dd").alias("bd"))
  .show(5)

In [None]:
// Show five Name/Username pairs, the inputs of the array column built in the following cells.
customerDf.select("Name", "Username").show(5)

In [None]:
// Add an Identity column combining Name and Username into an array, then print the resulting schema.
// `$"..."` replaces the symbol literals 'Name and 'Username, which are deprecated in Scala 2.13
// and removed in Scala 3.
customerDf
  .withColumn("Identity", array($"Name", $"Username"))
  .printSchema()

In [None]:
// Build the Identity array column from Name and Username and show it next to its inputs,
// five rows, untruncated. `$"..."` replaces the symbol literals 'Name and 'Username, which are
// deprecated in Scala 2.13 and removed in Scala 3.
customerDf
  .withColumn("Identity", array($"Name", $"Username"))
  .select("Name", "Username", "Identity")
  .show(numRows = 5, truncate = false)

### groupBy

In [None]:
// Per-country row count expressed as an explicit aggregate: count over the literal 1.
customerDf
  .groupBy("Country")
  .agg(count(lit(1)))
  .show()

In [None]:
// Per-country row count using the count() shortcut on the grouped data.
customerDf
  .groupBy("Country")
  .count()
  .show()

In [None]:
// Smallest and largest CustomerID for each (Country, Birthdate) group, ordered by Country.
customerDf
  .groupBy("Country", "Birthdate")
  .agg(min("CustomerID"), max("CustomerID"))
  .orderBy("Country")
  .show()

### union

In [None]:
// Number of rows in the customer DataFrame.
customerDf.count()

In [None]:
// Row count after unioning the DataFrame with itself.
customerDf
  .union(customerDf)
  .count()

In [None]:
// Five times the base row count, for comparison with a five-way union.
customerDf.count() * 5

In [None]:
// Union five references to the same DataFrame, folding left to right, and count the rows.
List.fill(5)(customerDf)
  .reduce(_ union _)
  .count()

### join

In [None]:
// Load the retail dataset from a JSON file (path is relative to the kernel's working directory).
val retailDf = spark.read.json("data/retail_data.json")

In [None]:
// Print the customer schema for comparison with the retail schema before joining.
customerDf.printSchema()

In [None]:
// Print the retail schema for comparison with the customer schema before joining.
retailDf.printSchema()

In [None]:
// Column names present in both DataFrames, i.e. the candidate join keys.
customerDf.columns.toSet.intersect(retailDf.columns.toSet)

In [None]:
// Join on the shared CustomerID column, named as a string; no join type is passed.
customerDf
  .join(retailDf, "CustomerID")
  .show()

In [None]:
// Left join driven by an explicit equality between each side's CustomerID column.
// NOTE(review): an expression join presumably keeps both CustomerID columns, which would make
// the commented-out select below ambiguous - confirm before re-enabling it.
customerDf
  .join(
    retailDf,
    customerDf("CustomerID") === retailDf("CustomerID"),
    joinType = "left"
  )
  //.select("CustomerID")
  .show()

## Stop

In [None]:
// Stop the Spark session at the end of the notebook.
spark.stop()