In [2]:
import scala.util.Random
import spark.implicits._

/**
 * A single synthetic usage record; the element type of the sample data below.
 *
 * @param uid   numeric user id
 * @param uname user name
 * @param usage usage value for that user
 */
case class Usage(uid: Int, uname: String, usage: Int)

// Seeded generator, so the same sequence of random values is drawn on every run.
val r = new Random(42)

// Create exactly 1000 instances of the Usage class (uid 0 to 999), generated on the fly.
// `until` excludes the upper bound; the inclusive `0 to 1000` would yield 1001 records.
// Each name is "user-" followed by 5 random alphanumeric characters, and each usage
// value is a random Int in [0, 1000).
val data = (0 until 1000).map { i =>
  Usage(i, s"user-${r.alphanumeric.take(5).mkString}", r.nextInt(1000))
}

// Turn the local collection into a typed Dataset[Usage]
// (toDS() is provided by the spark.implicits._ import).
val dsUsage = data.toDS()

// Print the first 10 rows.
dsUsage.show(numRows = 10)


+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
|  3|user-xTOn6|  913|
|  4|user-3xGSz|  246|
|  5|user-2aWRN|  727|
|  6|user-EzZY1|   65|
|  7|user-ZlZMZ|  935|
|  8|user-VjxeG|  756|
|  9|user-iqf1P|    3|
+---+----------+-----+
only showing top 10 rows



import scala.util.Random
import spark.implicits._
defined class Usage
r: scala.util.Random = scala.util.Random@4a823efd
data: scala.collection.immutable.IndexedSeq[Usage] = Vector(Usage(0,user-Gpi2C,525), Usage(1,user-DgXDi,502), Usage(2,user-M66yO,170), Usage(3,user-xTOn6,913), Usage(4,user-3xGSz,246), Usage(5,user-2aWRN,727), Usage(6,user-EzZY1,65), Usage(7,user-ZlZMZ,935), Usage(8,user-VjxeG,756), Usage(9,user-iqf1P,3), Usage(10,user-91S1q,794), Usage(11,user-qHNj0,501), Usage(12,user-7hb94,460), Usage(13,user-bz0WF,142), Usage(14,user-71nwy,479), Usage(15,user-7GZz1,823), Usage(16,user-1CSk6,140), Usage(17,user-WPzlL,246), Usage(18,user-VaEit,451), Usage(19,user-PSaRq,679), Usage(20,user-0Kkzu,332), Usage(21,user-UN3MG,172), Usage(22,user-KwwER,442), Usage(23,user-ZnltJ,923), Us...


In [3]:
// Print the first 10 rows of the usage Dataset.
dsUsage.show(numRows = 10)

+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
|  3|user-xTOn6|  913|
|  4|user-3xGSz|  246|
|  5|user-2aWRN|  727|
|  6|user-EzZY1|   65|
|  7|user-ZlZMZ|  935|
|  8|user-VjxeG|  756|
|  9|user-iqf1P|    3|
+---+----------+-----+
only showing top 10 rows



In [4]:
// Compute a value per record with an if/else lambda: usage up to 750 is
// multiplied by 0.50; anything above 750 is multiplied by 0.1 and added
// to the 750 * 0.50 base.
dsUsage
  .map { u =>
    val used = u.usage
    if (used <= 750) used * 0.50
    else (used - 750) * 0.1 + 750 * 0.50
  }
  .show(5, truncate = false)

+-----+
|value|
+-----+
|262.5|
|251.0|
|85.0 |
|391.3|
|123.0|
+-----+
only showing top 5 rows



In [7]:
/**
 * Computes the cost for a given usage value.
 *
 * Usage up to and including 750 is multiplied by 0.50. For usage above 750,
 * the first 750 units are multiplied by 0.50 and only the excess by 0.1.
 *
 * @param usage the usage value
 * @return the computed cost
 */
def computeCostUsage(usage: Int): Double = {
  val threshold = 750
  val baseRate = 0.50
  val excessRate = 0.1

  if (usage <= threshold) usage * baseRate
  else (usage - threshold) * excessRate + threshold * baseRate
}

// Call the named function from inside map(): each Usage record becomes a
// (uid, uname, cost) tuple, and the columns are then renamed via toDF.
val costDf = dsUsage
  .map { case Usage(uid, uname, usage) => (uid, uname, computeCostUsage(usage)) }
  .toDF("user_id", "username", "cost")

// Print the first 10 rows.
costDf.show(numRows = 10)

+-------+----------+-----+
|user_id|  username| cost|
+-------+----------+-----+
|      0|user-Gpi2C|262.5|
|      1|user-DgXDi|251.0|
|      2|user-M66yO| 85.0|
|      3|user-xTOn6|391.3|
|      4|user-3xGSz|123.0|
|      5|user-2aWRN|363.5|
|      6|user-EzZY1| 32.5|
|      7|user-ZlZMZ|393.5|
|      8|user-VjxeG|375.6|
|      9|user-iqf1P|  1.5|
+-------+----------+-----+
only showing top 10 rows



computeCostUsage: (usage: Int)Double
costDf: org.apache.spark.sql.DataFrame = [user_id: int, username: string ... 1 more field]
