# Étape 3 - Chargement, pré-traitement

In [30]:
import org.apache.spark.sql.functions._


case class Car(
  rownames: Int,
  mpg: Double,
  cylinders: Int,
  displacement: Double,
  horsepower: Double,
  weight: Double,
  acceleration: Double,
  year: Int,
  origin: Int,
  name: String
)

val rdd = spark.sparkContext.textFile("./data-car.csv")

val header = rdd.first()
val dataRDD = rdd.filter(_ != header)

val carRDD = dataRDD.map(line => {
  val cols = line.split(",")
  Car(
    cols(0).toInt,
    cols(1).toDouble,
    cols(2).toInt,
    cols(3).toDouble,
    cols(4).toInt,
    cols(5).toDouble,
    cols(6).toDouble,
    cols(7).toInt,
    cols(8).toInt,
    cols(9)
  )
})


defined class Car
rdd = ./data-car.csv MapPartitionsRDD[350] at textFile at <console>:68
header = rownames,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
dataRDD = MapPartitionsRDD[351] at filter at <console>:71
carRDD = MapPartitionsRDD[352] at map at <console>:73


MapPartitionsRDD[352] at map at <console>:73

# Séance 2

## Étape 1 - RDD vers un dataframe

In [None]:
val carDF = carRDD.toDF()

carDF.show(10)
carDF.printSchema()
carDF.summary().show()

carDF.select("origin").distinct().show()
carDF.select(min("mpg"), max("mpg")).show()
carDF.select(min("horsepower"), max("horsepower")).show()


+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|rownames| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|       1|18.0|        8|       307.0|     130.0|3504.0|        12.0|  70|     1|chevrolet chevell...|
|       2|15.0|        8|       350.0|     165.0|3693.0|        11.5|  70|     1|   buick skylark 320|
|       3|18.0|        8|       318.0|     150.0|3436.0|        11.0|  70|     1|  plymouth satellite|
|       4|16.0|        8|       304.0|     150.0|3433.0|        12.0|  70|     1|       amc rebel sst|
|       5|17.0|        8|       302.0|     140.0|3449.0|        10.5|  70|     1|         ford torino|
|       6|15.0|        8|       429.0|     198.0|4341.0|        10.0|  70|     1|    ford galaxie 500|
|       7|14.0|        8|       454.0|     220.0|4354.0|         9.0|  70

carDF = [rownames: int, mpg: double ... 8 more fields]


+---------------+---------------+
|min(horsepower)|max(horsepower)|
+---------------+---------------+
|           46.0|          230.0|
+---------------+---------------+



[rownames: int, mpg: double ... 8 more fields]

## Étape 2 - Extraction de dimensions

In [19]:
import org.apache.spark.sql.functions.monotonically_increasing_id

val dfOrigin = carDF
  .select("origin")
  .distinct()
  .withColumn("id_origin", monotonically_increasing_id())

  dfOrigin.show(3)

val carDF_withOriginID = carDF
  .join(dfOrigin, Seq("origin"))
  .drop("origin")

  carDF_withOriginID.show(3)

val dfCylinders = carDF
  .select("cylinders")
  .distinct()
  .withColumn("id_cylinders", monotonically_increasing_id())

dfCylinders.show(3)


val carDF_enriched = carDF_withOriginID
  .join(dfCylinders, Seq("cylinders"))
  .drop("cylinders")

carDF_enriched.show(3)



+------+---------+
|origin|id_origin|
+------+---------+
|     1|        0|
|     3|        1|
|     2|        2|
+------+---------+

+--------+----+---------+------------+----------+------+------------+----+--------------------+---------+
|rownames| mpg|cylinders|displacement|horsepower|weight|acceleration|year|                name|id_origin|
+--------+----+---------+------------+----------+------+------------+----+--------------------+---------+
|       1|18.0|        8|       307.0|     130.0|3504.0|        12.0|  70|chevrolet chevell...|        0|
|       2|15.0|        8|       350.0|     165.0|3693.0|        11.5|  70|   buick skylark 320|        0|
|       3|18.0|        8|       318.0|     150.0|3436.0|        11.0|  70|  plymouth satellite|        0|
+--------+----+---------+------------+----------+------+------------+----+--------------------+---------+
only showing top 3 rows

+---------+------------+
|cylinders|id_cylinders|
+---------+------------+
|        6|           0|

dfOrigin = [origin: int, id_origin: bigint]
carDF_withOriginID = [rownames: int, mpg: double ... 8 more fields]
dfCylinders = [cylinders: int, id_cylinders: bigint]
carDF_enriched = [rownames: int, mpg: double ... 8 more fields]


[rownames: int, mpg: double ... 8 more fields]

## Étape 3 - Tables Hive, SQL

In [24]:
import org.apache.spark.sql.hive.HiveContext

val hc = new HiveContext(sc)

carDF_enriched.write.mode("overwrite").saveAsTable("cars")
dfOrigin.write.mode("overwrite").saveAsTable("dim_origin")
dfCylinders.write.mode("overwrite").saveAsTable("dim_cylinders")

hc.sql("SELECT name, horsepower FROM cars ORDER BY horsepower DESC LIMIT 10").show(3)
hc.sql("SELECT COUNT(*) FROM dim_origin").show()


+--------------------+----------+
|                name|horsepower|
+--------------------+----------+
|  pontiac grand prix|     230.0|
|    pontiac catalina|     225.0|
|buick estate wago...|     225.0|
+--------------------+----------+
only showing top 3 rows

+--------+
|count(1)|
+--------+
|       3|
+--------+



hc = org.apache.spark.sql.hive.HiveContext@361a6881




org.apache.spark.sql.hive.HiveContext@361a6881