# Étape 3 - Chargement, pré-traitement

In [5]:
case class Car(
  rownames: Int,
  mpg: Double,
  cylinders: Int,
  displacement: Double,
  horsepower: Double,
  weight: Double,
  acceleration: Double,
  year: Int,
  origin: Int,
  name: String
)


val carDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("./data-car.csv")
  .as[Car]


carDF.show()

defined class Car
carDF = [rownames: int, mpg: double ... 8 more fields]


+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|rownames| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|       1|18.0|        8|       307.0|       130|  3504|        12.0|  70|     1|chevrolet chevell...|
|       2|15.0|        8|       350.0|       165|  3693|        11.5|  70|     1|   buick skylark 320|
|       3|18.0|        8|       318.0|       150|  3436|        11.0|  70|     1|  plymouth satellite|
|       4|16.0|        8|       304.0|       150|  3433|        12.0|  70|     1|       amc rebel sst|
|       5|17.0|        8|       302.0|       140|  3449|        10.5|  70|     1|         ford torino|
|       6|15.0|        8|       429.0|       198|  4341|        10.0|  70|     1|    ford galaxie 500|
|       7|14.0|        8|       454.0|       220|  4354|         9.0|  70

[rownames: int, mpg: double ... 8 more fields]

# Séance 2

## Étape 1 - RDD vers un dataframe

In [15]:
import org.apache.spark.sql.functions._


val rdd = spark.sparkContext.textFile("./data-car.csv")

val header = rdd.first()
val dataRDD = rdd.filter(_ != header)

val carRDD = dataRDD.map(line => {
  val cols = line.split(",")
  Car(
    cols(0).toInt,
    cols(1).toDouble,
    cols(2).toInt,
    cols(3).toDouble,
    cols(4).toInt,
    cols(5).toDouble,
    cols(6).toDouble,
    cols(7).toInt,
    cols(8).toInt,
    cols(9)
  )
})


val carDF = carRDD.toDF()

carDF.show(10)
carDF.printSchema()
carDF.summary().show()

carDF.select("origin").distinct().show()

carDF.select(min("mpg"), max("mpg")).show()
carDF.select(min("horsepower"), max("horsepower")).show()


+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|rownames| mpg|cylinders|displacement|horsepower|weight|acceleration|year|origin|                name|
+--------+----+---------+------------+----------+------+------------+----+------+--------------------+
|       1|18.0|        8|       307.0|     130.0|3504.0|        12.0|  70|     1|chevrolet chevell...|
|       2|15.0|        8|       350.0|     165.0|3693.0|        11.5|  70|     1|   buick skylark 320|
|       3|18.0|        8|       318.0|     150.0|3436.0|        11.0|  70|     1|  plymouth satellite|
|       4|16.0|        8|       304.0|     150.0|3433.0|        12.0|  70|     1|       amc rebel sst|
|       5|17.0|        8|       302.0|     140.0|3449.0|        10.5|  70|     1|         ford torino|
|       6|15.0|        8|       429.0|     198.0|4341.0|        10.0|  70|     1|    ford galaxie 500|
|       7|14.0|        8|       454.0|     220.0|4354.0|         9.0|  70

rdd = ./data-car.csv MapPartitionsRDD[61] at textFile at <console>:35
header = rownames,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
dataRDD = MapPartitionsRDD[62] at filter at <console>:38
carRDD = MapPartitionsRDD[63] at map at <console>:40
carDF = [rownames: int, mpg: double ... 8 more fields]


[rownames: int, mpg: double ... 8 more fields]